diff --git a/.bazelrc b/.bazelrc index 24bfaae60b6..f2aa3ac447b 100644 --- a/.bazelrc +++ b/.bazelrc @@ -143,6 +143,11 @@ build:mkl --define=tensorflow_mkldnn_contraction_kernel=0 build:mkl --define=build_with_mkl_dnn_v1_only=true build:mkl -c opt +# config to build OneDNN backend with a user specified threadpool. +build:mkl_threadpool --define=build_with_mkl=true --define=enable_mkl=true +build:mkl_threadpool --define=tensorflow_mkldnn_contraction_kernel=0 +build:mkl_threadpool --define=build_with_mkldnn_threadpool=true +build:mkl_threadpool -c opt # This config refers to building with CUDA available. It does not necessarily # mean that we build CUDA op kernels. build:using_cuda --define=using_cuda=true @@ -163,6 +168,8 @@ build:cuda_clang --action_env TF_CUDA_CLANG=1 build:dbg --config=opt -c dbg # for now, disable arm_neon. see: https://github.com/tensorflow/tensorflow/issues/33360 build:dbg --cxxopt -DTF_LITE_DISABLE_X86_NEON +# AWS SDK must be compiled in release mode. see: https://github.com/tensorflow/tensorflow/issues/37498 +build:dbg --copt -DDEBUG_BUILD build:tensorrt --action_env TF_NEED_TENSORRT=1 @@ -233,10 +240,15 @@ build:c++17 --cxxopt=-std=c++1z build:c++17 --cxxopt=-stdlib=libc++ build:c++1z --config=c++17 -# Enable using platform specific build settings +# Enable using platform specific build settings, except when cross-compiling for +# mobile platforms. build --enable_platform_specific_config +build:android --noenable_platform_specific_config +build:ios --noenable_platform_specific_config # Suppress C++ compiler warnings, otherwise build logs become 10s of MBs. +build:android --copt=-w +build:ios --copt=-w build:linux --copt=-w build:macos --copt=-w build:windows --copt=/w @@ -256,6 +268,10 @@ build:macos --define=INCLUDEDIR=$(PREFIX)/include # TF_SYSTEM_LIBS do not work on windows. # By default, build TF in C++ 14 mode. 
+build:android --cxxopt=-std=c++14 +build:android --host_cxxopt=-std=c++14 +build:ios --cxxopt=-std=c++14 +build:ios --host_cxxopt=-std=c++14 build:linux --cxxopt=-std=c++14 build:linux --host_cxxopt=-std=c++14 build:macos --cxxopt=-std=c++14 @@ -356,9 +372,10 @@ build:rbe_linux --linkopt=-lm build:rbe_cpu_linux --config=rbe_linux build:rbe_cpu_linux --crosstool_top="//third_party/toolchains/preconfig/ubuntu16.04/gcc7_manylinux2010:toolchain" build:rbe_cpu_linux --extra_toolchains="//third_party/toolchains/preconfig/ubuntu16.04/gcc7_manylinux2010:cc-toolchain-k8" -build:rbe_cpu_linux --extra_execution_platforms"=@org_tensorflow//third_party/toolchains:rbe_ubuntu16.04-manylinux2010" -build:rbe_cpu_linux --host_platform="@org_tensorflow//third_party/toolchains:rbe_ubuntu16.04-manylinux2010" -build:rbe_cpu_linux --platforms="@org_tensorflow//third_party/toolchains:rbe_ubuntu16.04-manylinux2010" +build:rbe_cpu_linux --extra_execution_platforms="@ubuntu16.04-manylinux2010-py3_config_platform//:platform" +build:rbe_cpu_linux --extra_execution_platforms="@ubuntu16.04-manylinux2010-py3_config_platform//:platform" +build:rbe_cpu_linux --host_platform="@ubuntu16.04-manylinux2010-py3_config_platform//:platform" +build:rbe_cpu_linux --platforms="@ubuntu16.04-manylinux2010-py3_config_platform//:platform" build:rbe_linux_cuda_base --config=rbe_linux build:rbe_linux_cuda_base --repo_env=TF_NEED_TENSORRT=1 @@ -380,17 +397,37 @@ build:rbe_linux_cuda_nvcc --repo_env=TF_NCCL_CONFIG_REPO="@ubuntu16.04-py3-gcc7_ build:rbe_linux_cuda_nvcc --define=using_cuda_nvcc=true test:rbe_linux_cuda_nvcc --config=rbe_linux_cuda_base -build:rbe_linux_cuda_clang --config=rbe_linux_cuda_base -build:rbe_linux_cuda_clang --crosstool_top="@ubuntu16.04-py3-clang_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_cuda//crosstool:toolchain" -build:rbe_linux_cuda_clang --extra_toolchains="@ubuntu16.04-py3-clang_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_cuda//crosstool:toolchain-linux-x86_64" -build:rbe_linux_cuda_clang --extra_execution_platforms="@ubuntu16.04-py3-clang_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_platform//:platform" -build:rbe_linux_cuda_clang --host_platform="@ubuntu16.04-py3-clang_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_platform//:platform" -build:rbe_linux_cuda_clang --platforms="@ubuntu16.04-py3-clang_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_platform//:platform" -build:rbe_linux_cuda_clang --repo_env=TF_CUDA_CONFIG_REPO="@ubuntu16.04-py3-clang_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_cuda" -build:rbe_linux_cuda_clang --repo_env=TF_TENSORRT_CONFIG_REPO="@ubuntu16.04-py3-clang_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_tensorrt" -build:rbe_linux_cuda_clang --repo_env=TF_NCCL_CONFIG_REPO="@ubuntu16.04-py3-clang_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_nccl" -build:rbe_linux_cuda_clang --define=using_cuda_clang=true -test:rbe_linux_cuda_clang --config=rbe_linux_cuda_base +build:rbe_linux_cuda_nvcc_base --config=rbe_linux_cuda_base +build:rbe_linux_cuda_nvcc_base --crosstool_top="@ubuntu16.04-gcc7_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_cuda//crosstool:toolchain" +build:rbe_linux_cuda_nvcc_base --extra_toolchains="@ubuntu16.04-gcc7_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_cuda//crosstool:toolchain-linux-x86_64" +build:rbe_linux_cuda_nvcc_base --extra_execution_platforms="@ubuntu16.04-gcc7_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_platform//:platform" +build:rbe_linux_cuda_nvcc_base 
--host_platform="@ubuntu16.04-gcc7_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_platform//:platform" +build:rbe_linux_cuda_nvcc_base --platforms="@ubuntu16.04-gcc7_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_platform//:platform" +build:rbe_linux_cuda_nvcc_base --repo_env=TF_CUDA_CONFIG_REPO="@ubuntu16.04-gcc7_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_cuda" +build:rbe_linux_cuda_nvcc_base --repo_env=TF_TENSORRT_CONFIG_REPO="@ubuntu16.04-gcc7_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_tensorrt" +build:rbe_linux_cuda_nvcc_base --repo_env=TF_NCCL_CONFIG_REPO="@ubuntu16.04-gcc7_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_nccl" +build:rbe_linux_cuda_nvcc_base --define=using_cuda_nvcc=true +build:rbe_linux_cuda_nvcc_py27 --config=rbe_linux_cuda_nvcc_base --repo_env=TF_PYTHON_CONFIG_REPO="@ubuntu16.04-gcc7_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_python2.7" +build:rbe_linux_cuda_nvcc_py35 --config=rbe_linux_cuda_nvcc_base --repo_env=TF_PYTHON_CONFIG_REPO="@ubuntu16.04-gcc7_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_python3.5" +build:rbe_linux_cuda_nvcc_py36 --config=rbe_linux_cuda_nvcc_base --repo_env=TF_PYTHON_CONFIG_REPO="@ubuntu16.04-gcc7_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_python3.6" +build:rbe_linux_cuda_nvcc_py37 --config=rbe_linux_cuda_nvcc_base --repo_env=TF_PYTHON_CONFIG_REPO="@ubuntu16.04-gcc7_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_python3.7" +build:rbe_linux_cuda_nvcc_py38 --config=rbe_linux_cuda_nvcc_base --repo_env=TF_PYTHON_CONFIG_REPO="@ubuntu16.04-gcc7_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_python3.8" + +build:rbe_linux_cuda_clang_base --config=rbe_linux_cuda_base +build:rbe_linux_cuda_clang_base --crosstool_top="@ubuntu16.04-clang_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_cuda//crosstool:toolchain" +build:rbe_linux_cuda_clang_base --extra_toolchains="@ubuntu16.04-clang_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_cuda//crosstool:toolchain-linux-x86_64" +build:rbe_linux_cuda_clang_base --extra_execution_platforms="@ubuntu16.04-clang_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_platform//:platform" +build:rbe_linux_cuda_clang_base --host_platform="@ubuntu16.04-clang_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_platform//:platform" +build:rbe_linux_cuda_clang_base --platforms="@ubuntu16.04-clang_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_platform//:platform" +build:rbe_linux_cuda_clang_base --repo_env=TF_CUDA_CONFIG_REPO="@ubuntu16.04-clang_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_cuda" +build:rbe_linux_cuda_clang_base --repo_env=TF_TENSORRT_CONFIG_REPO="@ubuntu16.04-clang_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_tensorrt" +build:rbe_linux_cuda_clang_base --repo_env=TF_NCCL_CONFIG_REPO="@ubuntu16.04-clang_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_nccl" +build:rbe_linux_cuda_clang_base --define=using_cuda_clang=true +build:rbe_linux_cuda_clang_py27 --config=rbe_linux_cuda_clang_base --repo_env=TF_PYTHON_CONFIG_REPO="@ubuntu16.04-clang_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_python2.7" +build:rbe_linux_cuda_clang_py35 --config=rbe_linux_cuda_clang_base --repo_env=TF_PYTHON_CONFIG_REPO="@ubuntu16.04-clang_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_python3.5" +build:rbe_linux_cuda_clang_py36 --config=rbe_linux_cuda_clang_base --repo_env=TF_PYTHON_CONFIG_REPO="@ubuntu16.04-clang_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_python3.6" +build:rbe_linux_cuda_clang_py37 --config=rbe_linux_cuda_clang_base 
--repo_env=TF_PYTHON_CONFIG_REPO="@ubuntu16.04-clang_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_python3.7" +build:rbe_linux_cuda_clang_py38 --config=rbe_linux_cuda_clang_base --repo_env=TF_PYTHON_CONFIG_REPO="@ubuntu16.04-clang_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_python3.8" common:rbe_gpu_linux --config=rbe_linux_cuda_nvcc diff --git a/.bazelversion b/.bazelversion index 227cea21564..4a36342fcab 100644 --- a/.bazelversion +++ b/.bazelversion @@ -1 +1 @@ -2.0.0 +3.0.0 diff --git a/.github/ISSUE_TEMPLATE/00-bug-issue.md b/.github/ISSUE_TEMPLATE/00-bug-issue.md index 0c2bcb27c7d..6a135d1c61b 100644 --- a/.github/ISSUE_TEMPLATE/00-bug-issue.md +++ b/.github/ISSUE_TEMPLATE/00-bug-issue.md @@ -10,32 +10,30 @@ labels: 'type:bug' we only address code/doc bugs, performance issues, feature requests and build/installation issues on GitHub. tag:bug_template -**System information** -- Have I written custom code (as opposed to using a stock -example script provided in TensorFlow): -- OS Platform and Distribution (e.g., -Linux Ubuntu 16.04): -- Mobile device (e.g. iPhone 8, Pixel 2, Samsung Galaxy) if -the issue happens on mobile device: -- TensorFlow installed from (source or -binary): - TensorFlow version (use command below): -- Python version: - Bazel -version (if compiling from source): -- GCC/Compiler version (if compiling from -source): -- CUDA/cuDNN version: - GPU model and memory: +**System information** +- Have I written custom code (as opposed to using a stock example script provided in TensorFlow): +- OS Platform and Distribution (e.g., Linux Ubuntu 16.04): +- Mobile device (e.g. iPhone 8, Pixel 2, Samsung Galaxy) if the issue happens on mobile device: +- TensorFlow installed from (source or binary): +- TensorFlow version (use command below): +- Python version: +- Bazel version (if compiling from source): +- GCC/Compiler version (if compiling from source): +- CUDA/cuDNN version: +- GPU model and memory: You can collect some of this information using our environment capture [script](https://github.com/tensorflow/tensorflow/tree/master/tools/tf_env_collect.sh) -You can also obtain the TensorFlow version with: 1. TF 1.0: `python -c "import -tensorflow as tf; print(tf.GIT_VERSION, tf.VERSION)"` 2. TF 2.0: `python -c -"import tensorflow as tf; print(tf.version.GIT_VERSION, tf.version.VERSION)"` +You can also obtain the TensorFlow version with: +1. TF 1.0: `python -c "import tensorflow as tf; print(tf.GIT_VERSION, tf.VERSION)"` +2. TF 2.0: `python -c "import tensorflow as tf; print(tf.version.GIT_VERSION, tf.version.VERSION)"` + **Describe the current behavior** **Describe the expected behavior** -**Standalone code to reproduce the issue** +**Standalone code to reproduce the issue** Provide a reproducible test case that is the bare minimum necessary to generate the problem. If possible, please share a link to Colab/Jupyter/any notebook. diff --git a/.github/ISSUE_TEMPLATE/60-tflite-converter-issue.md b/.github/ISSUE_TEMPLATE/60-tflite-converter-issue.md index 32ebaff1a9c..6eab765e84e 100644 --- a/.github/ISSUE_TEMPLATE/60-tflite-converter-issue.md +++ b/.github/ISSUE_TEMPLATE/60-tflite-converter-issue.md @@ -38,6 +38,9 @@ state what is wrong: - Producing correct results, but the model is slower than expected (model generated from old converter) +**RNN conversion support** +If converting TF RNN to TFLite fused RNN ops, please prefix [RNN] in the title. + **Any other info / logs** Include any logs or source code that would be helpful to diagnose the problem. 
If including tracebacks, please include the full traceback. Large logs and files should be attached. diff --git a/.github/ISSUE_TEMPLATE/80-performance-issue.md b/.github/ISSUE_TEMPLATE/80-performance-issue.md index a1cbf23df4b..3f0c8c58b90 100644 --- a/.github/ISSUE_TEMPLATE/80-performance-issue.md +++ b/.github/ISSUE_TEMPLATE/80-performance-issue.md @@ -11,32 +11,29 @@ As per our we only address code/doc bugs, performance issues, feature requests and build/installation issues on GitHub. tag:performance_template -**System information** -- Have I written custom code (as opposed to using a stock -example script provided in TensorFlow): -- OS Platform and Distribution (e.g., -Linux Ubuntu 16.04): -- Mobile device (e.g. iPhone 8, Pixel 2, Samsung Galaxy) if -the issue happens on mobile device: -- TensorFlow installed from (source or -binary): - TensorFlow version (use command below): -- Python version: - Bazel -version (if compiling from source): -- GCC/Compiler version (if compiling from -source): -- CUDA/cuDNN version: - GPU model and memory: +**System information** +- Have I written custom code (as opposed to using a stock example script provided in TensorFlow): +- OS Platform and Distribution (e.g., Linux Ubuntu 16.04): +- Mobile device (e.g. iPhone 8, Pixel 2, Samsung Galaxy) if the issue happens on mobile device: +- TensorFlow installed from (source or binary): +- TensorFlow version (use command below): +- Python version: +- Bazel version (if compiling from source): +- GCC/Compiler version (if compiling from source): +- CUDA/cuDNN version: +- GPU model and memory: You can collect some of this information using our environment capture [script](https://github.com/tensorflow/tensorflow/tree/master/tools/tf_env_collect.sh) -You can also obtain the TensorFlow version with: 1. TF 1.0: `python -c "import -tensorflow as tf; print(tf.GIT_VERSION, tf.VERSION)"` 2. TF 2.0: `python -c -"import tensorflow as tf; print(tf.version.GIT_VERSION, tf.version.VERSION)"` +You can also obtain the TensorFlow version with: +1. TF 1.0: `python -c "import tensorflow as tf; print(tf.GIT_VERSION, tf.VERSION)"` +2. TF 2.0: `python -c "import tensorflow as tf; print(tf.version.GIT_VERSION, tf.version.VERSION)"` **Describe the current behavior** **Describe the expected behavior** -**Standalone code to reproduce the issue** +**Standalone code to reproduce the issue** Provide a reproducible test case that is the bare minimum necessary to generate the problem. If possible, please share a link to Colab/Jupyter/any notebook. diff --git a/.github/bot_config.yml b/.github/bot_config.yml new file mode 100644 index 00000000000..88c737f41e2 --- /dev/null +++ b/.github/bot_config.yml @@ -0,0 +1,87 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# +# THIS IS A GENERATED DOCKERFILE. +# +# This file was assembled from multiple pieces, whose use is documented +# throughout. 
Please refer to the TensorFlow dockerfiles documentation +# for more information. + +# A list of assignees +assignees: + - amahendrakar + - ravikyram + - Saduf2019 +# A list of assignees for compiler folder +compiler_assignees: + - joker-eph +# Cuda Comment +cuda_comment: > + From the template it looks like you are installing **TensorFlow** (TF) prebuilt binaries: + * For TF-GPU - See point 1 + * For TF-CPU - See point 2 + ----------------------------------------------------------------------------------------------- + + **1. Installing **TensorFlow-GPU** (TF) prebuilt binaries** + + + Make sure you are using compatible TF and CUDA versions. + Please refer following TF version and CUDA version compatibility table. + + | TF | CUDA | + + | :-------------: | :-------------: | + + | 2.1.0 - 2.2.0 | 10.1 | + + | 1.13.1 - 2.0 | 10.0 | + + | 1.5.0 - 1.12.0 | 9.0 | + + * If you have above configuration and using _**Windows**_ platform - + * Try adding the CUDA, CUPTI, and cuDNN installation directories to the %PATH% environment variable. + * Refer [windows setup guide](https://www.tensorflow.org/install/gpu#windows_setup). + * If you have above configuration and using _**Ubuntu/Linux**_ platform - + * Try adding the CUDA, CUPTI, and cuDNN installation directories to the $LD_LIBRARY_PATH environment variable. + * Refer [linux setup guide](https://www.tensorflow.org/install/gpu#linux_setup). + * If error still persists then, apparently your CPU model does not support AVX instruction sets. + * Refer [hardware requirements](https://www.tensorflow.org/install/pip#hardware-requirements). + + ----------------------------------------------------------------------------------------------- + + **2. Installing **TensorFlow** (TF) CPU prebuilt binaries** + + + *TensorFlow release binaries version 1.6 and higher are prebuilt with AVX instruction sets.* + + + Therefore on any CPU that does not have these instruction sets, either CPU or GPU version of TF will fail to load. + + Apparently, your CPU model does not support AVX instruction sets. You can still use TensorFlow with the alternatives given below: + + * Try Google Colab to use TensorFlow. + * The easiest way to use TF will be to switch to [google colab](https://colab.sandbox.google.com/notebooks/welcome.ipynb#recent=true). You get pre-installed latest stable TF version. Also you can use ```pip install``` to install any other preferred TF version. + * It has an added advantage since you can you easily switch to different hardware accelerators (cpu, gpu, tpu) as per the task. + * All you need is a good internet connection and you are all set. + * Try to build TF from sources by changing CPU optimization flags. + + *Please let us know if this helps.* + +windows_comment: > + From the stack trace it looks like you are hitting windows path length limit. + * Try to disable path length limit on Windows 10. + * Refer [disable path length limit instructions guide.](https://mspoweruser.com/ntfs-260-character-windows-10/) + + Please let us know if this helps. diff --git a/.github/stale.yml b/.github/stale.yml new file mode 100644 index 00000000000..e1184ce37b4 --- /dev/null +++ b/.github/stale.yml @@ -0,0 +1,39 @@ + # Copyright 2019 The TensorFlow Authors. All Rights Reserved. + # + # Licensed under the Apache License, Version 2.0 (the "License"); + # you may not use this file except in compliance with the License. 
+ # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. + # ============================================================================ + # + # THIS IS A GENERATED DOCKERFILE. + # + # This file was assembled from multiple pieces, whose use is documented + # throughout. Please refer to the TensorFlow dockerfiles documentation + # for more information. + +# Number of days of inactivity before an Issue or Pull Request becomes stale +daysUntilStale: 7 +# Number of days of inactivity before a stale Issue or Pull Request is closed +daysUntilClose: 7 +# Issues or Pull Requests with these labels will never be considered stale. Set to `[]` to disable +onlyLabels: + - stat:awaiting response +# Comment to post when marking as stale. Set to `false` to disable +markComment: > + This issue has been automatically marked as stale because it has not had + recent activity. It will be closed if no further activity occurs. Thank you. +# Comment to post when removing the stale label. Set to `false` to disable +unmarkComment: false +closeComment: > + Closing as stale. Please reopen if you'd like to work on this further. +limitPerRun: 30 +# Limit to only `issues` or `pulls` +only: issues diff --git a/README.md b/README.md index 27032043e07..ba4597af14c 100644 --- a/README.md +++ b/README.md @@ -103,17 +103,17 @@ open-source software development: ### Official Builds -Build Type | Status | Artifacts ------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------- -**Linux CPU** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-cc.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-cc.html) | [PyPI](https://pypi.org/project/tf-nightly/) -**Linux GPU** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-gpu-py3.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-gpu-py3.html) | [PyPI](https://pypi.org/project/tf-nightly-gpu/) -**Linux XLA** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-xla.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-xla.html) | TBA -**macOS** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/macos-py2-cc.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/macos-py2-cc.html) | [PyPI](https://pypi.org/project/tf-nightly/) -**Windows CPU** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/windows-cpu.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/windows-cpu.html) | [PyPI](https://pypi.org/project/tf-nightly/) -**Windows GPU** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/windows-gpu.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/windows-gpu.html) | [PyPI](https://pypi.org/project/tf-nightly-gpu/) -**Android** | 
[![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/android.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/android.html) | [![Download](https://api.bintray.com/packages/google/tensorflow/tensorflow/images/download.svg)](https://bintray.com/google/tensorflow/tensorflow/_latestVersion) -**Raspberry Pi 0 and 1** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi01-py2.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi01-py2.html) [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi01-py3.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi01-py3.html) | [Py2](https://storage.googleapis.com/tensorflow-nightly/tensorflow-1.10.0-cp27-none-linux_armv6l.whl) [Py3](https://storage.googleapis.com/tensorflow-nightly/tensorflow-1.10.0-cp34-none-linux_armv6l.whl) -**Raspberry Pi 2 and 3** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi23-py2.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi23-py2.html) [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi23-py3.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi23-py3.html) | [Py2](https://storage.googleapis.com/tensorflow-nightly/tensorflow-1.10.0-cp27-none-linux_armv7l.whl) [Py3](https://storage.googleapis.com/tensorflow-nightly/tensorflow-1.10.0-cp34-none-linux_armv7l.whl) +Build Type | Status | Artifacts +------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------- +**Linux CPU** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-cc.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-cc.html) | [PyPI](https://pypi.org/project/tf-nightly/) +**Linux GPU** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-gpu-py3.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-gpu-py3.html) | [PyPI](https://pypi.org/project/tf-nightly-gpu/) +**Linux XLA** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-xla.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-xla.html) | TBA +**macOS** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/macos-py2-cc.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/macos-py2-cc.html) | [PyPI](https://pypi.org/project/tf-nightly/) +**Windows CPU** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/windows-cpu.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/windows-cpu.html) | [PyPI](https://pypi.org/project/tf-nightly/) +**Windows GPU** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/windows-gpu.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/windows-gpu.html) | [PyPI](https://pypi.org/project/tf-nightly-gpu/) +**Android** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/android.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/android.html) | [![Download](https://api.bintray.com/packages/google/tensorflow/tensorflow/images/download.svg)](https://bintray.com/google/tensorflow/tensorflow/_latestVersion) +**Raspberry Pi 0 and 1** | 
[![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi01-py3.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi01-py3.html) | [Py3](https://storage.googleapis.com/tensorflow-nightly/tensorflow-1.10.0-cp34-none-linux_armv6l.whl) +**Raspberry Pi 2 and 3** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi23-py3.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi23-py3.html) | [Py3](https://storage.googleapis.com/tensorflow-nightly/tensorflow-1.10.0-cp34-none-linux_armv7l.whl) ### Community Supported Builds diff --git a/RELEASE.md b/RELEASE.md index b5d088821e4..f251f6ceffa 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,3 +1,172 @@ +# Release 2.1.1 + +## Bug Fixes and Other Changes +* Updates `sqlite3` to `3.31.01` to handle [CVE-2019-19880](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-19880), [CVE-2019-19244](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-19244) and [CVE-2019-19645](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-19645) +* Updates `curl` to `7.69.1` to handle [CVE-2019-15601](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-15601) +* Updates `libjpeg-turbo` to `2.0.4` to handle [CVE-2018-19664](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2018-19664), [CVE-2018-20330](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2018-20330) and [CVE-2019-13960](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-13960) +* Updates Apache Spark to `2.4.5` to handle [CVE-2019-10099](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-10099), [CVE-2018-17190](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2018-17190) and [CVE-2018-11770](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2018-11770) +* Fixes a versioning bug which causes Keras layers from TF 1.x to be used instead of those from TF 2.x + +# Release 2.0.2 + +## Bug Fixes and Other Changes +* Updates `sqlite3` to `3.31.01` to handle [CVE-2019-19880](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-19880), [CVE-2019-19244](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-19244) and [CVE-2019-19645](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-19645) +* Updates `curl` to `7.69.1` to handle [CVE-2019-15601](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-15601) +* Updates `libjpeg-turbo` to `2.0.4` to handle [CVE-2018-19664](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2018-19664), [CVE-2018-20330](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2018-20330) and [CVE-2019-13960](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-13960) +* Updates Apache Spark to `2.4.5` to handle [CVE-2019-10099](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-10099), [CVE-2018-17190](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2018-17190) and [CVE-2018-11770](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2018-11770) + +# Release 1.15.3 + +## Bug Fixes and Other Changes +* Updates `sqlite3` to `3.31.01` to handle [CVE-2019-19880](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-19880), [CVE-2019-19244](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-19244) and [CVE-2019-19645](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-19645) +* Updates `curl` to `7.69.1` to handle [CVE-2019-15601](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-15601) +* Updates `libjpeg-turbo` to `2.0.4` to handle [CVE-2018-19664](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2018-19664), 
[CVE-2018-20330](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2018-20330) and [CVE-2019-13960](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-13960) +* Updates Apache Spark to `2.4.5` to handle [CVE-2019-10099](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-10099), [CVE-2018-17190](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2018-17190) and [CVE-2018-11770](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2018-11770) + +# Release 2.2.0 + +TensorFlow 2.2 discontinues support for Python 2, [previously announced](https://groups.google.com/a/tensorflow.org/d/msg/announce/gVwS5RC8mds/dCt1ka2XAAAJ) as following [Python 2's EOL on January 1, 2020](https://www.python.org/dev/peps/pep-0373/#update). + +Coinciding with this change, new releases of [TensorFlow's Docker images](https://hub.docker.com/r/tensorflow/tensorflow/) provide Python 3 exclusively. Because all images now use Python 3, Docker tags containing `-py3` will no longer be provided and existing `-py3` tags like `latest-py3` will not be updated. + +## Major Features and Improvements + +* Replaced the scalar type for string tensors from `std::string` to `tensorflow::tstring` which is now ABI stable. +* A new Profiler for TF 2 for CPU/GPU/TPU. It offers both device and host performance analysis, including input pipeline and TF Ops. Optimization advisory is provided whenever possible. Please see [this tutorial](https://www.tensorflow.org/tensorboard/tensorboard_profiling_keras) and [guide](https://www.tensorflow.org/guide/profiler) for usage guidelines. +* Export C++ functions to Python using `pybind11` as opposed to `SWIG` as a part of our [deprecation of swig efforts](https://github.com/tensorflow/community/blob/master/rfcs/20190208-pybind11.md). +* `tf.distribute`: + * Support added for global sync `BatchNormalization` by using the newly added `tf.keras.layers.experimental.SyncBatchNormalization` layer. This layer will sync `BatchNormalization` statistics every step across all replicas taking part in sync training. + * Performance improvements for GPU multi-worker distributed training using `tf.distribute.experimental.MultiWorkerMirroredStrategy` + * Update NVIDIA `NCCL` to `2.5.7-1` for better performance and performance tuning. Please see [nccl developer guide](https://docs.nvidia.com/deeplearning/sdk/nccl-developer-guide/docs/env.html) for more information on this. + * Support gradient `allreduce` in `float16`. See this [example](https://github.com/tensorflow/models/blob/master/official/staging/training/grad_utils.py) usage. + * Experimental support of [all reduce gradient packing](https://www.tensorflow.org/api_docs/python/tf/distribute/experimental/CollectiveHints) to allow overlapping gradient aggregation with backward path computation. + * Deprecated `experimental_run_v2` method for distribution strategies and renamed the method `run` as it is no longer experimental. + * Add CompositeTensor support for DistributedIterators. This should help prevent unnecessary function retracing and memory leaks. +* `tf.keras`: + * `Model.fit` major improvements: + * You can now use custom training logic with `Model.fit` by overriding `Model.train_step`. 
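To make the `Model.train_step` bullet above concrete, here is a minimal sketch of an overridden training step, following the same pattern as the default implementation linked below; the class name and the assumption that `data` arrives as `(x, y)` pairs are illustrative, not part of the release notes.

```python
import tensorflow as tf

class CustomModel(tf.keras.Model):
    def train_step(self, data):
        # `data` is whatever `Model.fit` passes in; assumed here to be (x, y) pairs.
        x, y = data
        with tf.GradientTape() as tape:
            y_pred = self(x, training=True)
            # `compiled_loss` / `compiled_metrics` are populated by `Model.compile`.
            loss = self.compiled_loss(y, y_pred, regularization_losses=self.losses)
        gradients = tape.gradient(loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
        self.compiled_metrics.update_state(y, y_pred)
        # Return a dict of metric results, as `Model.fit` expects.
        return {m.name: m.result() for m in self.metrics}
```

After this, `compile()` and `fit()` are used as usual; `Model.fit` still handles distribution strategies, callbacks, and data formats around the custom step.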
+ * Easily write state-of-the-art training loops without worrying about all of the features `Model.fit` handles for you (distribution strategies, callbacks, data formats, looping logic, etc) + * See the default [`Model.train_step`](https://github.com/tensorflow/tensorflow/blob/1381fc8e15e22402417b98e3881dfd409998daea/tensorflow/python/keras/engine/training.py#L540) for an example of what this function should look like. Same applies for validation and inference via `Model.test_step` and `Model.predict_step`. + * SavedModel uses its own `Model._saved_model_inputs_spec` attr now instead of + relying on `Model.inputs` and `Model.input_names`, which are no longer set for subclass Models. + This attr is set in eager, `tf.function`, and graph modes. This gets rid of the need for users to + manually call `Model._set_inputs` when using Custom Training Loops(CTLs). + * Dynamic shapes are supported for generators by calling the Model on the first batch we "peek" from the generator. + This used to happen implicitly in `Model._standardize_user_data`. Long-term, a solution where the + `DataAdapter` doesn't need to call the Model is probably preferable. + * The SavedModel format now supports all Keras built-in layers (including metrics, preprocessing layers, and stateful RNN layers) + * Update Keras batch normalization layer to use the running mean and average computation in the `fused_batch_norm`. You should see significant performance improvements when using `fused_batch_norm` in Eager mode. + +* `tf.lite`: + * Enable TFLite experimental new converter by default. +* XLA + * XLA now builds and works on windows. All prebuilt packages come with XLA available. + * XLA can be [enabled for a `tf.function`](https://www.tensorflow.org/xla#explicit_compilation_with_tffunction +) with “compile or throw exception” semantics on CPU and GPU. + +## Breaking Changes +* `tf.keras`: + * In `tf.keras.applications` the name of the "top" layer has been standardized to "predictions". This is only a problem if your code relies on the exact name of the layer. + * Huber loss function has been updated to be consistent with other Keras losses. It now computes mean over the last axis of per-sample losses before applying the reduction function. +* AutoGraph no longer converts functions passed to `tf.py_function`, `tf.py_func` and `tf.numpy_function`. +* Deprecating `XLA_CPU` and `XLA_GPU` devices with this release. +* Increasing the minimum bazel version to build TF to 2.0.0 to use Bazel's `cc_experimental_shared_library`. +* Keras compile/fit behavior for functional and subclassed models have been unified. Model properties such as `metrics`, `metrics_names` will now be available only after **training/evaluating the model on actual data** for functional models. `metrics` will **now include** model `loss` and output losses.`loss_functions` property has been removed from the model. This was an undocumented property that was accidentally public and has now been removed. + +## Known Caveats +* The current TensorFlow release now **requires** [gast](https://pypi.org/project/gast/) version 0.3.3. + +## Bug Fixes and Other Changes +* `tf.data`: + * Removed `autotune_algorithm` from experimental optimization options. +* TF Core: + * `tf.constant` always creates CPU tensors irrespective of the current device context. + * Eager `TensorHandles` maintain a list of mirrors for any copies to local or remote devices. This avoids any redundant copies due to op execution. 
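As a small illustration of the explicit XLA compilation bullet earlier in these notes, the sketch below compiles a `tf.function` with "compile or throw exception" semantics; `experimental_compile=True` is the flag name in this release series (later renamed), and the toy computation is made up for the example.

```python
import tensorflow as tf

# Explicitly request XLA compilation; an error is raised if the
# function cannot be compiled, per the "compile or throw" semantics.
@tf.function(experimental_compile=True)
def dense_layer(x, w, b):
    return tf.nn.relu(tf.matmul(x, w) + b)

x = tf.random.normal([8, 16])
w = tf.random.normal([16, 4])
b = tf.zeros([4])
print(dense_layer(x, w, b).shape)  # (8, 4)
```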
+ * For `tf.Tensor` & `tf.Variable`, `.experimental_ref()` is no longer experimental and is available as simply `.ref()`. + * `pfor/vectorized_map`: Added support for vectorizing 56 more ops. Vectorizing `tf.cond` is also supported now. + * Set as much partial shape as we can infer statically within the gradient impl of the gather op. + * Gradient of `tf.while_loop` emits `StatelessWhile` op if `cond` and body functions are stateless. This allows multiple gradients while ops to run in parallel under distribution strategy. + * Speed up `GradientTape` in eager mode by auto-generating list of op inputs/outputs which are unused and hence not cached for gradient functions. + * Support `back_prop=False` in `while_v2` but mark it as deprecated. + * Improve error message when attempting to use `None` in data-dependent control flow. + * Add `RaggedTensor.numpy()`. + * Update `RaggedTensor.__getitem__` to preserve uniform dimensions & allow indexing into uniform dimensions. + * Update `tf.expand_dims` to always insert the new dimension as a non-ragged dimension. + * Update `tf.embedding_lookup` to use `partition_strategy` and `max_norm` when `ids` is ragged. + * Allow `batch_dims==rank(indices)` in `tf.gather`. + * Add support for bfloat16 in `tf.print`. +* `tf.distribute`: + * Support `embedding_column` with variable-length input features for `MultiWorkerMirroredStrategy`. +* `tf.keras`: + * Added `experimental_aggregate_gradients` argument to `tf.keras.optimizer.Optimizer.apply_gradients`. This allows custom gradient aggregation and processing aggregated gradients in custom training loop. + * Allow `pathlib.Path` paths for loading models via Keras API. +* `tf.function`/AutoGraph: + * AutoGraph is now available in `ReplicaContext.merge_call`, `Strategy.extended.update` and `Strategy.extended.update_non_slot`. + * Experimental support for shape invariants has been enabled in `tf.function`. See the API docs for `tf.autograph.experimental.set_loop_options` for additonal info. + * AutoGraph error messages now exclude frames corresponding to APIs internal to AutoGraph. + * Improve shape inference for `tf.function` input arguments to unlock more Grappler optimizations in TensorFlow 2.x. + * Improve automatic control dependency management of resources by allowing resource reads to occur in parallel and synchronizing only on writes. + * Fix execution order of multiple stateful calls to `experimental_run_v2` in `tf.function`. + * You can now iterate over `RaggedTensors` using a for loop inside `tf.function`. +* `tf.lite`: + * Migrated the `tf.lite` C inference API out of experimental into lite/c. + * Add an option to disallow `NNAPI` CPU / partial acceleration on Android 10 + * TFLite Android AARs now include the C headers and APIs are required to use TFLite from native code. + * Refactors the delegate and delegate kernel sources to allow usage in the linter. + * Limit delegated ops to actually supported ones if a device name is specified or `NNAPI` CPU Fallback is disabled. + * TFLite now supports `tf.math.reciprocal1` op by lowering to `tf.div op`. + * TFLite's unpack op now supports boolean tensor inputs. + * Microcontroller and embedded code moved from experimental to main TensorFlow Lite folder + * Check for large TFLite tensors. + * Fix GPU delegate crash with C++17. + * Add 5D support to TFLite `strided_slice`. + * Fix error in delegation of `DEPTH_TO_SPACE` to `NNAPI` causing op not to be accelerated. 
+ * Fix segmentation fault when running a model with LSTM nodes using `NNAPI` Delegate + * Fix `NNAPI` delegate failure when an operand for Maximum/Minimum operation is a scalar. + * Fix `NNAPI` delegate failure when Axis input for reduce operation is a scalar. + * Expose option to limit the number of partitions that will be delegated to `NNAPI`. + * If a target accelerator is specified, use its feature level to determine operations to delegate instead of SDK version. +* `tf.random`: + * Various random number generation improvements: + * Add a fast path for default `random_uniform` + * `random_seed` documentation improvement. + * `RandomBinomial` broadcasts and appends the sample shape to the left rather than the right. + * Added `tf.random.stateless_binomial`, `tf.random.stateless_gamma`, `tf.random.stateless_poisson` + * `tf.random.stateless_uniform` now supports unbounded sampling of `int` types. +* Math and Linear Algebra: + * Add `tf.linalg.LinearOperatorTridiag`. + * Add `LinearOperatorBlockLowerTriangular` + * Add broadcasting support to tf.linalg.triangular_solve[#26204](https://github.com/tensorflow/tensorflow/issues/26204), tf.math.invert_permutation. + * Add `tf.math.sobol_sample` op. + * Add `tf.math.xlog1py`. + * Add `tf.math.special.{dawsn,expi,fresnel_cos,fresnel_sin,spence}`. + * Add a Modified Discrete Cosine Transform (MDCT) and its inverse to `tf.signal`. +* TPU Enhancements: + * Refactor `TpuClusterResolver` to move shared logic to a separate pip package. + * Support configuring TPU software version from cloud tpu client. + * Allowed TPU embedding weight decay factor to be multiplied by learning rate. +* XLA Support: + * Add standalone XLA AOT runtime target + relevant .cc sources to pip package. + * Add check for memory alignment to MemoryAllocation::MemoryAllocation() on 32-bit ARM. This ensures a deterministic early exit instead of a hard to debug bus error later. + * `saved_model_cli aot_compile_cpu` allows you to compile saved models to XLA header+object files and include them in your C++ programs. + * Enable `Igamma`, `Igammac` for XLA. +* Deterministic Op Functionality: + * XLA reduction emitter is deterministic when the environment variable `TF_DETERMINISTIC_OPS` is set to "true" or "1". This extends deterministic `tf.nn.bias_add` back-prop functionality (and therefore also deterministic back-prop of bias-addition in Keras layers) to include when XLA JIT complilation is enabled. + * Fix problem, when running on a CUDA GPU and when either environment variable `TF_DETERMINSTIC_OPS` or environment variable `TF_CUDNN_DETERMINISTIC` is set to "true" or "1", in which some layer configurations led to an exception with the message "No algorithm worked!" +* Tracing and Debugging: + * Add source, destination name to `_send` traceme to allow easier debugging. + * Add traceme event to `fastpathexecute`. +* Other: + * Fix an issue with AUC.reset_states for multi-label AUC [#35852](https://github.com/tensorflow/tensorflow/issues/35852) + * Fix the TF upgrade script to not delete files when there is a parsing error and the output mode is `in-place`. + * Move `tensorflow/core:framework/*_pyclif` rules to `tensorflow/core/framework:*_pyclif`. + +## Thanks to our Contributors + +This release contains contributions from many people at Google, as well as: + +372046933, 8bitmp3, aaronhma, Abin Shahab, Aditya Patwardhan, Agoniii, Ahti Kitsik, Alan Yee, Albin Joy, Alex Hoffman, Alexander Grund, Alexandre E. 
Eichenberger, Amit Kumar Jaiswal, amoitra, Andrew Anderson, Angus-Luo, Anthony Barbier, Anton Kachatkou, Anuj Rawat, archis, Arpan-Dhatt, Arvind Sundararajan, Ashutosh Hathidara, autoih, Bairen Yi, Balint Cristian, Bas Aarts, BashirSbaiti, Basit Ayantunde, Ben Barsdell, Benjamin Gaillard, boron, Brett Koonce, Bryan Cutler, Christian Goll, Christian Sachs, Clayne Robison, comet, Daniel Falbel, Daria Zhuravleva, darsh8200, David Truby, Dayananda-V, deepakm, Denis Khalikov, Devansh Singh, Dheeraj R Reddy, Diederik Van Liere, Diego Caballero, Dominic Jack, dothinking, Douman, Drake Gens, Duncan Riach, Ehsan Toosi, ekuznetsov139, Elena Zhelezina, elzino, Ending2015a, Eric Schweitz, Erik Zettel, Ethan Saadia, Eugene Kuznetsov, Evgeniy Zheltonozhskiy, Ewout Ter Hoeven, exfalso, FAIJUL, Fangjun Kuang, Fei Hu, Frank Laub, Frederic Bastien, Fredrik Knutsson, frreiss, Frédéric Rechtenstein, fsx950223, Gaurav Singh, gbaned, George Grzegorz Pawelczak, George Sterpu, Gian Marco Iodice, Giorgio Arena, Hans Gaiser, Hans Pabst, Haoyu Wu, Harry Slatyer, hsahovic, Hugo, Hugo Sjöberg, IrinaM21, jacco, Jake Tae, Jean-Denis Lesage, Jean-Michel Gorius, Jeff Daily, Jens Elofsson, Jerry Shih, jerryyin, Jin Mingjian, Jinjing Zhou, JKIsaacLee, jojimonv, Jonathan Dekhtiar, Jose Ignacio Gomez, Joseph-Rance, Judd, Julian Gross, Kaixi Hou, Kaustubh Maske Patil, Keunwoo Choi, Kevin Hanselman, Khor Chean Wei, Kilaru Yasaswi Sri Chandra Gandhi, Koan-Sin Tan, Koki Ibukuro, Kristian Holsheimer, kurileo, Lakshay Tokas, Lee Netherton, leike666666, Leslie-Fang-Intel, Li, Guizi, LIUJIAN435, Lukas Geiger, Lyo Nguyen, madisetti, Maher Jendoubi, Mahmoud Abuzaina, Manuel Freiberger, Marcel Koester, Marco Jacopo Ferrarotti, Markus Franke, marload, Mbah-Javis, mbhuiyan, Meng Zhang, Michael Liao, MichaelKonobeev, Michal Tarnowski, Milan Straka, minoring, Mohamed Nour Abouelseoud, MoussaMM, Mrinal Jain, mrTsjolder, Måns Nilsson, Namrata Bhave, Nicholas Gao, Niels Ole Salscheider, nikochiko, Niranjan Hasabnis, Nishidha Panpaliya, nmostafa, Noah Trenaman, nuka137, Officium, Owen L - Sfe, Pallavi G, Paul Andrey, Peng Sun, Peng Wu, Phil Pearl, PhilipMay, pingsutw, Pooya Davoodi, PragmaTwice, pshiko, Qwerty71, R Gomathi, Rahul Huilgol, Richard Xiao, Rick Wierenga, Roberto Rosmaninho, ruchit2801, Rushabh Vasani, Sami, Sana Damani, Sarvesh Dubey, Sasan Jafarnejad, Sergii Khomenko, Shane Smiskol, Shaochen Shi, sharkdtu, Shawn Presser, ShengYang1, Shreyash Patodia, Shyam Sundar Dhanabalan, Siju Samuel, Somyajit Chakraborty Sam, Srihari Humbarwadi, srinivasan.narayanamoorthy, Srishti Yadav, Steph-En-M, Stephan Uphoff, Stephen Mugisha, SumanSudhir, Taehun Kim, Tamas Bela Feher, TengLu, Tetragramm, Thierry Herrmann, Tian Jin, tigertang, Tom Carchrae, Tom Forbes, Trent Lo, Victor Peng, vijayphoenix, Vincent Abriou, Vishal Bhola, Vishnuvardhan Janapati, vladbataev, VoVAllen, Wallyss Lima, Wen-Heng (Jack) Chung, wenxizhu, William D. Irons, William Zhang, Xiaoming (Jason) Cui, Xiaoquan Kong, Xinan Jiang, Yasir Modak, Yasuhiro Matsumoto, Yaxun (Sam) Liu, Yong Tang, Ytyt-Yt, yuan, Yuan Mingshuai, Yuan Tang, Yuki Ueda, Yusup, zhangshijin, zhuwenxi + # Release 2.0.1 ## Bug Fixes and Other Changes diff --git a/SECURITY.md b/SECURITY.md index 6fc2c3aa9cc..f3a6c148b2e 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -64,7 +64,7 @@ your model, and we recommend you run the TensorFlow process in a sandbox. It is possible to write models that are secure in a sense that they can safely process untrusted inputs assuming there are no bugs. 
There are two main reasons -to not rely on this: first, it is easy to write models which must not be exposed +to not rely on this: First, it is easy to write models which must not be exposed to untrusted inputs, and second, there are bugs in any software system of sufficient complexity. Letting users control inputs could allow them to trigger bugs either in TensorFlow or in dependent libraries. @@ -149,7 +149,7 @@ attack (or worse). Because TensorFlow behaves correctly, this is not a vulnerability in TensorFlow (although it would be a vulnerability of this hypothetical system). -As a general rule, it is incorrect behavior for Tensorflow to access memory it +As a general rule, it is incorrect behavior for TensorFlow to access memory it does not own, or to terminate in an unclean way. Bugs in TensorFlow that lead to such behaviors constitute a vulnerability. diff --git a/configure b/configure index 66b66ba54ed..e43908e39da 100755 --- a/configure +++ b/configure @@ -4,7 +4,7 @@ set -e set -o pipefail if [ -z "$PYTHON_BIN_PATH" ]; then - PYTHON_BIN_PATH=$(which python || which python3 || true) + PYTHON_BIN_PATH=$(which python3 || which python || true) fi # Set all env variables diff --git a/configure.py b/configure.py index fcce0ccd061..9154000d944 100644 --- a/configure.py +++ b/configure.py @@ -50,7 +50,7 @@ _TF_WORKSPACE_ROOT = '' _TF_BAZELRC = '' _TF_CURRENT_BAZEL_VERSION = None _TF_MIN_BAZEL_VERSION = '2.0.0' -_TF_MAX_BAZEL_VERSION = '2.0.0' +_TF_MAX_BAZEL_VERSION = '3.99.0' NCCL_LIB_PATHS = [ 'lib64/', 'lib/powerpc64le-linux-gnu/', 'lib/x86_64-linux-gnu/', '' @@ -144,7 +144,7 @@ def write_to_bazelrc(line): def write_action_env_to_bazelrc(var_name, var): - write_to_bazelrc('build --action_env %s="%s"' % (var_name, str(var))) + write_to_bazelrc('build --action_env {}="{}"'.format(var_name, str(var))) def run_shell(cmd, allow_non_zero=False, stderr=None): @@ -205,7 +205,7 @@ def setup_python(environ_cp): # Get PYTHON_BIN_PATH, default is the current running python. default_python_bin_path = sys.executable ask_python_bin_path = ('Please specify the location of python. [Default is ' - '%s]: ') % default_python_bin_path + '{}]: ').format(default_python_bin_path) while True: python_bin_path = get_from_env_or_user_or_default(environ_cp, 'PYTHON_BIN_PATH', @@ -215,9 +215,10 @@ def setup_python(environ_cp): if os.path.isfile(python_bin_path) and os.access(python_bin_path, os.X_OK): break elif not os.path.exists(python_bin_path): - print('Invalid python path: %s cannot be found.' % python_bin_path) + print('Invalid python path: {} cannot be found.'.format(python_bin_path)) else: - print('%s is not executable. Is it the python binary?' % python_bin_path) + print('{} is not executable. Is it the python binary?'.format( + python_bin_path)) environ_cp['PYTHON_BIN_PATH'] = '' # Convert python path to Windows style before checking lib and version @@ -236,7 +237,7 @@ def setup_python(environ_cp): default_python_lib_path = python_lib_paths[0] python_lib_path = get_input( 'Please input the desired Python library path to use. 
' - 'Default is [%s]\n' % python_lib_paths[0]) + 'Default is [{}]\n'.format(python_lib_paths[0])) if not python_lib_path: python_lib_path = default_python_lib_path environ_cp['PYTHON_LIB_PATH'] = python_lib_path @@ -252,7 +253,7 @@ def setup_python(environ_cp): # Set-up env variables used by python_configure.bzl write_action_env_to_bazelrc('PYTHON_BIN_PATH', python_bin_path) write_action_env_to_bazelrc('PYTHON_LIB_PATH', python_lib_path) - write_to_bazelrc('build --python_path=\"%s"' % python_bin_path) + write_to_bazelrc('build --python_path=\"{}"'.format(python_bin_path)) environ_cp['PYTHON_BIN_PATH'] = python_bin_path # If choosen python_lib_path is from a path specified in the PYTHONPATH @@ -266,7 +267,7 @@ def setup_python(environ_cp): with open( os.path.join(_TF_WORKSPACE_ROOT, 'tools', 'python_bin_path.sh'), 'w') as f: - f.write('export PYTHON_BIN_PATH="%s"' % python_bin_path) + f.write('export PYTHON_BIN_PATH="{}"'.format(python_bin_path)) def reset_tf_configure_bazelrc(): @@ -320,11 +321,12 @@ def get_var(environ_cp, Raise the error to avoid infinitely looping. """ if not question: - question = 'Do you wish to build TensorFlow with %s support?' % query_item + question = 'Do you wish to build TensorFlow with {} support?'.format( + query_item) if not yes_reply: - yes_reply = '%s support will be enabled for TensorFlow.' % query_item + yes_reply = '{} support will be enabled for TensorFlow.'.format(query_item) if not no_reply: - no_reply = 'No %s' % yes_reply + no_reply = 'No {}'.format(yes_reply) yes_reply += '\n' no_reply += '\n' @@ -368,7 +370,7 @@ def get_var(environ_cp, print(no_reply) var = False else: - print('Invalid selection: %s' % user_input_origin) + print('Invalid selection: {}'.format(user_input_origin)) return var @@ -479,13 +481,13 @@ def check_bazel_version(min_version, max_version): if which('bazel') is None: print('Cannot find bazel. Please install bazel.') sys.exit(0) - curr_version = run_shell( - ['bazel', '--batch', '--bazelrc=/dev/null', 'version']) - for line in curr_version.split('\n'): - if 'Build label: ' in line: - curr_version = line.split('Build label: ')[1] - break + stderr = open(os.devnull, 'wb') + curr_version = run_shell(['bazel', '--version'], + allow_non_zero = True, + stderr = stderr) + if curr_version.startswith('bazel '): + curr_version = curr_version.split('bazel ')[1] min_version_int = convert_version_to_int(min_version) curr_version_int = convert_version_to_int(curr_version) @@ -1171,14 +1173,16 @@ def system_specific_test_config(environ_cp): test_only_filters = ['-oss_serial'] if is_windows(): test_and_build_filters.append('-no_windows') - if environ_cp.get('TF_NEED_CUDA', None) == '1': + if ((environ_cp.get('TF_NEED_CUDA', None) == '1') or + (environ_cp.get('TF_NEED_ROCM', None) == '1')): test_and_build_filters += ['-no_windows_gpu', '-no_gpu'] else: test_and_build_filters.append('-gpu') elif is_macos(): test_and_build_filters += ['-gpu', '-nomac', '-no_mac'] elif is_linux(): - if environ_cp.get('TF_NEED_CUDA', None) == '1': + if ((environ_cp.get('TF_NEED_CUDA', None) == '1') or + (environ_cp.get('TF_NEED_ROCM', None) == '1')): test_and_build_filters.append('-no_gpu') write_to_bazelrc('test --test_env=LD_LIBRARY_PATH') else: @@ -1383,7 +1387,6 @@ def main(): # Windows. 
environ_cp['TF_DOWNLOAD_CLANG'] = '0' environ_cp['TF_NEED_MPI'] = '0' - environ_cp['TF_SET_ANDROID_WORKSPACE'] = '0' if is_macos(): environ_cp['TF_NEED_TENSORRT'] = '0' @@ -1416,6 +1419,10 @@ def main(): write_action_env_to_bazelrc('LD_LIBRARY_PATH', environ_cp.get('LD_LIBRARY_PATH')) + if (environ_cp.get('TF_NEED_ROCM') == '1' and environ_cp.get('ROCM_PATH')): + write_action_env_to_bazelrc('ROCM_PATH', environ_cp.get('ROCM_PATH')) + write_action_env_to_bazelrc('ROCM_ROOT', environ_cp.get('ROCM_PATH')) + environ_cp['TF_NEED_CUDA'] = str( int(get_var(environ_cp, 'TF_NEED_CUDA', 'CUDA', False))) if (environ_cp.get('TF_NEED_CUDA') == '1' and diff --git a/tensorflow/BUILD b/tensorflow/BUILD index 36ce3fa4fe5..ab4316d5ed0 100644 --- a/tensorflow/BUILD +++ b/tensorflow/BUILD @@ -517,12 +517,26 @@ package_group( "//perftools/accelerators/xprof/api/...", "//third_party/py/autograph/...", "//third_party/swift/tensorflow/x10/...", + "//third_party/swift/tensorflow_apis/...", "//tensorflow/...", "//tensorflow_estimator/python/estimator/...", "//tensorflow_models/official/...", ], ) +package_group(name = "ndarray_tensor_allow_list") + +# Packages that use composite tensors or dispatch. +# TODO(b/154762408) Remove this package group once it's no longer needed. +package_group(name = "composite_tensor_whitelist") + +# Packages that use private types symbols, until they are exported. +# TODO(b/154650521) Remove. +package_group( + name = "types_whitelist", + packages = ["//learning/deepmind/tensorflow/replicator/..."], +) + filegroup( name = "intel_binary_blob", data = if_mkl_ml( @@ -709,8 +723,8 @@ tf_cc_shared_object( "//tensorflow/c:version_script.lds", "//tensorflow/c/eager:c_api", "//tensorflow/c/eager:c_api_experimental", + "//tensorflow/core:distributed_tensorflow_dependencies", "//tensorflow/core:tensorflow", - "//tensorflow/core/distributed_runtime/rpc:grpc_session", ], ) diff --git a/tensorflow/api_template.__init__.py b/tensorflow/api_template.__init__.py index d22eafada16..f0f977aa0b5 100644 --- a/tensorflow/api_template.__init__.py +++ b/tensorflow/api_template.__init__.py @@ -116,7 +116,7 @@ from tensorflow.python.lib.io import file_io as _fi # Get sitepackages directories for the python installation. _site_packages_dirs = [] -_site_packages_dirs += [_site.USER_SITE] +_site_packages_dirs += [] if _site.USER_SITE is None else [_site.USER_SITE] _site_packages_dirs += [_p for _p in _sys.path if 'site-packages' in _p] if 'getsitepackages' in dir(_site): _site_packages_dirs += _site.getsitepackages() diff --git a/tensorflow/api_template_v1.__init__.py b/tensorflow/api_template_v1.__init__.py index f2856f893bb..dad91f2d5b2 100644 --- a/tensorflow/api_template_v1.__init__.py +++ b/tensorflow/api_template_v1.__init__.py @@ -126,7 +126,7 @@ from tensorflow.python.lib.io import file_io as _fi # Get sitepackages directories for the python installation. 
_site_packages_dirs = [] -_site_packages_dirs += [_site.USER_SITE] +_site_packages_dirs += [] if _site.USER_SITE is None else [_site.USER_SITE] _site_packages_dirs += [_p for _p in _sys.path if 'site-packages' in _p] if 'getsitepackages' in dir(_site): _site_packages_dirs += _site.getsitepackages() diff --git a/tensorflow/c/BUILD b/tensorflow/c/BUILD index 9bc96ff5242..e2781afc3e5 100644 --- a/tensorflow/c/BUILD +++ b/tensorflow/c/BUILD @@ -58,6 +58,7 @@ filegroup( name = "pywrap_required_hdrs", srcs = [ "c_api_internal.h", + "conversion_macros.h", "python_api.h", "tensor_interface.h", "tf_status_helper.h", @@ -84,7 +85,14 @@ tf_cuda_library( ], deps = select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", + ], + "//tensorflow:chromiumos": [ + ":tf_attrtype", + "//tensorflow/core:core_cpu", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core/platform:platform", ], "//conditions:default": [ ":tf_attrtype", @@ -118,6 +126,13 @@ cc_library( visibility = ["//visibility:public"], ) +cc_library( + name = "c_api_macros", + hdrs = ["c_api_macros.h"], + copts = tf_copts(), + visibility = ["//visibility:public"], +) + tf_cuda_library( name = "c_api", hdrs = [ @@ -167,7 +182,7 @@ tf_cuda_library( ":tf_status_internal", ] + select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], "//conditions:default": [ ":tf_status", @@ -204,7 +219,7 @@ tf_cuda_library( ], deps = select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", # TODO(annarev): exclude runtime srcs ], "//conditions:default": [ "//tensorflow/core:lib", @@ -217,12 +232,13 @@ cc_library( srcs = ["tf_status.cc"], hdrs = ["tf_status.h"], visibility = ["//visibility:public"], - deps = select({ + deps = [ + ":tf_status_internal", + ] + select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", # TODO(annarev): exclude runtime srcs ], "//conditions:default": [ - ":tf_status_internal", "//tensorflow/core:lib", ], }), @@ -244,10 +260,15 @@ cc_library( name = "tensor_interface", hdrs = ["tensor_interface.h"], visibility = ["//tensorflow:internal"], - deps = [ - "//tensorflow/core:lib", - "//tensorflow/core:protos_all_cc", - ], + deps = select({ + "//tensorflow:android": [ + "//tensorflow/core:portable_tensorflow_lib_lite", # TODO(annarev): exclude runtime srcs + ], + "//conditions:default": [ + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + ], + }), ) cc_library( @@ -257,7 +278,7 @@ cc_library( visibility = ["//visibility:public"], deps = select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", # TODO(annarev): exclude runtime srcs + "//tensorflow/core:portable_tensorflow_lib_lite", # TODO(annarev): exclude runtime srcs ], "//conditions:default": [ "//tensorflow/core:framework", @@ -271,16 +292,17 @@ cc_library( srcs = ["tf_tensor.cc"], hdrs = ["tf_tensor.h"], visibility = ["//visibility:public"], - deps = select({ + deps = [ + ":tensor_interface", + ":tf_datatype", + ":tf_status", + ":tf_status_helper", + ":tf_tensor_internal", + ] + select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", # TODO(annarev): exclude runtime srcs ], "//conditions:default": [ - ":tensor_interface", 
- ":tf_datatype", - ":tf_status", - ":tf_status_helper", - ":tf_tensor_internal", "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", @@ -296,14 +318,15 @@ tf_cuda_library( "tf_tensor_internal.h", ], visibility = ["//tensorflow:internal"], - deps = select({ + deps = [ + ":tensor_interface", + ":tf_datatype", + ":tf_status", + ] + select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", # TODO(annarev): exclude runtime srcs ], "//conditions:default": [ - ":tensor_interface", - ":tf_datatype", - ":tf_status", "//tensorflow/core:framework", "//tensorflow/core:protos_all_cc", "//tensorflow/core/platform:casts", @@ -327,6 +350,9 @@ tf_cuda_library( ":checkpoint_reader", "//tensorflow/c/eager:c_api", "//tensorflow/c/eager:c_api_internal", + "//tensorflow/c/eager:tfe_context_internal", + "//tensorflow/c/eager:tfe_op_internal", + "//tensorflow/c/eager:tfe_tensorhandle_internal", "//tensorflow/compiler/jit:flags", "//tensorflow/core:core_cpu", "//tensorflow/core:framework", @@ -368,8 +394,14 @@ tf_cuda_library( deps = [ ":tf_status", ":tf_status_internal", - "//tensorflow/core:lib", - ], + ] + select({ + "//tensorflow:android": [ + "//tensorflow/core:portable_tensorflow_lib_lite", # TODO(annarev): exclude runtime srcs + ], + "//conditions:default": [ + "//tensorflow/core:lib", + ], + }), ) tf_cc_test( @@ -408,7 +440,7 @@ tf_cuda_library( visibility = ["//visibility:public"], deps = select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], "//conditions:default": [ "//tensorflow/core:framework", @@ -439,7 +471,7 @@ tf_cuda_library( ] + select({ "//tensorflow:android": [ ":c_api_internal", - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], "//conditions:default": [ ":c_api_internal", @@ -466,7 +498,7 @@ tf_cuda_library( ":tf_status_helper", ] + select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], "//conditions:default": [ "//tensorflow/core:framework", @@ -517,12 +549,12 @@ tf_cuda_cc_test( ":test_op1.so", "//tensorflow/cc/saved_model:saved_model_half_plus_two", ], - kernels = [":test_op_kernel"], linkopts = select({ "//tensorflow:macos": ["-headerpad_max_install_names"], "//conditions:default": [], }), tags = [ + "no_windows", # TODO(b/155444728) "noasan", ], # We must ensure that the dependencies can be dynamically linked since @@ -531,6 +563,7 @@ tf_cuda_cc_test( deps = [ ":c_api", ":c_test_util", + ":test_op_kernel", "//tensorflow/cc:cc_ops", "//tensorflow/cc:grad_ops", "//tensorflow/cc/saved_model:signature_constants", @@ -597,6 +630,7 @@ tf_cc_test( ":c_api", ":c_api_internal", ":c_test_util", + "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:protos_all_cc", @@ -721,3 +755,11 @@ tf_cuda_library( ], alwayslink = 1, ) + +cc_library( + name = "conversion_macros", + hdrs = [ + "conversion_macros.h", + ], + visibility = ["//tensorflow:__subpackages__"], +) diff --git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc index bd1ada3e5d2..132761da4bf 100644 --- a/tensorflow/c/c_api.cc +++ b/tensorflow/c/c_api.cc @@ -39,6 +39,7 @@ limitations under the License. 
#include "tensorflow/c/tf_tensor.h" #include "tensorflow/core/common_runtime/device_mgr.h" #include "tensorflow/core/common_runtime/eval_const_tensor.h" +#include "tensorflow/core/common_runtime/graph_constructor.h" #include "tensorflow/core/common_runtime/shape_refiner.h" #include "tensorflow/core/framework/allocation_description.pb.h" #include "tensorflow/core/framework/kernel_def.pb.h" @@ -53,7 +54,6 @@ limitations under the License. #include "tensorflow/core/framework/types.h" #include "tensorflow/core/framework/versions.pb.h" #include "tensorflow/core/graph/graph.h" -#include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/graph/node_builder.h" #include "tensorflow/core/graph/validate.h" #include "tensorflow/core/lib/gtl/array_slice.h" diff --git a/tensorflow/c/c_api_experimental.cc b/tensorflow/c/c_api_experimental.cc index eb7bd61ee89..e9e6d470c68 100644 --- a/tensorflow/c/c_api_experimental.cc +++ b/tensorflow/c/c_api_experimental.cc @@ -21,6 +21,9 @@ limitations under the License. #include "tensorflow/c/checkpoint_reader.h" #include "tensorflow/c/eager/c_api.h" #include "tensorflow/c/eager/c_api_internal.h" +#include "tensorflow/c/eager/tfe_context_internal.h" +#include "tensorflow/c/eager/tfe_op_internal.h" +#include "tensorflow/c/eager/tfe_tensorhandle_internal.h" #include "tensorflow/compiler/jit/flags.h" #include "tensorflow/core/common_runtime/eager/attr_builder.h" #include "tensorflow/core/common_runtime/eager/context.h" @@ -322,205 +325,6 @@ TF_Buffer* TFE_GetServerDef(const char* text_proto, TF_Status* status) { return ret; } -TFE_Context* TFE_CreateContextFromSession(TF_Session* session, - TF_Status* status) { - auto* opts = TFE_NewContextOptions(); - - // Reduce GPU memory allocation, and set appropriate config options for TFE - // context. - auto* config = TF_CreateConfig( - /*xla*/ false, /* gpu_memory_allow_growth */ true, /* num_cpu_devices */ - 10); - TFE_ContextOptionsSetConfig(opts, config->data, config->length, status); - if (!status->status.ok()) { - CHECK(!config); - TFE_DeleteContextOptions(opts); - return nullptr; - } - - auto* ctx = TFE_NewContextFromSession(opts, session, status); - TF_DeleteBuffer(config); - TFE_DeleteContextOptions(opts); - return ctx; -} - -// TODO: retrieve the device string via TFE_ContextListDevices() -static const char DEFAULT_CPU_DEVICE[] = - "/job:localhost/replica:0/task:0/device:CPU:0"; - -static TFE_TensorHandle* createTFEQueue(TFE_Context* ctx, TF_DataType inputType, - int tensor_id, TF_Status* status) { - std::unique_ptr queueOp( - TFE_NewOp(ctx, "FIFOQueueV2", status), TFE_DeleteOp); - TFE_OpSetDevice(queueOp.get(), DEFAULT_CPU_DEVICE, status); - if (!status->status.ok()) return nullptr; - // TODO: use NAMED_TENSOR_QUEUE_CAPACITY in S4TF compiler. - TFE_OpSetAttrInt(queueOp.get(), "capacity", 1); - TFE_OpSetAttrTypeList(queueOp.get(), "component_types", &inputType, 1); - auto shared_name = tensorflow::strings::StrCat("fifo_queue_", tensor_id); - TFE_OpSetAttrString(queueOp.get(), "shared_name", shared_name.data(), - shared_name.size()); - TFE_OpSetAttrString(queueOp.get(), "container", "", 0); - - // TODO: consider making this an unknown shape. 
- const int64_t* dims_ptr = nullptr; - int num_dims = 0; - TFE_OpSetAttrShapeList(queueOp.get(), "shapes", &dims_ptr, &num_dims, - /*num_values*/ 0, status); - if (!status->status.ok()) return nullptr; - - int num_retvals = 1; - TFE_TensorHandle* queue = nullptr; - TFE_Execute(queueOp.get(), &queue, &num_retvals, status); - if (!status->status.ok()) return nullptr; - CHECK_EQ(num_retvals, 1); - - return queue; -} - -static void createTFEEnqueue(TFE_Context* ctx, TF_DataType inputType, - TFE_TensorHandle* queue, TFE_TensorHandle* tensor, - TF_Status* status) { - TFE_Op* op = TFE_NewOp(ctx, "QueueEnqueueV2", status); - if (!status->status.ok()) return; - std::unique_ptr op_deleter(op, TFE_DeleteOp); - TFE_OpSetDevice(op, DEFAULT_CPU_DEVICE, status); - if (!status->status.ok()) return; - TFE_OpAddInput(op, queue, status); - if (!status->status.ok()) return; - TFE_OpAddInput(op, tensor, status); - if (!status->status.ok()) return; - TFE_OpSetAttrTypeList(op, "Tcomponents", &inputType, 1); - TFE_OpSetAttrInt(op, "timeout_ms", -1); - - int num_retvals = 0; - TFE_Execute(op, nullptr /*retvals*/, &num_retvals, status); - if (!status->status.ok()) return; - CHECK_EQ(num_retvals, 0); -} - -static TFE_TensorHandle* createTFEDequeue(TFE_Context* ctx, - TF_DataType inputType, - TFE_TensorHandle* queue, - TF_Status* status) { - TFE_Op* op = TFE_NewOp(ctx, "QueueDequeueV2", status); - if (!status->status.ok()) return nullptr; - std::unique_ptr op_deleter(op, TFE_DeleteOp); - TFE_OpSetDevice(op, DEFAULT_CPU_DEVICE, status); - if (!status->status.ok()) return nullptr; - - TFE_OpAddInput(op, queue, status); - if (!status->status.ok()) return nullptr; - TFE_OpSetAttrTypeList(op, "component_types", &inputType, 1); - TFE_OpSetAttrInt(op, "timeout_ms", -1); - TFE_TensorHandle* ret; - int num_retvals = 1; - TFE_Execute(op, &ret, &num_retvals, status); - if (!status->status.ok()) return nullptr; - CHECK_EQ(num_retvals, 1); - return ret; -} - -TFE_TensorHandle* TFE_DequeueNamedTensor(TF_Session* session, int tensor_id, - TF_DataType inputType, - TF_Status* status) { - assert(session); - VLOG(1) << "Dequeuing data tensor with id " << tensor_id; - - auto ctx = TFE_CreateContextFromSession(session, status); - if (!status->status.ok()) return nullptr; - std::unique_ptr ctx_deleter( - ctx, TFE_DeleteContext); - - TFE_TensorHandle* queue = createTFEQueue(ctx, inputType, tensor_id, status); - if (!status->status.ok()) return nullptr; - std::unique_ptr - queue_deleter(queue, TFE_DeleteTensorHandle); - - auto* ret = createTFEDequeue(ctx, inputType, queue, status); - return ret; -} - -TFE_TensorHandle* TFE_DequeueNamedTensorFromCtx(TFE_Context* ctx, int tensor_id, - TF_DataType inputType, - TF_Status* status) { - TFE_TensorHandle* queue = createTFEQueue(ctx, inputType, tensor_id, status); - if (!status->status.ok()) return nullptr; - std::unique_ptr - queue_deleter(queue, TFE_DeleteTensorHandle); - - auto* ret = createTFEDequeue(ctx, inputType, queue, status); - - return ret; -} - -void TFE_EnqueueNamedTensor(TF_Session* session, int tensor_id, - TFE_TensorHandle* tensor, TF_Status* status) { - assert(session); - VLOG(1) << "Enqueuing data tensor with id " << tensor_id; - - auto ctx = TFE_CreateContextFromSession(session, status); - if (!status->status.ok()) return; - std::unique_ptr ctx_deleter( - ctx, TFE_DeleteContext); - - TF_DataType inputType = TFE_TensorHandleDataType(tensor); - TFE_TensorHandle* queue = createTFEQueue(ctx, inputType, tensor_id, status); - if (!status->status.ok()) return; - std::unique_ptr - 
queue_deleter(queue, TFE_DeleteTensorHandle); - - createTFEEnqueue(ctx, inputType, queue, tensor, status); -} - -void TFE_EnqueueNamedTensorFromCtx(TFE_Context* ctx, int tensor_id, - TFE_TensorHandle* tensor, - TF_Status* status) { - VLOG(1) << "Enqueuing data tensor with id " << tensor_id; - - TF_DataType inputType = TFE_TensorHandleDataType(tensor); - TFE_TensorHandle* queue = createTFEQueue(ctx, inputType, tensor_id, status); - if (!status->status.ok()) return; - std::unique_ptr - queue_deleter(queue, TFE_DeleteTensorHandle); - - createTFEEnqueue(ctx, inputType, queue, tensor, status); -} - -void TFE_EnqueueVariantTensor(TF_Session* session, int tensor_id, - TFE_TensorHandle* tensor, TF_Status* status) { - VLOG(1) << "Enqueuing variant tensor with id " << tensor_id; - - auto ctx = TFE_CreateContextFromSession(session, status); - if (!status->status.ok()) return; - std::unique_ptr ctx_deleter( - ctx, TFE_DeleteContext); - - TFE_TensorHandle* queue = createTFEQueue(ctx, TF_VARIANT, tensor_id, status); - if (!status->status.ok()) return; - std::unique_ptr - queue_deleter(queue, TFE_DeleteTensorHandle); - - createTFEEnqueue(ctx, TF_VARIANT, queue, tensor, status); -} - -TFE_TensorHandle* TFE_DequeueVariantTensor(TF_Session* session, int tensor_id, - TF_Status* status) { - VLOG(1) << "Dequeuing variant tensor with id " << tensor_id; - - auto ctx = TFE_CreateContextFromSession(session, status); - if (!status->status.ok()) return nullptr; - std::unique_ptr ctx_deleter( - ctx, TFE_DeleteContext); - - TFE_TensorHandle* queue = createTFEQueue(ctx, TF_VARIANT, tensor_id, status); - if (!status->status.ok()) return nullptr; - std::unique_ptr - queue_deleter(queue, TFE_DeleteTensorHandle); - - return createTFEDequeue(ctx, TF_VARIANT, queue, status); -} - void TF_MakeInternalErrorStatus(TF_Status* status, const char* errMsg) { status->status = tensorflow::errors::Internal(errMsg); } @@ -619,10 +423,9 @@ void TF_AttrBuilderSetType(TF_AttrBuilder* builder, const char* attr_name, void TF_AttrBuilderSetTypeList(TF_AttrBuilder* builder, const char* attr_name, const TF_DataType* values, int num_values) { auto iter = builder->attr_names.insert(attr_name).first; - builder->Set( - (*iter).c_str(), - tensorflow::gtl::ArraySlice( - reinterpret_cast(values), num_values)); + builder->Set(*iter, tensorflow::gtl::ArraySlice( + reinterpret_cast(values), + num_values)); } void TF_AttrBuilderCheckCanRunOnDevice(TF_AttrBuilder* builder, @@ -686,8 +489,7 @@ TFE_TensorHandle* TFE_NewTensorHandleFromScalar(TF_DataType data_type, std::memcpy(tensorflow::TensorCApi::Buffer(tensor)->data(), data, len); status->status = tensorflow::Status::OK(); - return new TFE_TensorHandle{ - tensorflow::TensorHandle::CreateLocalHandle(tensor)}; + return tensorflow::wrap(tensorflow::TensorHandle::CreateLocalHandle(tensor)); } namespace { @@ -708,7 +510,7 @@ tensorflow::Status EnableCollectiveOps(const tensorflow::ServerDef& server_def, // New server created for new server_def. Unused if updating server_def. 
tensorflow::EagerContext* context = - tensorflow::ContextFromInterface(ctx->context); + tensorflow::ContextFromInterface(tensorflow::unwrap(ctx)); tensorflow::GrpcServer* grpc_server = dynamic_cast(context->GetServer()); if (grpc_server == nullptr) { @@ -822,14 +624,13 @@ void TFE_InferShapes(TFE_Op* tfe_op, TF_ShapeAndTypeList* input_shapes, const int num_inputs = input_shapes->num_items; NodeDef node_def; - node_def.set_name(tfe_op->operation->Name()); - node_def.set_op(tfe_op->operation->Name()); + tensorflow::AbstractOperationInterface* op = tensorflow::unwrap(tfe_op); + node_def.set_name(op->Name()); + node_def.set_op(op->Name()); for (int i = 0; i < num_inputs; ++i) { node_def.add_input("dummy_input"); } - OperationFromInterface(tfe_op->operation) - ->Attrs() - .FillAttrValueMap(node_def.mutable_attr()); + OperationFromInterface(op)->Attrs().FillAttrValueMap(node_def.mutable_attr()); const tensorflow::OpRegistrationData* op_reg_data; status->status = diff --git a/tensorflow/c/c_api_experimental.h b/tensorflow/c/c_api_experimental.h index 551a45d92c4..d0ffbf125fb 100644 --- a/tensorflow/c/c_api_experimental.h +++ b/tensorflow/c/c_api_experimental.h @@ -146,48 +146,6 @@ TF_CAPI_EXPORT extern void TF_EnqueueNamedTensor(TF_Session* session, // Create a serialized tensorflow.ServerDef proto. TF_Buffer* TFE_GetServerDef(const char* text_proto, TF_Status* status); -// TODO: remove this API in favor of the next one. -TF_CAPI_EXPORT extern TFE_Context* TFE_NewContextFromSession( - const TFE_ContextOptions* opts, TF_Session* sess, TF_Status* status); - -// Creates from `session` a new eager context to run a graph function or -// sends/recvs, so that these concurrent TFE executions can share (via -// `session` and its associated device mgr) the same set of fifo queue resource -// ops, used for host<->TF tensor transfers. This way the sends/recvs calls and -// graph function execution can access the same fifo queue resource handles -// (associated with devices managed by the device manager, which can be obtained -// from `session`). -// -// TODO: Remove this function once we migrate away from using session. -TF_CAPI_EXPORT extern TFE_Context* TFE_CreateContextFromSession( - TF_Session* session, TF_Status* status); - -// TODO: Retire this API in favor of the next one. -TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_DequeueNamedTensor( - TF_Session* session, int tensor_id, TF_DataType inputType, - TF_Status* status); - -TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_DequeueNamedTensorFromCtx( - TFE_Context* ctx, int tensor_id, TF_DataType inputType, TF_Status* status); - -TF_CAPI_EXPORT extern void TFE_EnqueueNamedTensor(TF_Session* session, - int tensor_id, - TFE_TensorHandle* tensor, - TF_Status* status); - -TF_CAPI_EXPORT extern void TFE_EnqueueNamedTensorFromCtx( - TFE_Context* ctx, int tensor_id, TFE_TensorHandle* tensor, - TF_Status* status); - -// TODO: consider folding the 2 APIs below into the ones above. 
-TF_CAPI_EXPORT extern void TFE_EnqueueVariantTensor(TF_Session* session, - int tensor_id, - TFE_TensorHandle* tensor, - TF_Status* status); - -TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_DequeueVariantTensor( - TF_Session* session, int tensor_id, TF_Status* status); - TF_CAPI_EXPORT extern void TF_MakeInternalErrorStatus(TF_Status* status, const char* errMsg); diff --git a/tensorflow/c/c_api_function_test.cc b/tensorflow/c/c_api_function_test.cc index bbf645200c6..3fff9bcd371 100644 --- a/tensorflow/c/c_api_function_test.cc +++ b/tensorflow/c/c_api_function_test.cc @@ -16,6 +16,7 @@ limitations under the License. #include "tensorflow/c/c_api.h" #include "tensorflow/c/c_api_internal.h" #include "tensorflow/c/c_test_util.h" +#include "tensorflow/core/framework/common_shape_fns.h" #include "tensorflow/core/framework/function.pb.h" #include "tensorflow/core/framework/op_def.pb.h" #include "tensorflow/core/lib/hash/hash.h" diff --git a/tensorflow/c/c_api_internal.h b/tensorflow/c/c_api_internal.h index 4896087615d..0d128b23e32 100644 --- a/tensorflow/c/c_api_internal.h +++ b/tensorflow/c/c_api_internal.h @@ -38,7 +38,7 @@ limitations under the License. #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/graph/graph.h" -#include "tensorflow/core/graph/graph_constructor.h" +#include "tensorflow/core/common_runtime/graph_constructor.h" #include "tensorflow/core/graph/node_builder.h" #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/status.h" diff --git a/tensorflow/c/c_api_macros.h b/tensorflow/c/c_api_macros.h new file mode 100644 index 00000000000..85c9507db87 --- /dev/null +++ b/tensorflow/c/c_api_macros.h @@ -0,0 +1,33 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_C_API_MACROS_H_ +#define TENSORFLOW_C_C_API_MACROS_H_ + +#ifdef SWIG +#define TF_CAPI_EXPORT +#else +#if defined(_WIN32) +#ifdef TF_COMPILE_LIBRARY +#define TF_CAPI_EXPORT __declspec(dllexport) +#else +#define TF_CAPI_EXPORT __declspec(dllimport) +#endif // TF_COMPILE_LIBRARY +#else +#define TF_CAPI_EXPORT __attribute__((visibility("default"))) +#endif // _WIN32 +#endif // SWIG + +#endif // TENSORFLOW_C_C_API_MACROS_H_ diff --git a/tensorflow/c/conversion_macros.h b/tensorflow/c/conversion_macros.h new file mode 100644 index 00000000000..d1f99b7b5b0 --- /dev/null +++ b/tensorflow/c/conversion_macros.h @@ -0,0 +1,33 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_CONVERSION_MACROS_H_ +#define TENSORFLOW_C_CONVERSION_MACROS_H_ + +#define DEFINE_CONVERSION_FUNCTIONS(cpp_impl, wrapper) \ + inline cpp_impl *unwrap(wrapper *w) { \ + return reinterpret_cast(w); \ + } \ + \ + inline const cpp_impl *unwrap(const wrapper *w) { \ + return reinterpret_cast(w); \ + } \ + \ + inline wrapper *wrap(cpp_impl *i) { return reinterpret_cast(i); } \ + inline const wrapper *wrap(const cpp_impl *i) { \ + return reinterpret_cast(i); \ + } + +#endif // TENSORFLOW_C_CONVERSION_MACROS_H_ diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD index d49f679083e..eb3035cc3d7 100644 --- a/tensorflow/c/eager/BUILD +++ b/tensorflow/c/eager/BUILD @@ -35,18 +35,26 @@ tf_cuda_library( visibility = ["//visibility:public"], deps = select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], "//conditions:default": [ ":context_interface", ":operation_interface", ":tensor_handle_interface", + ":tfe_context_internal", + ":tfe_cancellation_manager_internal", + ":tfe_executor_internal", + ":tfe_monitoring_internal", + ":tfe_op_attrs_internal", + ":tfe_op_internal", + ":tfe_tensor_debug_info_internal", + ":tfe_tensorhandle_internal", "@com_google_absl//absl/algorithm:container", - "@com_google_absl//absl/container:fixed_array", "@com_google_absl//absl/types:span", "@com_google_absl//absl/types:variant", "//tensorflow/c:c_api", "//tensorflow/c:c_api_internal", + "//tensorflow/c:tf_status_internal", "//tensorflow/c:tf_tensor_internal", "//tensorflow/core:core_cpu", "//tensorflow/core/common_runtime/eager:attr_builder", @@ -100,6 +108,11 @@ filegroup( "dlpack.h", "operation_interface.h", "tensor_handle_interface.h", + "tfe_cancellation_manager_internal.h", + "tfe_executor_internal.h", + "tfe_monitoring_internal.h", + "tfe_op_attrs_internal.h", + "tfe_tensor_debug_info_internal.h", ], visibility = [ "//tensorflow/core:__pkg__", @@ -107,33 +120,27 @@ filegroup( ], ) -tf_cuda_library( +cc_library( name = "c_api_internal", - srcs = [ + hdrs = [ "c_api_experimental.h", - "c_api_unified_experimental.h", + "c_api_internal.h", ], - hdrs = ["c_api_internal.h"], visibility = [ "//learning/deepmind/courier:__subpackages__", "//tensorflow:internal", ], deps = [ ":c_api", - ":context_interface", - ":operation_interface", - ":tensor_handle_interface", - "//tensorflow/c:c_api", + ":tfe_cancellation_manager_internal", + ":tfe_context_internal", + ":tfe_executor_internal", + ":tfe_monitoring_internal", + ":tfe_op_attrs_internal", + ":tfe_op_internal", + ":tfe_tensor_debug_info_internal", + ":tfe_tensorhandle_internal", "//tensorflow/c:c_api_internal", - "//tensorflow/core:core_cpu", - "//tensorflow/core:core_cpu_lib", - "//tensorflow/core:framework", - "//tensorflow/core:framework_internal", - "//tensorflow/core:framework_lite", - "//tensorflow/core:lib", - "//tensorflow/core:lib_internal", - "//tensorflow/core/common_runtime/eager:attr_builder", - "//tensorflow/core/common_runtime/eager:eager_executor", ], ) @@ -177,13 
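The new tensorflow/c/conversion_macros.h above generates the wrap()/unwrap() pairs that the rest of this change uses to convert between opaque C handles and their C++ implementations (see the tensorflow::unwrap(ctx) calls further down, and the tfe_*_internal.h headers that invoke the macro). Below is a minimal self-contained sketch using hypothetical impl::Context/DemoContext types and assuming the TensorFlow include path is available; the real invocations live in the internal headers added by this change, not here. The new c_api_macros.h plays a similar centralizing role for the TF_CAPI_EXPORT visibility boilerplate.

```c++
#include <cassert>

#include "tensorflow/c/conversion_macros.h"

namespace impl {
struct Context {  // stand-in for a C++ implementation type (hypothetical)
  int id = 42;
};
}  // namespace impl

typedef struct DemoContext DemoContext;  // opaque handle exposed to C callers

namespace impl {
// Expands to inline unwrap(DemoContext*) -> Context* and wrap(Context*) ->
// DemoContext* (plus const overloads), all implemented as reinterpret_casts.
DEFINE_CONVERSION_FUNCTIONS(impl::Context, DemoContext)
}  // namespace impl

int main() {
  impl::Context real;
  DemoContext* handle = impl::wrap(&real);  // hand an opaque pointer to C code
  assert(impl::unwrap(handle)->id == 42);   // recover the implementation
  return 0;
}
```

Because the conversions are plain casts, the wrapper type carries no state of its own; this is what lets functions such as TFE_DeleteContext further down release the unwrapped object directly instead of deleting a separate wrapper struct.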
+184,110 @@ cc_library( ":operation_interface", ":tensor_handle_interface", "//tensorflow/c:tensor_interface", + "//tensorflow/c/experimental/saved_model/core:saved_model_api", "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", + "@com_google_absl//absl/types:optional", "@com_google_absl//absl/types:span", ], ) +cc_library( + name = "tfe_context_internal", + hdrs = ["tfe_context_internal.h"], + visibility = [ + "//tensorflow:internal", + ], + deps = [ + ":context_interface", + "//tensorflow/c:conversion_macros", + ], +) + +cc_library( + name = "tfe_cancellation_manager_internal", + hdrs = ["tfe_cancellation_manager_internal.h"], + visibility = [ + "//tensorflow:internal", + ], + deps = [ + "//tensorflow/core:framework", + ], +) + +cc_library( + name = "tfe_executor_internal", + hdrs = ["tfe_executor_internal.h"], + visibility = [ + "//tensorflow:internal", + ], + deps = [ + "//tensorflow/core/common_runtime/eager:eager_executor", + ], +) + +cc_library( + name = "tfe_monitoring_internal", + hdrs = ["tfe_monitoring_internal.h"], + visibility = [ + "//tensorflow:internal", + ], + deps = [ + "//tensorflow/core:lib", + "@com_google_absl//absl/memory", + ], +) + +cc_library( + name = "tfe_op_attrs_internal", + hdrs = ["tfe_op_attrs_internal.h"], + visibility = [ + "//tensorflow:internal", + ], + deps = [ + "//tensorflow/c:conversion_macros", + "//tensorflow/c:tf_status", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/common_runtime/eager:attr_builder", + ], +) + +cc_library( + name = "tfe_op_internal", + hdrs = ["tfe_op_internal.h"], + visibility = [ + "//tensorflow:internal", + ], + deps = [ + ":operation_interface", + "//tensorflow/c:conversion_macros", + ], +) + +cc_library( + name = "tfe_tensor_debug_info_internal", + hdrs = ["tfe_tensor_debug_info_internal.h"], + visibility = [ + "//tensorflow:internal", + ], + deps = [ + "//tensorflow/core:lib", + ], +) + +cc_library( + name = "tfe_tensorhandle_internal", + hdrs = ["tfe_tensorhandle_internal.h"], + visibility = [ + "//tensorflow:internal", + ], + deps = [ + ":tensor_handle_interface", + "//tensorflow/c:conversion_macros", + ], +) + tf_cuda_library( name = "c_api_test_util", testonly = 1, @@ -213,7 +317,9 @@ tf_cuda_cc_test( ], extra_copts = tfe_xla_copts(), tags = [ - "guitar", + "noguitar", # TODO(b/155445984): flaky + #"guitar", + "notap", # TODO(b/156981931): flaky "multi_gpu", ], deps = [ @@ -221,6 +327,8 @@ tf_cuda_cc_test( ":c_api_experimental", ":c_api_internal", ":c_api_test_util", + ":tfe_op_internal", + ":tfe_tensorhandle_internal", "//tensorflow/c:c_test_util", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", @@ -239,17 +347,49 @@ tf_cuda_cc_test( srcs = [ "c_api_remote_test.cc", ], + # TODO(b/136478427): Figure out how to correctly shut the server down + args = ["--heap_check=local"], extra_copts = tfe_xla_copts(), tags = [ - "guitar", - "multi_gpu", - "no_oss", + "noasan", # leaks gRPC server instances + "notsan", # b/157098283 ], deps = [ ":c_api", ":c_api_experimental", ":c_api_internal", ":c_api_test_util", + ":tfe_tensorhandle_internal", + "//tensorflow/c:c_test_util", + "//tensorflow/core:framework", + "//tensorflow/core:graph", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core/common_runtime:function_optimization_registry", + "//tensorflow/core/common_runtime/eager:eager_operation", + "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib", + 
"@com_google_absl//absl/strings", + ], +) + +tf_cuda_cc_test( + name = "c_api_cluster_test", + size = "small", + srcs = [ + "c_api_cluster_test.cc", + ], + # TODO(b/136478427): Figure out how to correctly shut the server down + args = ["--heap_check=local"], + extra_copts = tfe_xla_copts(), + tags = ["noasan"], # leaks gRPC server instances + deps = [ + ":c_api", + ":c_api_experimental", + ":c_api_internal", + ":c_api_test_util", + ":tfe_tensorhandle_internal", "//tensorflow/c:c_test_util", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", @@ -257,6 +397,7 @@ tf_cuda_cc_test( "//tensorflow/core:test_main", "//tensorflow/core/common_runtime/eager:eager_operation", "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib", + "//tensorflow/core/platform:env", "@com_google_absl//absl/strings", ], ) @@ -266,6 +407,9 @@ tf_cuda_library( srcs = [ "c_api_experimental.cc", "c_api_unified_experimental.cc", + "c_api_unified_experimental_eager.cc", + "c_api_unified_experimental_graph.cc", + "c_api_unified_experimental_internal.h", ], hdrs = [ "c_api_experimental.h", @@ -275,11 +419,14 @@ tf_cuda_library( visibility = ["//visibility:public"], deps = select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], "//conditions:default": [ ":c_api", ":c_api_internal", + ":tfe_context_internal", + ":tfe_op_internal", + ":tfe_tensorhandle_internal", "//tensorflow/c:c_api", "//tensorflow/c:c_api_internal", "//tensorflow/core:core_cpu", @@ -308,6 +455,8 @@ tf_cuda_library( "//conditions:default": [], }) + [ "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/container:flat_hash_map", "//tensorflow/c:tf_status_helper", "//tensorflow/core/distributed_runtime/eager:eager_client", "//tensorflow/core/distributed_runtime/rpc/eager:grpc_eager_client", @@ -362,6 +511,7 @@ tf_cuda_cc_test( ":c_api", ":c_api_experimental", ":c_api_test_util", + "//tensorflow/c:c_api", "//tensorflow/c:c_test_util", "//tensorflow/cc/profiler", "//tensorflow/core:lib", @@ -443,8 +593,9 @@ cc_library( deps = [ ":c_api", ":c_api_experimental", - ":c_api_internal", + ":tfe_tensorhandle_internal", "//tensorflow/c:tf_status_helper", + "//tensorflow/c:tf_status_internal", "//tensorflow/core:framework", "//tensorflow/core:framework_internal", "//tensorflow/core:lib", @@ -466,7 +617,6 @@ filegroup( ], exclude = [ "c_api_experimental.cc", - "*c_api_tfrt*", "*test*", "*dlpack*", ], diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc index b34d1026e08..912cd184b77 100644 --- a/tensorflow/c/eager/c_api.cc +++ b/tensorflow/c/eager/c_api.cc @@ -26,7 +26,6 @@ limitations under the License. // clang-format on #include "absl/algorithm/container.h" -#include "absl/container/fixed_array.h" #include "absl/memory/memory.h" #include "tensorflow/c/c_api.h" #include "tensorflow/c/c_api_internal.h" @@ -34,9 +33,12 @@ limitations under the License. 
#include "tensorflow/c/eager/c_api_internal.h" #include "tensorflow/c/eager/operation_interface.h" #include "tensorflow/c/eager/tensor_handle_interface.h" +#include "tensorflow/c/eager/tfe_context_internal.h" +#include "tensorflow/c/eager/tfe_op_internal.h" +#include "tensorflow/c/eager/tfe_tensorhandle_internal.h" #include "tensorflow/c/tf_tensor_internal.h" #ifdef PLATFORM_GOOGLE -#include "tensorflow/c/eager/c_api_tfrt.h" +#include "tensorflow/core/tfrt/eager/c_api_tfrt.h" #endif #include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/common_runtime/eager/context.h" @@ -298,7 +300,7 @@ tensorflow::Status CreateRemoteContexts( std::vector filtered_device_mask; tensorflow::EagerContext* context = - tensorflow::ContextFromInterface(ctx->context); + tensorflow::ContextFromInterface(tensorflow::unwrap(ctx)); context->FilterDevicesForRemoteWorkers( remote_worker, base_request.cluster_device_attributes(), &filtered_device_mask); @@ -383,7 +385,7 @@ tensorflow::Status UpdateRemoteContexts( std::vector filtered_device_mask; tensorflow::EagerContext* context = - tensorflow::ContextFromInterface(ctx->context); + tensorflow::ContextFromInterface(tensorflow::unwrap(ctx)); context->FilterDevicesForRemoteWorkers( remote_worker, base_request.cluster_device_attributes(), &filtered_device_mask); @@ -464,7 +466,7 @@ tensorflow::Status UpdateTFE_ContextWithServerDef( // New server created for new server_def. Unused if updating server_def. std::unique_ptr new_server; tensorflow::EagerContext* context = - tensorflow::ContextFromInterface(ctx->context); + tensorflow::ContextFromInterface(tensorflow::unwrap(ctx)); tensorflow::GrpcServer* grpc_server; if (reset_context) { LOG_AND_RETURN_IF_ERROR(tensorflow::NewServer(server_def, &new_server)); @@ -498,6 +500,17 @@ tensorflow::Status UpdateTFE_ContextWithServerDef( grpc_server->master_env()->worker_cache->GetEagerClientCache( &remote_eager_workers)); + // For cluster update, use a status group to aggregate statuses from + // * adding and removing remote devices + // * creating remote contexts on newly added workers + // * updating remote contexts on existing workers + // * updating the master context + // Note that we should not return immediately on errors in the middle of these + // updates to prevent cluster from having inconsistent context views. + // + // Unused if `reset_context` is True. 
+ tensorflow::StatusGroup sg; + // When updating an existing context, populate the following lists with: // * added_workers: set(remote_workers) - set(curr_remote_workers) // * removed_workers: set(curr_remote_workers) - set(remote_workers) @@ -533,7 +546,7 @@ tensorflow::Status UpdateTFE_ContextWithServerDef( DifferentiateWorkerLists(&curr_remote_workers, &remote_workers, &added_workers, &removed_workers, &existing_workers); - LOG_AND_RETURN_IF_ERROR(GetReplacedFromExistingWorkers( + sg.Update(GetReplacedFromExistingWorkers( &existing_workers, context_id, context->GetContextViewId(), server_def, remote_eager_workers.get(), &replaced_workers)); if (VLOG_IS_ON(1)) { @@ -557,11 +570,10 @@ tensorflow::Status UpdateTFE_ContextWithServerDef( existing_workers.end()); } } - LOG_AND_RETURN_IF_ERROR( - RemoveRemoteDevicesFromMgr(removed_workers, remote_device_mgr)); - LOG_AND_RETURN_IF_ERROR(AddRemoteDevicesToMgr( - added_workers, grpc_server->master_env()->worker_cache, - remote_device_mgr)); + sg.Update(RemoveRemoteDevicesFromMgr(removed_workers, remote_device_mgr)); + sg.Update(AddRemoteDevicesToMgr(added_workers, + grpc_server->master_env()->worker_cache, + remote_device_mgr)); } std::vector cluster_device_attributes; @@ -582,7 +594,6 @@ tensorflow::Status UpdateTFE_ContextWithServerDef( } // Initialize remote eager workers. - // TODO(b/138847548) Create remote eager contexts in async mode by default. if (reset_context) { LOG_AND_RETURN_IF_ERROR(CreateRemoteContexts( ctx, remote_workers, context_id, context_view_id, keep_alive_secs, @@ -594,7 +605,7 @@ tensorflow::Status UpdateTFE_ContextWithServerDef( // existing workers to also have the updated context_view_id, so // we must set their context_view_id to the existing master's // context_view_id + 1. - LOG_AND_RETURN_IF_ERROR(CreateRemoteContexts( + sg.Update(CreateRemoteContexts( ctx, added_workers, context_id, context_view_id + 1, keep_alive_secs, server_def, remote_eager_workers.get(), context->Executor().Async(), context->LazyCopyFunctionRemoteInputs(), base_request)); @@ -604,20 +615,19 @@ tensorflow::Status UpdateTFE_ContextWithServerDef( VLOG(1) << "Updating cluster with existing worker " << w; } } - LOG_AND_RETURN_IF_ERROR(UpdateRemoteContexts( - ctx, existing_workers, added_workers, removed_workers, context_id, - context_view_id + 1, server_def, remote_eager_workers.get(), - base_request)); + sg.Update(UpdateRemoteContexts(ctx, existing_workers, added_workers, + removed_workers, context_id, + context_view_id + 1, server_def, + remote_eager_workers.get(), base_request)); } } - tensorflow::RemoteRendezvous* r = - grpc_server->worker_env()->rendezvous_mgr->Find(context_id); auto session_name = tensorflow::strings::StrCat("eager_", context_id); - auto* device_mgr = grpc_server->worker_env()->device_mgr; - std::shared_ptr worker_session; - if (reset_context) { + tensorflow::RemoteRendezvous* r = + grpc_server->worker_env()->rendezvous_mgr->Find(context_id); + auto* device_mgr = grpc_server->worker_env()->device_mgr; + std::shared_ptr worker_session; TF_RETURN_IF_ERROR(grpc_server->worker_env()->session_mgr->CreateSession( session_name, server_def, base_request.cluster_device_attributes(), true)); @@ -644,13 +654,13 @@ tensorflow::Status UpdateTFE_ContextWithServerDef( // GrpcServer cannot be destroyed after it is started. 
LOG_AND_RETURN_IF_ERROR(grpc_server->Start()); } else { - LOG_AND_RETURN_IF_ERROR( - grpc_server->worker_env()->session_mgr->UpdateSession( - session_name, server_def, base_request.cluster_device_attributes(), - true)); - LOG_AND_RETURN_IF_ERROR(context->UpdateRemoteMaster( - grpc_server->worker_env(), std::move(remote_eager_workers), - added_workers, removed_workers, context_id, r)); + sg.Update(grpc_server->worker_env()->session_mgr->UpdateSession( + session_name, server_def, base_request.cluster_device_attributes(), + /*isolate_session_state=*/true)); + sg.Update(context->UpdateRemoteMaster(context_id, + std::move(remote_eager_workers), + added_workers, removed_workers)); + LOG_AND_RETURN_IF_ERROR(sg.as_summary_status()); } #undef LOG_AND_RETURN_IF_ERROR @@ -684,8 +694,13 @@ void TFE_DeleteContextOptions(TFE_ContextOptions* options) { delete options; } TFE_Context* TFE_NewContext(const TFE_ContextOptions* opts, TF_Status* status) { if (opts->use_tfrt) { #ifdef PLATFORM_GOOGLE - status->status = tensorflow::Status::OK(); - return new TFE_Context{new tfrt::ContextInterface()}; + tfrt::SmallVector op_handler_chains; + tfrt::SmallVector device_attributes; + status->status = tfrt::ListOpHandlerChains( + opts->session_options.options, &op_handler_chains, &device_attributes); + if (!status->status.ok()) return nullptr; + return tensorflow::wrap( + new tfrt::ContextInterface(op_handler_chains, device_attributes)); #else status->status = tensorflow::errors::Unimplemented("TFRT is not supported"); return nullptr; @@ -702,32 +717,14 @@ TFE_Context* TFE_NewContext(const TFE_ContextOptions* opts, TF_Status* status) { tensorflow::Rendezvous* r = new tensorflow::IntraProcessRendezvous(device_mgr.get()); - return new TFE_Context{new tensorflow::EagerContext( + return tensorflow::wrap(new tensorflow::EagerContext( opts->session_options.options, static_cast( opts->device_placement_policy), static_cast(opts->mirroring_policy), opts->async, opts->lazy_remote_inputs_copy, device_mgr.release(), /*device_mgr_owned*/ true, r, - tensorflow::GetDefaultCustomKernelCreator())}; -} - -TFE_Context* TFE_NewContextFromSession(const TFE_ContextOptions* opts, - TF_Session* sess, TF_Status* status) { - const tensorflow::DeviceMgr* device_mgr = nullptr; - status->status = sess->session->LocalDeviceManager(&device_mgr); - if (!status->status.ok()) return nullptr; - tensorflow::Rendezvous* r = - new tensorflow::IntraProcessRendezvous(device_mgr); - - return new TFE_Context{new tensorflow::EagerContext( - opts->session_options.options, - static_cast( - opts->device_placement_policy), - static_cast(opts->mirroring_policy), - opts->async, opts->lazy_remote_inputs_copy, device_mgr, - /*device_mgr_owned*/ false, r, - tensorflow::GetDefaultCustomKernelCreator())}; + tensorflow::GetDefaultCustomKernelCreator())); } void TFE_DeleteContext(TFE_Context* ctx) { @@ -735,23 +732,18 @@ void TFE_DeleteContext(TFE_Context* ctx) { return; } - // context->RefCountIsOne() should be true here. - // TODO(iga): Remove EagerContext refcounting. - ctx->context->Release(); - - delete ctx; + // ctx->RefCountIsOne() should be true here. 
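TFE_NewContext and TFE_DeleteContext in this hunk now return and release the context via tensorflow::wrap/tensorflow::unwrap rather than allocating a separate TFE_Context struct, but the public C API surface is unchanged. A short usage sketch of that surface from C++ (error handling kept minimal; this is illustrative and not part of the change):

```c++
#include <cstdio>

#include "tensorflow/c/c_api.h"
#include "tensorflow/c/eager/c_api.h"

int main() {
  TF_Status* status = TF_NewStatus();
  TFE_ContextOptions* opts = TFE_NewContextOptions();
  TFE_Context* ctx = TFE_NewContext(opts, status);
  TFE_DeleteContextOptions(opts);
  if (TF_GetCode(status) != TF_OK) {
    std::fprintf(stderr, "context creation failed: %s\n", TF_Message(status));
    TF_DeleteStatus(status);
    return 1;
  }

  // List the devices the eager context can see (at least the host CPU).
  TF_DeviceList* devices = TFE_ContextListDevices(ctx, status);
  if (TF_GetCode(status) == TF_OK) {
    for (int i = 0; i < TF_DeviceListCount(devices); ++i) {
      std::printf("device: %s\n", TF_DeviceListName(devices, i, status));
    }
  }
  TF_DeleteDeviceList(devices);

  TFE_DeleteContext(ctx);  // releases the underlying context object
  TF_DeleteStatus(status);
  return 0;
}
```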
+ tensorflow::unwrap(ctx)->Release(); } TF_DeviceList* TFE_ContextListDevices(TFE_Context* ctx, TF_Status* status) { TF_DeviceList* l = new TF_DeviceList; - ctx->context->ListDevices(&l->response); + tensorflow::unwrap(ctx)->ListDevices(&l->response); return l; } void TFE_ContextClearCaches(TFE_Context* ctx) { - tensorflow::EagerContext* context = - tensorflow::ContextFromInterface(ctx->context); - context->ClearCachesAndThreadExecutors(); + tensorflow::unwrap(ctx)->ClearCachesAndThreadExecutors(); } // Set server_def on the context, possibly updating it. @@ -773,7 +765,7 @@ TF_CAPI_EXPORT extern void TFE_ContextSetServerDef(TFE_Context* ctx, if (server_def.has_cluster_device_filters()) { const auto& cdf = server_def.cluster_device_filters(); for (const auto& jdf : cdf.jobs()) { - const string& remote_prefix = "/job:" + jdf.name() + "/task:"; + const string remote_prefix = "/job:" + jdf.name() + "/task:"; for (const auto& tdf : jdf.tasks()) { const int32_t task_index = tdf.first; std::vector device_filters(tdf.second.device_filters_size()); @@ -782,7 +774,7 @@ TF_CAPI_EXPORT extern void TFE_ContextSetServerDef(TFE_Context* ctx, } const string remote_worker = remote_prefix + std::to_string(task_index); tensorflow::EagerContext* context = - tensorflow::ContextFromInterface(ctx->context); + tensorflow::ContextFromInterface(tensorflow::unwrap(ctx)); status->status = context->SetRemoteDeviceFilters(remote_worker, device_filters); } @@ -804,7 +796,7 @@ TF_CAPI_EXPORT extern void TFE_ContextUpdateServerDef(TFE_Context* ctx, #else // !defined(IS_MOBILE_PLATFORM) tensorflow::ServerDef server_def; tensorflow::EagerContext* context = - tensorflow::ContextFromInterface(ctx->context); + tensorflow::ContextFromInterface(tensorflow::unwrap(ctx)); if (!server_def.ParseFromArray(proto, proto_len)) { status->status = tensorflow::errors::InvalidArgument( "Invalid tensorflow.ServerDef protocol buffer"); @@ -834,7 +826,7 @@ TF_CAPI_EXPORT extern bool TFE_ContextCheckAlive(TFE_Context* ctx, return false; #else // !defined(IS_MOBILE_PLATFORM) tensorflow::EagerContext* context = - tensorflow::ContextFromInterface(ctx->context); + tensorflow::ContextFromInterface(tensorflow::unwrap(ctx)); tensorflow::GrpcServer* grpc_server = static_cast(context->GetServer()); @@ -889,16 +881,14 @@ TF_CAPI_EXPORT extern void TFE_ContextAsyncWait(TFE_Context* ctx, #if defined(IS_MOBILE_PLATFORM) status->status = tensorflow::Status::OK(); #else // !defined(IS_MOBILE_PLATFORM) - tensorflow::EagerContext* context = - tensorflow::ContextFromInterface(ctx->context); - status->status = context->SyncExecutors(); + status->status = tensorflow::unwrap(ctx)->AsyncWait(); #endif // !IS_MOBILE_PLATFORM } void TFE_ContextSetThreadLocalDevicePlacementPolicy( TFE_Context* ctx, TFE_ContextDevicePlacementPolicy policy) { tensorflow::EagerContext* context = - tensorflow::ContextFromInterface(ctx->context); + tensorflow::ContextFromInterface(tensorflow::unwrap(ctx)); context->SetThreadLocalDevicePlacementPolicy( static_cast(policy)); } @@ -909,18 +899,17 @@ void TFE_ContextSetThreadLocalDevicePlacementPolicy( extern TFE_ContextDevicePlacementPolicy TFE_ContextGetDevicePlacementPolicy( TFE_Context* ctx) { tensorflow::EagerContext* context = - tensorflow::ContextFromInterface(ctx->context); + tensorflow::ContextFromInterface(tensorflow::unwrap(ctx)); return static_cast( context->GetDevicePlacementPolicy()); } -TFE_TensorHandle* TFE_NewTensorHandle(TF_Tensor* t, TF_Status* status) { +TFE_TensorHandle* TFE_NewTensorHandle(const TF_Tensor* t, TF_Status* 
status) { tensorflow::Tensor tensor; status->status = tensorflow::TF_TensorToTensor(t, &tensor); if (!status->status.ok()) return nullptr; - return new TFE_TensorHandle{ - tensorflow::TensorHandle::CreateLocalHandle(tensor)}; + return tensorflow::wrap(tensorflow::TensorHandle::CreateLocalHandle(tensor)); } void TFE_DeleteTensorHandle(TFE_TensorHandle* h) { @@ -928,84 +917,84 @@ void TFE_DeleteTensorHandle(TFE_TensorHandle* h) { tensorflow::profiler::TraceMe activity( "TFE_DeleteTensorHandle", tensorflow::profiler::TraceMeLevel::kInfo); - if (h->handle) { - h->handle->Release(); + if (h) { + tensorflow::unwrap(h)->Release(); } - delete h; } TF_DataType TFE_TensorHandleDataType(TFE_TensorHandle* h) { - return static_cast(h->handle->DataType()); + return static_cast(tensorflow::unwrap(h)->DataType()); } int TFE_TensorHandleNumDims(TFE_TensorHandle* h, TF_Status* status) { - if (h == nullptr || h->handle == nullptr) { + if (h == nullptr) { status->status = tensorflow::errors::InvalidArgument("Invalid handle"); return -1; } int num_dims = -1; - status->status = h->handle->NumDims(&num_dims); + status->status = tensorflow::unwrap(h)->NumDims(&num_dims); return num_dims; } int64_t TFE_TensorHandleNumElements(TFE_TensorHandle* h, TF_Status* status) { - if (h == nullptr || h->handle == nullptr) { + if (h == nullptr) { status->status = tensorflow::errors::InvalidArgument("Invalid handle"); return -1; } int64 num_elements = -1; - status->status = h->handle->NumElements(&num_elements); + status->status = tensorflow::unwrap(h)->NumElements(&num_elements); return num_elements; } int64_t TFE_TensorHandleDim(TFE_TensorHandle* h, int dim_index, TF_Status* status) { - if (h == nullptr || h->handle == nullptr) { + if (h == nullptr) { status->status = tensorflow::errors::InvalidArgument("Invalid handle"); return -1; } int64 dim = -1; - status->status = h->handle->Dim(dim_index, &dim); + status->status = tensorflow::unwrap(h)->Dim(dim_index, &dim); return dim; } const char* TFE_TensorHandleDeviceName(TFE_TensorHandle* h, TF_Status* status) { - if (h == nullptr || h->handle == nullptr) { + if (h == nullptr) { status->status = tensorflow::errors::InvalidArgument("Invalid handle"); return nullptr; } - return h->handle->DeviceName(&status->status); + return tensorflow::unwrap(h)->DeviceName(&status->status); } const char* TFE_TensorHandleBackingDeviceName(TFE_TensorHandle* h, TF_Status* status) { - if (h == nullptr || h->handle == nullptr) { + if (h == nullptr) { status->status = tensorflow::errors::InvalidArgument("Invalid handle"); return nullptr; } - return h->handle->BackingDeviceName(&status->status); + return tensorflow::unwrap(h)->BackingDeviceName(&status->status); } TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_TensorHandleCopySharingTensor( TFE_TensorHandle* h, TF_Status* status) { - if (h == nullptr || h->handle == nullptr) { + if (h == nullptr) { status->status = tensorflow::errors::InvalidArgument("Invalid handle"); return nullptr; } - return new TFE_TensorHandle{h->handle->Copy()}; + return tensorflow::wrap(tensorflow::unwrap(h)->Copy()); } TF_Tensor* TFE_TensorHandleResolve(TFE_TensorHandle* h, TF_Status* status) { - if (h == nullptr || h->handle == nullptr) { + if (h == nullptr) { status->status = tensorflow::errors::InvalidArgument("Invalid handle"); return nullptr; } - tensorflow::AbstractTensorInterface* t = h->handle->Resolve(&status->status); + tensorflow::AbstractTensorInterface* t = + tensorflow::unwrap(h)->Resolve(&status->status); if (t == nullptr) { return nullptr; } @@ -1014,22 +1003,22 @@ 
TF_Tensor* TFE_TensorHandleResolve(TFE_TensorHandle* h, TF_Status* status) { } void* TFE_TensorHandleDevicePointer(TFE_TensorHandle* h, TF_Status* status) { - if (h == nullptr || h->handle == nullptr) { + if (h == nullptr) { status->status = tensorflow::errors::InvalidArgument("Invalid handle"); return nullptr; } tensorflow::TensorHandle* handle = - tensorflow::TensorHandleFromInterface(h->handle); + tensorflow::TensorHandleFromInterface(tensorflow::unwrap(h)); if (VariantDeviceIsCustom(handle->device())) { const tensorflow::Tensor* t; status->status = handle->Tensor(&t); return t->data(); } - if (handle->IsRemote()) { + if (handle->Type() != tensorflow::TensorHandle::LOCAL) { status->status = tensorflow::errors::InvalidArgument( - "TFE_TensorHandleDevicePointer may not be called on a remote tensor " - "handle."); + "TFE_TensorHandleDevicePointer may not be called on a ", + handle->TypeString(), " tensor handle."); return nullptr; } tensorflow::Device* device(absl::get(handle->device())); @@ -1055,7 +1044,7 @@ TFE_TensorHandle* TFE_NewTensorHandleFromDeviceMemory( void* deallocator_arg, TF_Status* status) { tensorflow::Device* device = nullptr; tensorflow::EagerContext* context = - tensorflow::ContextFromInterface(ctx->context); + tensorflow::ContextFromInterface(tensorflow::unwrap(ctx)); status->status = context->FindDeviceFromName(device_name, &device); tensorflow::CustomDevice* custom_device = nullptr; if (!status->status.ok()) { @@ -1081,11 +1070,11 @@ TFE_TensorHandle* TFE_NewTensorHandleFromDeviceMemory( tensorflow::TensorShape(dimvec), buf); buf->Unref(); if (custom_device == nullptr) { - return new TFE_TensorHandle{tensorflow::TensorHandle::CreateLocalHandle( - std::move(t), device, device, context)}; + return tensorflow::wrap(tensorflow::TensorHandle::CreateLocalHandle( + std::move(t), device, device, context)); } else { - return new TFE_TensorHandle{tensorflow::TensorHandle::CreateLocalHandle( - std::move(t), custom_device, context)}; + return tensorflow::wrap(tensorflow::TensorHandle::CreateLocalHandle( + std::move(t), custom_device, context)); } } @@ -1094,16 +1083,16 @@ TFE_TensorHandle* TFE_NewTensorHandleFromDeviceMemory( // bytes of the memory pointed to by the device pointer returned above. 
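The hunks around here change TFE_TensorHandleDevicePointer and TFE_TensorHandleDeviceMemorySize (the latter continues just below) to reject any non-local handle by its type rather than only remote ones. For orientation, a hedged usage sketch of those two calls on a local CPU handle, assuming they are declared in tensorflow/c/eager/c_api_experimental.h at this revision:

```c++
#include <cstdint>
#include <cstdio>

#include "tensorflow/c/c_api.h"
#include "tensorflow/c/eager/c_api.h"
#include "tensorflow/c/eager/c_api_experimental.h"

int main() {
  TF_Status* status = TF_NewStatus();
  TFE_ContextOptions* opts = TFE_NewContextOptions();
  TFE_Context* ctx = TFE_NewContext(opts, status);
  TFE_DeleteContextOptions(opts);

  // Build a small float tensor and wrap it in a local eager handle.
  const int64_t dims[2] = {2, 2};
  TF_Tensor* t = TF_AllocateTensor(TF_FLOAT, dims, 2, 4 * sizeof(float));
  float* data = static_cast<float*>(TF_TensorData(t));
  for (int i = 0; i < 4; ++i) data[i] = static_cast<float>(i);
  TFE_TensorHandle* h = TFE_NewTensorHandle(t, status);

  // For a LOCAL handle these report the raw buffer and its size; for remote
  // (and now any other non-local) handles they return InvalidArgument.
  void* ptr = TFE_TensorHandleDevicePointer(h, status);
  size_t bytes = TFE_TensorHandleDeviceMemorySize(h, status);
  if (TF_GetCode(status) == TF_OK) {
    std::printf("buffer %p holds %zu bytes\n", ptr, bytes);
  }

  TFE_DeleteTensorHandle(h);
  TF_DeleteTensor(t);
  TFE_DeleteContext(ctx);
  TF_DeleteStatus(status);
  return 0;
}
```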
size_t TFE_TensorHandleDeviceMemorySize(TFE_TensorHandle* h, TF_Status* status) { - if (h == nullptr || h->handle == nullptr) { + if (h == nullptr) { status->status = tensorflow::errors::InvalidArgument("Invalid handle"); return 0; } tensorflow::TensorHandle* handle = - tensorflow::TensorHandleFromInterface(h->handle); - if (handle->IsRemote()) { + tensorflow::TensorHandleFromInterface(tensorflow::unwrap(h)); + if (handle->Type() != tensorflow::TensorHandle::LOCAL) { status->status = tensorflow::errors::InvalidArgument( - "TFE_TensorHandleDeviceMemorySize may not be called on a remote tensor " - "handle."); + "TFE_TensorHandleDeviceMemorySize may not be called on a ", + handle->TypeString(), " tensor handle."); return 0; } const tensorflow::Tensor* tensor; @@ -1116,12 +1105,14 @@ size_t TFE_TensorHandleDeviceMemorySize(TFE_TensorHandle* h, TFE_Op* TFE_NewOp(TFE_Context* ctx, const char* op_or_function_name, TF_Status* status) { - std::unique_ptr new_op(new TFE_Op{ctx->context->CreateOperation()}); - status->status = new_op->operation->Reset(op_or_function_name, nullptr); + tensorflow::AbstractOperationInterface* new_op = + tensorflow::unwrap(ctx)->CreateOperation(); + status->status = new_op->Reset(op_or_function_name, nullptr); if (!status->status.ok()) { - new_op.reset(); + new_op->Release(); + new_op = nullptr; } - return new_op.release(); + return tensorflow::wrap(new_op); } void TFE_DeleteOp(TFE_Op* op) { @@ -1129,24 +1120,20 @@ void TFE_DeleteOp(TFE_Op* op) { return; } - if (op->operation) { - op->operation->Release(); - } - - delete op; + tensorflow::unwrap(op)->Release(); } void TFE_OpSetDevice(TFE_Op* op, const char* device_name, TF_Status* status) { - status->status = op->operation->SetDeviceName(device_name); + status->status = tensorflow::unwrap(op)->SetDeviceName(device_name); } const char* TFE_OpGetDevice(TFE_Op* op, TF_Status* status) { - return op->operation->DeviceName().c_str(); + return tensorflow::unwrap(op)->DeviceName().c_str(); } void TFE_OpSetXLACompilation(TFE_Op* op, unsigned char enable) { #ifdef TENSORFLOW_EAGER_USE_XLA - tensorflow::Status s = op->operation->SetUseXla(enable); + tensorflow::Status s = tensorflow::unwrap(op)->SetUseXla(enable); if (!s.ok()) { LOG(ERROR) << "Could not enable XLA compilation for op: " << s; } @@ -1157,18 +1144,13 @@ void TFE_OpSetXLACompilation(TFE_Op* op, unsigned char enable) { } void TFE_OpAddInput(TFE_Op* op, TFE_TensorHandle* input, TF_Status* status) { - status->status = op->operation->AddInput(input->handle); + status->status = tensorflow::unwrap(op)->AddInput(tensorflow::unwrap(input)); } void TFE_OpAddInputList(TFE_Op* op, TFE_TensorHandle** inputs, int num_inputs, TF_Status* status) { - absl::FixedArray handles( - num_inputs); - for (int i = 0; i < num_inputs; ++i) { - handles[i] = inputs[i]->handle; - } - status->status = - op->operation->AddInputList({handles.data(), handles.size()}); + status->status = tensorflow::unwrap(op)->AddInputList( + {tensorflow::unwrap(inputs), static_cast(num_inputs)}); } TF_AttrType TFE_OpGetAttrType(TFE_Op* op, const char* attr_name, @@ -1176,8 +1158,8 @@ TF_AttrType TFE_OpGetAttrType(TFE_Op* op, const char* attr_name, TF_AttrType ret = TF_ATTR_INT; const tensorflow::AttrTypeMap* attr_types_; bool is_function; - status->status = tensorflow::AttrTypeMapForOp(op->operation->Name().c_str(), - &attr_types_, &is_function); + status->status = tensorflow::AttrTypeMapForOp( + tensorflow::unwrap(op)->Name().c_str(), &attr_types_, &is_function); if (!status->status.ok()) { return ret; } @@ -1203,7 
+1185,7 @@ TF_AttrType TFE_OpNameGetAttrType(TFE_Context* ctx, void TFE_OpSetAttrString(TFE_Op* op, const char* attr_name, const void* value, size_t length) { - auto s = op->operation->SetAttrString( + auto s = tensorflow::unwrap(op)->SetAttrString( attr_name, static_cast(value), length); if (!s.ok()) { LOG(WARNING) << "Unable to set attribute: " << attr_name; @@ -1211,29 +1193,30 @@ void TFE_OpSetAttrString(TFE_Op* op, const char* attr_name, const void* value, } void TFE_OpSetAttrInt(TFE_Op* op, const char* attr_name, int64_t value) { - auto s = op->operation->SetAttrInt(attr_name, value); + auto s = tensorflow::unwrap(op)->SetAttrInt(attr_name, value); if (!s.ok()) { LOG(WARNING) << "Unable to set attribute: " << attr_name; } } void TFE_OpSetAttrFloat(TFE_Op* op, const char* attr_name, float value) { - auto s = op->operation->SetAttrFloat(attr_name, value); + auto s = tensorflow::unwrap(op)->SetAttrFloat(attr_name, value); if (!s.ok()) { LOG(WARNING) << "Unable to set attribute: " << attr_name; } } void TFE_OpSetAttrBool(TFE_Op* op, const char* attr_name, unsigned char value) { - auto s = op->operation->SetAttrBool(attr_name, (value == 0) ? false : true); + auto s = tensorflow::unwrap(op)->SetAttrBool(attr_name, + (value == 0) ? false : true); if (!s.ok()) { LOG(WARNING) << "Unable to set attribute: " << attr_name; } } void TFE_OpSetAttrType(TFE_Op* op, const char* attr_name, TF_DataType value) { - auto s = op->operation->SetAttrType(attr_name, - static_cast(value)); + auto s = tensorflow::unwrap(op)->SetAttrType( + attr_name, static_cast(value)); if (!s.ok()) { LOG(WARNING) << "Unable to set attribute: " << attr_name; } @@ -1241,12 +1224,14 @@ void TFE_OpSetAttrType(TFE_Op* op, const char* attr_name, TF_DataType value) { void TFE_OpSetAttrShape(TFE_Op* op, const char* attr_name, const int64_t* dims, const int num_dims, TF_Status* out_status) { - out_status->status = op->operation->SetAttrShape(attr_name, dims, num_dims); + out_status->status = + tensorflow::unwrap(op)->SetAttrShape(attr_name, dims, num_dims); } void TFE_OpSetAttrFunction(TFE_Op* op, const char* attr_name, const TFE_Op* value) { - auto s = op->operation->SetAttrFunction(attr_name, value->operation); + auto s = tensorflow::unwrap(op)->SetAttrFunction( + attr_name, tensorflow::unwrap(const_cast(value))); if (!s.ok()) { LOG(WARNING) << "Unable to set attribute: " << attr_name; } @@ -1254,7 +1239,7 @@ void TFE_OpSetAttrFunction(TFE_Op* op, const char* attr_name, void TFE_OpSetAttrFunctionName(TFE_Op* op, const char* attr_name, const char* data, size_t length) { - auto s = op->operation->SetAttrFunctionName(attr_name, data, length); + auto s = tensorflow::unwrap(op)->SetAttrFunctionName(attr_name, data, length); if (!s.ok()) { LOG(WARNING) << "Unable to set attribute: " << attr_name; } @@ -1265,14 +1250,14 @@ void TFE_OpSetAttrTensor(TFE_Op* op, const char* attr_name, TF_Tensor* tensor, tensorflow::Tensor t; status->status = TF_TensorToTensor(tensor, &t); tensorflow::TensorInterface interface(t); - status->status = op->operation->SetAttrTensor(attr_name, &interface); + status->status = tensorflow::unwrap(op)->SetAttrTensor(attr_name, &interface); } void TFE_OpSetAttrStringList(TFE_Op* op, const char* attr_name, const void* const* values, const size_t* lengths, int num_values) { - auto s = - op->operation->SetAttrStringList(attr_name, values, lengths, num_values); + auto s = tensorflow::unwrap(op)->SetAttrStringList(attr_name, values, lengths, + num_values); if (!s.ok()) { LOG(WARNING) << "Unable to set attribute: " << 
attr_name; } @@ -1280,7 +1265,8 @@ void TFE_OpSetAttrStringList(TFE_Op* op, const char* attr_name, void TFE_OpSetAttrFloatList(TFE_Op* op, const char* attr_name, const float* values, int num_values) { - auto s = op->operation->SetAttrFloatList(attr_name, values, num_values); + auto s = + tensorflow::unwrap(op)->SetAttrFloatList(attr_name, values, num_values); if (!s.ok()) { LOG(WARNING) << "Unable to set attribute: " << attr_name; } @@ -1288,7 +1274,8 @@ void TFE_OpSetAttrFloatList(TFE_Op* op, const char* attr_name, void TFE_OpSetAttrIntList(TFE_Op* op, const char* attr_name, const int64_t* values, int num_values) { - auto s = op->operation->SetAttrIntList(attr_name, values, num_values); + auto s = + tensorflow::unwrap(op)->SetAttrIntList(attr_name, values, num_values); if (!s.ok()) { LOG(WARNING) << "Unable to set attribute: " << attr_name; } @@ -1296,7 +1283,7 @@ void TFE_OpSetAttrIntList(TFE_Op* op, const char* attr_name, void TFE_OpSetAttrTypeList(TFE_Op* op, const char* attr_name, const TF_DataType* values, int num_values) { - auto s = op->operation->SetAttrTypeList( + auto s = tensorflow::unwrap(op)->SetAttrTypeList( attr_name, reinterpret_cast(values), num_values); if (!s.ok()) { @@ -1306,7 +1293,8 @@ void TFE_OpSetAttrTypeList(TFE_Op* op, const char* attr_name, void TFE_OpSetAttrBoolList(TFE_Op* op, const char* attr_name, const unsigned char* values, int num_values) { - auto s = op->operation->SetAttrBoolList(attr_name, values, num_values); + auto s = + tensorflow::unwrap(op)->SetAttrBoolList(attr_name, values, num_values); if (!s.ok()) { LOG(WARNING) << "Unable to set attribute: " << attr_name; } @@ -1315,19 +1303,14 @@ void TFE_OpSetAttrBoolList(TFE_Op* op, const char* attr_name, void TFE_OpSetAttrShapeList(TFE_Op* op, const char* attr_name, const int64_t** dims, const int* num_dims, int num_values, TF_Status* out_status) { - out_status->status = - op->operation->SetAttrShapeList(attr_name, dims, num_dims, num_values); + out_status->status = tensorflow::unwrap(op)->SetAttrShapeList( + attr_name, dims, num_dims, num_values); } void TFE_OpSetAttrFunctionList(TFE_Op* op, const char* attr_name, const TFE_Op** value, int num_values) { - absl::FixedArray values( - num_values); - for (int i = 0; i < num_values; ++i) { - values[i] = value[i]->operation; - } - auto s = op->operation->SetAttrFunctionList(attr_name, - {values.data(), values.size()}); + auto s = tensorflow::unwrap(op)->SetAttrFunctionList( + attr_name, {tensorflow::unwrap(value), static_cast(num_values)}); if (!s.ok()) { LOG(WARNING) << "Unable to set attribute: " << attr_name; } @@ -1342,12 +1325,13 @@ void TFE_OpSetAttrValueProto(const TFE_Op* op, const char* attr_name, tensorflow::errors::InvalidArgument("Unparseable AttrValue proto"); return; } - if (op == nullptr || op->operation == nullptr) { + if (op == nullptr) { status->status = tensorflow::errors::InvalidArgument( "Got a null or uninitialized `op` argument"); return; } - tensorflow::EagerOperation* operation = OperationFromInterface(op->operation); + tensorflow::EagerOperation* operation = + OperationFromInterface(tensorflow::unwrap(const_cast(op))); operation->MutableAttrs()->Set(attr_name, attr_value); } @@ -1355,7 +1339,7 @@ TF_CAPI_EXPORT extern int TFE_OpGetInputLength(TFE_Op* op, const char* input_name, TF_Status* status) { int ret = -1; - status->status = op->operation->InputLength(input_name, &ret); + status->status = tensorflow::unwrap(op)->InputLength(input_name, &ret); return ret; } @@ -1363,71 +1347,29 @@ TF_CAPI_EXPORT extern int 
TFE_OpGetOutputLength(TFE_Op* op, const char* output_name, TF_Status* status) { int ret = -1; - status->status = op->operation->OutputLength(output_name, &ret); + status->status = tensorflow::unwrap(op)->OutputLength(output_name, &ret); return ret; } void TFE_Execute(TFE_Op* op, TFE_TensorHandle** retvals, int* num_retvals, TF_Status* status) { - absl::FixedArray handles( - *num_retvals); - status->status = op->operation->Execute(absl::MakeSpan(handles), num_retvals); - if (!status->status.ok()) { - return; - } - for (int i = 0; i < *num_retvals; ++i) { - retvals[i] = new TFE_TensorHandle{handles[i]}; - } + status->status = tensorflow::unwrap(op)->Execute( + absl::MakeSpan(tensorflow::unwrap(retvals), *num_retvals), num_retvals); } TFE_TensorHandle* TFE_TensorHandleCopyToDevice(TFE_TensorHandle* h, TFE_Context* ctx, const char* device_name, TF_Status* status) { - if (h == nullptr || h->handle == nullptr) { + if (h == nullptr) { status->status = tensorflow::errors::InvalidArgument("Invalid handle"); return nullptr; } - tensorflow::TensorHandle* handle = nullptr; - tensorflow::Device* device; - tensorflow::EagerContext* context = - tensorflow::ContextFromInterface(ctx->context); - status->status = context->FindDeviceFromName(device_name, &device); - if (!status->status.ok()) { - tensorflow::CustomDevice* dev; - status->status = context->FindCustomDeviceFromName(device_name, &dev); - if (status->status.ok()) { - status->status = dev->CopyTensorToDevice( - tensorflow::TensorHandleFromInterface(h->handle), &handle); - if (status->status.ok()) { - return new TFE_TensorHandle{handle}; - } - } - return nullptr; - } - // Handle tensor handles currently in custom devices - const char* handle_device_name = h->handle->DeviceName(&status->status); - if (!status->status.ok()) { - return nullptr; - } - tensorflow::CustomDevice* dev; - status->status = context->FindCustomDeviceFromName(handle_device_name, &dev); + auto* result = tensorflow::unwrap(ctx)->CopyTensorHandleToDevice( + tensorflow::unwrap(h), device_name, &status->status); if (status->status.ok()) { - status->status = dev->CopyTensorFromDevice( - tensorflow::TensorHandleFromInterface(h->handle), device_name, &handle); - if (status->status.ok()) { - return new TFE_TensorHandle{handle}; - } - return nullptr; - } - - // Handle regular case. 
- status->status = tensorflow::EagerCopyToDevice( - tensorflow::TensorHandleFromInterface(h->handle), context, - &context->Executor(), device, false, &handle); - if (status->status.ok()) { - return new TFE_TensorHandle{handle}; + return tensorflow::wrap(result); } return nullptr; } @@ -1442,39 +1384,39 @@ void TFE_ContextAddFunctionDef(TFE_Context* ctx, return; } tensorflow::EagerContext* context = - tensorflow::ContextFromInterface(ctx->context); + tensorflow::ContextFromInterface(tensorflow::unwrap(ctx)); status->status = context->AddFunctionDef(function_def); } void TFE_ContextAddFunction(TFE_Context* ctx, TF_Function* function, TF_Status* status) { tensorflow::EagerContext* context = - tensorflow::ContextFromInterface(ctx->context); + tensorflow::ContextFromInterface(tensorflow::unwrap(ctx)); status->status = context->AddFunctionDef(function->fdef); } void TFE_ContextRemoveFunction(TFE_Context* ctx, const char* name, TF_Status* status) { tensorflow::EagerContext* context = - tensorflow::ContextFromInterface(ctx->context); + tensorflow::ContextFromInterface(tensorflow::unwrap(ctx)); status->status = context->RemoveFunction(name); } unsigned char TFE_ContextHasFunction(TFE_Context* ctx, const char* name) { tensorflow::EagerContext* context = - tensorflow::ContextFromInterface(ctx->context); + tensorflow::ContextFromInterface(tensorflow::unwrap(ctx)); return context->FindFunctionDef(name) != nullptr; } void TFE_ContextEnableRunMetadata(TFE_Context* ctx) { tensorflow::EagerContext* context = - tensorflow::ContextFromInterface(ctx->context); + tensorflow::ContextFromInterface(tensorflow::unwrap(ctx)); context->SetShouldStoreGraphs(true); } void TFE_ContextDisableRunMetadata(TFE_Context* ctx) { tensorflow::EagerContext* context = - tensorflow::ContextFromInterface(ctx->context); + tensorflow::ContextFromInterface(tensorflow::unwrap(ctx)); context->SetShouldStoreGraphs(false); } @@ -1482,13 +1424,13 @@ void TFE_ContextDisableRunMetadata(TFE_Context* ctx) { TFE_TensorHandle* TFE_NewTensorHandle(const tensorflow::Tensor& t, TF_Status* status) { - return new TFE_TensorHandle{tensorflow::TensorHandle::CreateLocalHandle(t)}; + return tensorflow::wrap(tensorflow::TensorHandle::CreateLocalHandle(t)); } void TFE_ContextExportRunMetadata(TFE_Context* ctx, TF_Buffer* buf, TF_Status* status) { tensorflow::EagerContext* context = - tensorflow::ContextFromInterface(ctx->context); + tensorflow::ContextFromInterface(tensorflow::unwrap(ctx)); status->status = context->Executor().WaitForAllPendingNodes(); if (!status->status.ok()) return; tensorflow::mutex_lock ml(*context->MetadataMu()); @@ -1510,26 +1452,23 @@ TFE_Op* GetFunc(TFE_Context* ctx, const tensorflow::NameAttrList& func, } // namespace void TFE_ContextStartStep(TFE_Context* ctx) { - tensorflow::EagerContext* context = - tensorflow::ContextFromInterface(ctx->context); - context->StartStep(); + tensorflow::unwrap(ctx)->StartStep(); } void TFE_ContextEndStep(TFE_Context* ctx) { - tensorflow::EagerContext* context = - tensorflow::ContextFromInterface(ctx->context); - context->EndStep(); + tensorflow::unwrap(ctx)->EndStep(); } -void TFE_OpGetAttrs(TFE_Op* op, TFE_OpAttrs* attrs) { - tensorflow::EagerOperation* operation = OperationFromInterface(op->operation); - *attrs = TFE_OpAttrs(&operation->Attrs(), operation->Name().c_str()); +const TFE_OpAttrs* TFE_OpGetAttrs(TFE_Op* op) { + return tensorflow::wrap( + &OperationFromInterface(tensorflow::unwrap(op))->Attrs()); } void TFE_OpAddAttrs(TFE_Op* op, const TFE_OpAttrs* attrs) { tensorflow::AttrValueMap 
m; - attrs->attributes->FillAttrValueMap(&m); - tensorflow::EagerOperation* operation = OperationFromInterface(op->operation); + tensorflow::unwrap(attrs)->FillAttrValueMap(&m); + tensorflow::EagerOperation* operation = + OperationFromInterface(tensorflow::unwrap(op)); tensorflow::AttrBuilder* destination = operation->MutableAttrs(); for (const auto& attribute : m) { destination->Set(attribute.first, attribute.second); @@ -1539,8 +1478,8 @@ void TFE_OpAddAttrs(TFE_Op* op, const TFE_OpAttrs* attrs) { void TFE_OpAttrsSerialize(const TFE_OpAttrs* attrs, TF_Buffer* buf, TF_Status* status) { tensorflow::NameAttrList name_and_attrs; - attrs->attributes->FillAttrValueMap(name_and_attrs.mutable_attr()); - name_and_attrs.set_name(attrs->name); + tensorflow::unwrap(attrs)->FillAttrValueMap(name_and_attrs.mutable_attr()); + name_and_attrs.set_name(tensorflow::unwrap(attrs)->op_name()); status->status = MessageToBuffer(name_and_attrs, buf); } @@ -1587,6 +1526,7 @@ void SetOpAttrValueScalar(TFE_Context* ctx, TFE_Op* op, // require TFE_Op* and just convert it internally a NameAttrValue, so // consider adding an overload to the C API to make this case easier. TFE_OpSetAttrFunction(op, attr_name, func_op); + TFE_DeleteOp(func_op); } break; case tensorflow::AttrValue::kList: TF_FALLTHROUGH_INTENDED; @@ -1616,33 +1556,34 @@ class CustomDeviceAPI : public tensorflow::CustomDevice { const string& name() override { return name_; } tensorflow::Status CopyTensorToDevice( - tensorflow::TensorHandle* tensor, + tensorflow::TensorHandle* handle, tensorflow::TensorHandle** result) override { - tensor->Ref(); - TFE_TensorHandle tensor_handle{tensor}; + handle->Ref(); TF_Status status; - TFE_TensorHandle* result_handle = - device_.copy_tensor_to_device(context_, &tensor_handle, &status, info_); - tensor_handle.handle->Release(); + TFE_TensorHandle* result_handle = device_.copy_tensor_to_device( + context_, tensorflow::wrap(handle), &status, info_); + handle->Release(); if (!status.status.ok()) return status.status; - *result = tensorflow::TensorHandleFromInterface(result_handle->handle); + *result = tensorflow::TensorHandleFromInterface( + tensorflow::unwrap(result_handle)); (*result)->Ref(); TFE_DeleteTensorHandle(result_handle); return status.status; } tensorflow::Status CopyTensorFromDevice( - tensorflow::TensorHandle* tensor, + tensorflow::TensorHandle* handle, const tensorflow::string& target_device_name, tensorflow::TensorHandle** result) override { TF_Status status; - tensor->Ref(); - TFE_TensorHandle tensor_handle{tensor}; + handle->Ref(); TFE_TensorHandle* result_handle = device_.copy_tensor_from_device( - context_, &tensor_handle, target_device_name.c_str(), &status, info_); - tensor_handle.handle->Release(); + context_, tensorflow::wrap(handle), target_device_name.c_str(), &status, + info_); + handle->Release(); if (!status.status.ok()) return status.status; - *result = tensorflow::TensorHandleFromInterface(result_handle->handle); + *result = tensorflow::TensorHandleFromInterface( + tensorflow::unwrap(result_handle)); (*result)->Ref(); TFE_DeleteTensorHandle(result_handle); return status.status; @@ -1655,16 +1596,17 @@ class CustomDeviceAPI : public tensorflow::CustomDevice { inputs.reserve(op->Inputs().size()); for (int i = 0; i < op->Inputs().size(); ++i) { op->Inputs()[i]->Ref(); - inputs.push_back(new TFE_TensorHandle{op->Inputs()[i]}); + inputs.push_back(tensorflow::wrap(op->Inputs()[i])); } std::vector outputs(*num_retvals); TF_Status status; - TFE_OpAttrs attributes(&op->Attrs(), op->Name().c_str()); 
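// [Editor's note, not part of the patch] The mechanical change running through
// this file is that the C structs (TFE_Op, TFE_Context, TFE_TensorHandle,
// TFE_OpAttrs) no longer carry a pointer member; the C pointers are instead
// converted to and from the abstract C++ interfaces with tensorflow::unwrap()
// and tensorflow::wrap(), as in the device_.execute() call just below, which
// now passes wrap(&op->Attrs()) instead of a stack-allocated TFE_OpAttrs. A
// minimal sketch of what such conversion helpers look like is given here for
// orientation only; the real definitions live in the new tfe_*_internal.h
// headers introduced by this patch and may differ in detail.
struct TFE_Op;                     // opaque C API type, assumed forward-declared
namespace tensorflow {
class AbstractOperationInterface;  // C++ interface assumed to back TFE_Op

inline AbstractOperationInterface* unwrap(TFE_Op* op) {
  return reinterpret_cast<AbstractOperationInterface*>(op);
}
inline TFE_Op* wrap(AbstractOperationInterface* operation) {
  return reinterpret_cast<TFE_Op*>(operation);
}
}  // namespace tensorflow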
device_.execute(context_, inputs.size(), inputs.data(), op->Name().c_str(), - &attributes, num_retvals, outputs.data(), &status, info_); + wrap(&op->Attrs()), num_retvals, outputs.data(), &status, + info_); if (status.status.ok()) { for (int i = 0; i < *num_retvals; ++i) { - retvals[i] = tensorflow::TensorHandleFromInterface(outputs[i]->handle); + retvals[i] = tensorflow::TensorHandleFromInterface( + tensorflow::unwrap(outputs[i])); retvals[i]->Ref(); TFE_DeleteTensorHandle(outputs[i]); } @@ -1692,7 +1634,7 @@ void TFE_RegisterCustomDevice(TFE_Context* ctx, TFE_CustomDevice device, auto custom_device = std::make_unique(ctx, device, device_info, device_name); tensorflow::EagerContext* context = - tensorflow::ContextFromInterface(ctx->context); + tensorflow::ContextFromInterface(tensorflow::unwrap(ctx)); status->status = context->RegisterCustomDevice(device_name, std::move(custom_device)); } diff --git a/tensorflow/c/eager/c_api.h b/tensorflow/c/eager/c_api.h index 070b3a9bb60..5afe3047dd7 100644 --- a/tensorflow/c/eager/c_api.h +++ b/tensorflow/c/eager/c_api.h @@ -137,7 +137,7 @@ TF_CAPI_EXPORT extern void TFE_ContextSetServerDef(TFE_Context* ctx, // placed in memory of different devices or remote address spaces. typedef struct TFE_TensorHandle TFE_TensorHandle; -TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_NewTensorHandle(TF_Tensor* t, +TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_NewTensorHandle(const TF_Tensor* t, TF_Status* status); // Indicates that the caller will not be using `h` any more. TF_CAPI_EXPORT extern void TFE_DeleteTensorHandle(TFE_TensorHandle* h); diff --git a/tensorflow/c/eager/c_api_cluster_test.cc b/tensorflow/c/eager/c_api_cluster_test.cc new file mode 100644 index 00000000000..252a0408758 --- /dev/null +++ b/tensorflow/c/eager/c_api_cluster_test.cc @@ -0,0 +1,433 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/eager/c_api_internal.h" +#include "tensorflow/c/eager/c_api_test_util.h" +#include "tensorflow/c/eager/tfe_tensorhandle_internal.h" +#include "tensorflow/core/common_runtime/eager/eager_operation.h" +#include "tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h" +#include "tensorflow/core/platform/casts.h" +#include "tensorflow/core/platform/protobuf.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/protobuf/cluster.pb.h" +#include "tensorflow/core/protobuf/tensorflow_server.pb.h" + +namespace { + +using ::tensorflow::string; + +tensorflow::ServerDef GetServerDef(const string& job_name, int num_tasks) { + tensorflow::ServerDef server_def; + server_def.set_protocol("grpc"); + server_def.set_job_name(job_name); + server_def.set_task_index(0); + tensorflow::ClusterDef* cluster_def = server_def.mutable_cluster(); + tensorflow::JobDef* job_def = cluster_def->add_job(); + job_def->set_name(job_name); + for (int i = 0; i < num_tasks; i++) { + int port = tensorflow::testing::PickUnusedPortOrDie(); + job_def->mutable_tasks()->insert( + {i, tensorflow::strings::StrCat("localhost:", port)}); + } + return server_def; +} + +tensorflow::ServerDef GetServerDef(int num_tasks) { + return GetServerDef("localhost", num_tasks); +} + +void ReplaceTaskInServerDef(tensorflow::ServerDef* server_def, int task_index) { + tensorflow::JobDef* job_def = server_def->mutable_cluster()->mutable_job(0); + int port = tensorflow::testing::PickUnusedPortOrDie(); + job_def->mutable_tasks()->at(task_index) = + tensorflow::strings::StrCat("localhost:", port); +} + +void CheckTFE_TensorHandleHasFloats(TFE_TensorHandle* handle, + const std::vector& expected_values) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + TF_Tensor* t = TFE_TensorHandleResolve(handle, status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + std::unique_ptr actual_values(new float[expected_values.size()]); + EXPECT_EQ(sizeof(float) * expected_values.size(), TF_TensorByteSize(t)); + memcpy(actual_values.get(), TF_TensorData(t), TF_TensorByteSize(t)); + TF_DeleteTensor(t); + + for (int i = 0; i < expected_values.size(); i++) { + EXPECT_EQ(expected_values[i], actual_values[i]) + << "Mismatch in expected values at (zero-based) index " << i; + } +} + +void CheckRemoteMatMulExecutesOK(TFE_Context* ctx, + const char* remote_device_name, + const char* local_device_name) { + TF_Status* status = TF_NewStatus(); + TFE_TensorHandle* h0_task0 = TestMatrixTensorHandle(ctx); + + TFE_Op* matmul = MatMulOp(ctx, h0_task0, h0_task0); + TFE_OpSetDevice(matmul, remote_device_name, status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + TFE_TensorHandle* retvals[1]; + int num_retvals = 1; + TFE_Execute(matmul, &retvals[0], &num_retvals, status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + auto* retval_task0 = + TFE_TensorHandleCopyToDevice(retvals[0], ctx, local_device_name, status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + CheckTFE_TensorHandleHasFloats(retval_task0, {7, 10, 15, 22}); + + TFE_DeleteTensorHandle(retval_task0); + TFE_DeleteTensorHandle(h0_task0); + TFE_DeleteTensorHandle(retvals[0]); + + TFE_DeleteOp(matmul); + + TFE_Executor* executor = TFE_ContextGetExecutorForThread(ctx); + TFE_ExecutorWaitForAllPendingNodes(executor, 
status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteExecutor(executor); + TF_DeleteStatus(status); +} + +// Read the value of variable `var` and save it into `out_value`. +void ReadVariable(TFE_Context* ctx, TFE_TensorHandle* var, + TFE_TensorHandle** out_value) { + TF_Status* status = TF_NewStatus(); + TFE_Op* op = TFE_NewOp(ctx, "ReadVariableOp", status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_OpSetAttrType(op, "dtype", TF_FLOAT); + TFE_OpAddInput(op, var, status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + int num_retvals = 1; + TFE_Execute(op, out_value, &num_retvals, status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteOp(op); + TF_DeleteStatus(status); +} + +void TestRemoteExecuteChangeServerDef(bool async) { + tensorflow::ServerDef server_def = GetServerDef(2); + + // This server def has the task index set to 0. + string serialized = server_def.SerializeAsString(); + + server_def.set_task_index(1); + + std::unique_ptr worker_server; + ASSERT_TRUE(tensorflow::GrpcServer::Create( + server_def, tensorflow::Env::Default(), &worker_server) + .ok()); + ASSERT_TRUE(worker_server->Start().ok()); + + TF_Status* status = TF_NewStatus(); + TFE_ContextOptions* opts = TFE_NewContextOptions(); + TFE_ContextOptionsSetAsync(opts, static_cast(async)); + TFE_ContextOptionsSetDevicePlacementPolicy(opts, TFE_DEVICE_PLACEMENT_SILENT); + TFE_Context* ctx = TFE_NewContext(opts, status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteContextOptions(opts); + + TFE_ContextSetServerDef(ctx, 0, serialized.data(), serialized.size(), status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + const char remote_device_name[] = + "/job:localhost/replica:0/task:1/device:CPU:0"; + const char local_device_name[] = + "/job:localhost/replica:0/task:0/device:CPU:0"; + CheckRemoteMatMulExecutesOK(ctx, remote_device_name, local_device_name); + + TFE_Executor* executor = TFE_ContextGetExecutorForThread(ctx); + TFE_ExecutorWaitForAllPendingNodes(executor, status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + // TODO(b/136478427): Figure out how to correctly shut the server down. + worker_server.release(); + + // Update the server def with a new set of names (worker instead of + // localhost). + tensorflow::ServerDef updated_server_def = GetServerDef("worker", 2); + serialized = updated_server_def.SerializeAsString(); + + updated_server_def.set_task_index(1); + tensorflow::Status s = tensorflow::GrpcServer::Create( + updated_server_def, tensorflow::Env::Default(), &worker_server); + ASSERT_TRUE(s.ok()) << s.error_message(); + ASSERT_TRUE(worker_server->Start().ok()); + + TFE_ContextSetServerDef(ctx, 0, serialized.data(), serialized.size(), status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + // Create a new tensor_handle. + TFE_TensorHandle* h0_task0_new = TestMatrixTensorHandle(ctx); + + // Check that copying it to the old remote device (named localhost) fails. + TFE_TensorHandleCopyToDevice(h0_task0_new, ctx, remote_device_name, status); + EXPECT_NE(TF_OK, TF_GetCode(status)) << TF_Message(status); + + // Copying and executing on the new remote device works. 
+ const char new_remote_device_name[] = + "/job:worker/replica:0/task:1/device:CPU:0"; + const char new_local_device_name[] = + "/job:worker/replica:0/task:0/device:CPU:0"; + + auto* h0_task1_new = TFE_TensorHandleCopyToDevice( + h0_task0_new, ctx, new_remote_device_name, status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + TFE_DeleteTensorHandle(h0_task0_new); + TFE_DeleteTensorHandle(h0_task1_new); + + CheckRemoteMatMulExecutesOK(ctx, new_remote_device_name, + new_local_device_name); + + TFE_ExecutorWaitForAllPendingNodes(executor, status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteExecutor(executor); + + TF_DeleteStatus(status); + + TFE_DeleteContext(ctx); + + // TODO(b/136478427): Figure out how to correctly shut the server down. + worker_server.release(); +} + +TEST(CAPI, RemoteExecuteChangeServerDef) { + TestRemoteExecuteChangeServerDef(false); +} +TEST(CAPI, RemoteExecuteChangeServerDefAsync) { + TestRemoteExecuteChangeServerDef(true); +} + +void TestRemoteExecuteUpdateServerDef(bool async) { + tensorflow::ServerDef server_def = GetServerDef(2); + // This server def has the task index set to 0. + string serialized = server_def.SerializeAsString(); + + server_def.set_task_index(1); + std::unique_ptr worker_server; + ASSERT_TRUE(tensorflow::GrpcServer::Create( + server_def, tensorflow::Env::Default(), &worker_server) + .ok()); + ASSERT_TRUE(worker_server->Start().ok()); + + TF_Status* status = TF_NewStatus(); + TFE_ContextOptions* opts = TFE_NewContextOptions(); + TFE_ContextOptionsSetAsync(opts, static_cast(async)); + TFE_ContextOptionsSetDevicePlacementPolicy(opts, TFE_DEVICE_PLACEMENT_SILENT); + TFE_Context* ctx = TFE_NewContext(opts, status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteContextOptions(opts); + + TFE_ContextSetServerDef(ctx, 0, serialized.data(), serialized.size(), status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + const char local_device_name[] = + "/job:localhost/replica:0/task:0/device:CPU:0"; + const char remote_device_name[] = + "/job:localhost/replica:0/task:1/device:CPU:0"; + CheckRemoteMatMulExecutesOK(ctx, remote_device_name, local_device_name); + + TFE_ContextUpdateServerDef(ctx, 0, serialized.data(), serialized.size(), + status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + CheckRemoteMatMulExecutesOK(ctx, remote_device_name, local_device_name); + + TFE_DeleteContext(ctx); + TF_DeleteStatus(status); + + // TODO(b/136478427): Figure out how to correctly shut the server down. + worker_server.release(); +} + +TEST(CAPI, RemoteExecuteUpdateServerDef) { + TestRemoteExecuteUpdateServerDef(false); +} + +TEST(CAPI, RemoteExecuteUpdateServerDefAsync) { + TestRemoteExecuteUpdateServerDef(true); +} + +void TestRemoteExecuteUpdateServerDefResourceAccess(bool async) { + tensorflow::ServerDef server_def = GetServerDef(2); + // This server def has the task index set to 0. 
+ string serialized = server_def.SerializeAsString(); + + server_def.set_task_index(1); + std::unique_ptr worker_server; + ASSERT_TRUE(tensorflow::GrpcServer::Create( + server_def, tensorflow::Env::Default(), &worker_server) + .ok()); + ASSERT_TRUE(worker_server->Start().ok()); + + TF_Status* status = TF_NewStatus(); + TFE_ContextOptions* opts = TFE_NewContextOptions(); + TFE_ContextOptionsSetAsync(opts, static_cast(async)); + TFE_ContextOptionsSetDevicePlacementPolicy(opts, TFE_DEVICE_PLACEMENT_SILENT); + TFE_Context* ctx = TFE_NewContext(opts, status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteContextOptions(opts); + + TFE_ContextSetServerDef(ctx, 0, serialized.data(), serialized.size(), status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + const char dev0_name[] = "/job:localhost/replica:0/task:0/device:CPU:0"; + const char dev1_name[] = "/job:localhost/replica:0/task:1/device:CPU:0"; + + TFE_TensorHandle* var_handle0 = TestVariable(ctx, 1.0, dev0_name); + EXPECT_NE(var_handle0, nullptr); + TFE_TensorHandle* var_handle1 = TestVariable(ctx, 2.0, dev1_name); + EXPECT_NE(var_handle1, nullptr); + + TFE_TensorHandle* value_handle = nullptr; + ReadVariable(ctx, var_handle1, &value_handle); + CheckTFE_TensorHandleHasFloats(value_handle, {2}); + TFE_DeleteTensorHandle(value_handle); + + // Start a new worker to replace task:1 + ReplaceTaskInServerDef(&server_def, 1); + server_def.set_task_index(1); + // TODO(b/136478427): Figure out how to correctly shut the server down. + worker_server.release(); + ASSERT_TRUE(tensorflow::GrpcServer::Create( + server_def, tensorflow::Env::Default(), &worker_server) + .ok()); + ASSERT_TRUE(worker_server->Start().ok()); + + // Update server def to replace the remote device with the device info on the + // new worker (different incarnation ID). + server_def.set_task_index(0); + string serialized_update = server_def.SerializeAsString(); + TFE_ContextUpdateServerDef(ctx, 0, serialized_update.data(), + serialized_update.size(), status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + // The device of var_handle0 is local device which is the same before and + // after cluster update. Remove resource with valid device should succeed. + TFE_Op* op = TFE_NewOp(ctx, "DestroyResourceOp", status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_OpAddInput(op, var_handle0, status); + TFE_OpSetDevice(op, dev0_name, status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + int num_retvals = 0; + TFE_Execute(op, nullptr, &num_retvals, status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteOp(op); + + // The device of var_handle1 is remote device, which was replaced during + // cluster update. Removing resource with invalid device should fail + // gracefully (i.e., with error status) instead of crashing with segfaults. + op = TFE_NewOp(ctx, "DestroyResourceOp", status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_OpAddInput(op, var_handle1, status); + TFE_OpSetDevice(op, dev1_name, status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + num_retvals = 0; + TFE_Execute(op, nullptr, &num_retvals, status); + EXPECT_NE(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteOp(op); + + TFE_DeleteTensorHandle(var_handle0); + TFE_DeleteTensorHandle(var_handle1); + + TFE_DeleteContext(ctx); + TF_DeleteStatus(status); + + // TODO(b/136478427): Figure out how to correctly shut the server down. 
+ worker_server.release();
+}
+
+TEST(CAPI, TestRemoteExecuteUpdateServerDefResourceAccess) {
+ TestRemoteExecuteUpdateServerDefResourceAccess(false);
+}
+
+TEST(CAPI, TestRemoteExecuteUpdateServerDefResourceAccessAsync) {
+ TestRemoteExecuteUpdateServerDefResourceAccess(true);
+}
+
+void TestRemoteExecuteUpdateServerDefWithFailures(bool async) {
+ // Fail fast on GetStatus requests so we can get errors instead of timeouts
+ // when updating the cluster with a non-existent worker.
+ tensorflow::setenv("GRPC_FAIL_FAST", "TRUE", /*overwrite=*/1);
+
+ tensorflow::ServerDef server_def = GetServerDef(2);
+ // This server def has the task index set to 0.
+ string serialized = server_def.SerializeAsString();
+
+ server_def.set_task_index(1);
+ std::unique_ptr<tensorflow::GrpcServer> worker_server;
+ ASSERT_TRUE(tensorflow::GrpcServer::Create(
+ server_def, tensorflow::Env::Default(), &worker_server)
+ .ok());
+ ASSERT_TRUE(worker_server->Start().ok());
+
+ TF_Status* status = TF_NewStatus();
+ TFE_ContextOptions* opts = TFE_NewContextOptions();
+ TFE_ContextOptionsSetAsync(opts, static_cast<unsigned char>(async));
+ TFE_ContextOptionsSetDevicePlacementPolicy(opts, TFE_DEVICE_PLACEMENT_SILENT);
+ TFE_Context* ctx = TFE_NewContext(opts, status);
+ EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+ TFE_DeleteContextOptions(opts);
+
+ TFE_ContextSetServerDef(ctx, 0, serialized.data(), serialized.size(), status);
+ EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+ const char local_device_name[] =
+ "/job:localhost/replica:0/task:0/device:CPU:0";
+ const char remote_device_name[] =
+ "/job:localhost/replica:0/task:1/device:CPU:0";
+ CheckRemoteMatMulExecutesOK(ctx, remote_device_name, local_device_name);
+
+ // Adding a non-existent remote worker to cluster def. This should cause the
+ // UpdateServerDef call to fail.
+ tensorflow::ClusterDef* cluster_def = server_def.mutable_cluster();
+ tensorflow::JobDef* job_def = cluster_def->mutable_job(0);
+ int port = tensorflow::testing::PickUnusedPortOrDie();
+ job_def->mutable_tasks()->insert(
+ {2, tensorflow::strings::StrCat("localhost:", port)});
+ server_def.set_task_index(0);
+ string serialized_update = server_def.SerializeAsString();
+ TFE_ContextUpdateServerDef(ctx, 0, serialized_update.data(),
+ serialized_update.size(), status);
+ EXPECT_NE(TF_OK, TF_GetCode(status)) << TF_Message(status);
+
+ // Even after the previously failed cluster update, another update and op
+ // execution should work fine as long as the provided server_def is valid.
+ TFE_ContextUpdateServerDef(ctx, 0, serialized.data(), serialized.size(),
+ status);
+ EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+ CheckRemoteMatMulExecutesOK(ctx, remote_device_name, local_device_name);
+
+ TFE_DeleteContext(ctx);
+ TF_DeleteStatus(status);
+
+ // TODO(b/136478427): Figure out how to correctly shut the server down.
+ worker_server.release();
+ tensorflow::unsetenv("GRPC_FAIL_FAST");
+}
+
+TEST(CAPI, RemoteExecuteUpdateServerDefWithFailures) {
+ TestRemoteExecuteUpdateServerDefWithFailures(false);
+}
+
+TEST(CAPI, RemoteExecuteUpdateServerDefWithFailuresAsync) {
+ TestRemoteExecuteUpdateServerDefWithFailures(true);
+}
+
+} // namespace
diff --git a/tensorflow/c/eager/c_api_debug.cc b/tensorflow/c/eager/c_api_debug.cc
index f5bf029a000..6827021455b 100644
--- a/tensorflow/c/eager/c_api_debug.cc
+++ b/tensorflow/c/eager/c_api_debug.cc
@@ -17,8 +17,11 @@ limitations under the License.
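// [Editor's note, not part of the patch] The c_api_debug.cc hunk that follows
// only changes how the TensorHandle is unwrapped inside
// TFE_TensorHandleTensorDebugInfo; the caller-facing behaviour is unchanged.
// A rough usage sketch is shown below. It assumes the pre-existing companion
// functions TFE_TensorDebugInfoOnDeviceNumDims, TFE_TensorDebugInfoOnDeviceDim
// and TFE_DeleteTensorDebugInfo, which are not part of this diff.
void ExampleReadOnDeviceShape(TFE_TensorHandle* h, TF_Status* status) {
  TFE_TensorDebugInfo* info = TFE_TensorHandleTensorDebugInfo(h, status);
  if (TF_GetCode(status) != TF_OK) return;
  // The debug info exposes the fully padded, minor-to-major on-device shape.
  for (int i = 0; i < TFE_TensorDebugInfoOnDeviceNumDims(info); ++i) {
    int64_t dim = TFE_TensorDebugInfoOnDeviceDim(info, i);
    (void)dim;  // e.g. log or inspect the padded dimension
  }
  TFE_DeleteTensorDebugInfo(info);
}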
#include "tensorflow/c/c_api.h" #include "tensorflow/c/eager/c_api.h" -#include "tensorflow/c/eager/c_api_internal.h" +#include "tensorflow/c/eager/tfe_tensor_debug_info_internal.h" +#include "tensorflow/c/eager/tfe_tensorhandle_internal.h" +#include "tensorflow/c/tf_status_internal.h" #include "tensorflow/core/common_runtime/eager/tensor_handle.h" +#include "tensorflow/core/platform/status.h" #ifdef TENSORFLOW_EAGER_USE_XLA #include "tensorflow/compiler/jit/xla_device.h" #endif // TENSORFLOW_EAGER_USE_XLA @@ -54,7 +57,8 @@ extern "C" { TF_CAPI_EXPORT extern TFE_TensorDebugInfo* TFE_TensorHandleTensorDebugInfo( TFE_TensorHandle* h, TF_Status* status) { - tensorflow::TensorHandle* handle = TensorHandleFromInterface(h->handle); + tensorflow::TensorHandle* handle = + TensorHandleFromInterface(tensorflow::unwrap(h)); const tensorflow::Tensor* tensor; status->status = handle->Tensor(&tensor); if (!status->status.ok()) { diff --git a/tensorflow/c/eager/c_api_experimental.cc b/tensorflow/c/eager/c_api_experimental.cc index b43af710c04..0d71b11531b 100644 --- a/tensorflow/c/eager/c_api_experimental.cc +++ b/tensorflow/c/eager/c_api_experimental.cc @@ -19,7 +19,11 @@ limitations under the License. #include "tensorflow/c/c_api.h" #include "tensorflow/c/eager/c_api_internal.h" +#include "tensorflow/c/eager/tfe_context_internal.h" +#include "tensorflow/c/eager/tfe_op_internal.h" +#include "tensorflow/c/eager/tfe_tensorhandle_internal.h" #include "tensorflow/c/tf_status_helper.h" +#include "tensorflow/core/common_runtime/composite_device.h" #include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/common_runtime/eager/eager_operation.h" #include "tensorflow/core/lib/monitoring/counter.h" @@ -34,9 +38,10 @@ using tensorflow::string; void TFE_OpReset(TFE_Op* op_to_reset, const char* op_or_function_name, const char* raw_device_name, TF_Status* status) { if (op_to_reset) { - op_to_reset->operation->Clear(); - status->status = - op_to_reset->operation->Reset(op_or_function_name, raw_device_name); + tensorflow::AbstractOperationInterface* op = + tensorflow::unwrap(op_to_reset); + op->Clear(); + status->status = op->Reset(op_or_function_name, raw_device_name); } else { TF_SetStatus(status, TF_INVALID_ARGUMENT, "op_to_reset should not be nullptr"); @@ -45,13 +50,13 @@ void TFE_OpReset(TFE_Op* op_to_reset, const char* op_or_function_name, void TFE_ContextEnableGraphCollection(TFE_Context* ctx) { tensorflow::EagerContext* context = - tensorflow::ContextFromInterface(ctx->context); + tensorflow::ContextFromInterface(tensorflow::unwrap(ctx)); context->SetShouldStoreGraphs(true); } void TFE_ContextDisableGraphCollection(TFE_Context* ctx) { tensorflow::EagerContext* context = - tensorflow::ContextFromInterface(ctx->context); + tensorflow::ContextFromInterface(tensorflow::unwrap(ctx)); context->SetShouldStoreGraphs(false); } @@ -483,7 +488,7 @@ void TFE_ContextOptionsSetMirroringPolicy(TFE_ContextOptions* options, void TFE_ContextSetThreadLocalMirroringPolicy( TFE_Context* ctx, TFE_ContextMirroringPolicy policy) { tensorflow::EagerContext* context = - tensorflow::ContextFromInterface(ctx->context); + tensorflow::ContextFromInterface(tensorflow::unwrap(ctx)); context->SetThreadLocalMirroringPolicy( static_cast(policy)); } @@ -494,7 +499,7 @@ void TFE_ContextSetThreadLocalMirroringPolicy( extern TFE_ContextMirroringPolicy TFE_ContextGetMirroringPolicy( TFE_Context* ctx) { tensorflow::EagerContext* context = - tensorflow::ContextFromInterface(ctx->context); + 
tensorflow::ContextFromInterface(tensorflow::unwrap(ctx)); return static_cast(context->GetMirroringPolicy()); } @@ -530,7 +535,7 @@ void TFE_OpSetCancellationManager(TFE_Op* op, TFE_CancellationManager* cancellation_manager, TF_Status* status) { tensorflow::EagerOperation* operation = - tensorflow::OperationFromInterface(op->operation); + tensorflow::OperationFromInterface(tensorflow::unwrap(op)); operation->SetCancellationManager( &cancellation_manager->cancellation_manager); status->status = tensorflow::Status::OK(); @@ -557,19 +562,19 @@ void TFE_ExecutorClearError(TFE_Executor* executor) { void TFE_ContextSetExecutorForThread(TFE_Context* ctx, TFE_Executor* executor) { tensorflow::EagerContext* context = - tensorflow::ContextFromInterface(ctx->context); + tensorflow::ContextFromInterface(tensorflow::unwrap(ctx)); context->SetExecutorForThread(executor->executor()); } TFE_Executor* TFE_ContextGetExecutorForThread(TFE_Context* ctx) { tensorflow::EagerContext* context = - tensorflow::ContextFromInterface(ctx->context); + tensorflow::ContextFromInterface(tensorflow::unwrap(ctx)); return new TFE_Executor(&context->Executor()); } void TFE_HostAddressSpace(TFE_Context* ctx, TF_Buffer* buf) { tensorflow::EagerContext* context = - tensorflow::ContextFromInterface(ctx->context); + tensorflow::ContextFromInterface(tensorflow::unwrap(ctx)); auto address_space = tensorflow::DeviceNameUtils::AddressSpace( context->HostCPU()->parsed_name()); auto str = tensorflow::DeviceNameUtils::ParsedNameToString(address_space); @@ -585,7 +590,7 @@ void TFE_HostAddressSpace(TFE_Context* ctx, TF_Buffer* buf) { void TFE_ContextGetFunctionDef(TFE_Context* ctx, const char* function_name, TF_Buffer* buf, TF_Status* status) { tensorflow::EagerContext* context = - tensorflow::ContextFromInterface(ctx->context); + tensorflow::ContextFromInterface(tensorflow::unwrap(ctx)); auto* function_def = context->FindFunctionDef(function_name); if (function_def == nullptr) { status->status = tensorflow::errors::NotFound( @@ -611,13 +616,14 @@ TF_Tensor* TFE_AllocateHostTensor(TFE_Context* ctx, TF_DataType dtype, dimvec[i] = static_cast(dims[i]); } - if (ctx == nullptr || ctx->context == nullptr) { + if (ctx == nullptr) { status->status = tensorflow::errors::InvalidArgument("Invalid Context"); return nullptr; } - tensorflow::AbstractTensorInterface* t = ctx->context->CreateTensor( - static_cast(dtype), dimvec); + tensorflow::AbstractTensorInterface* t = + tensorflow::unwrap(ctx)->CreateTensor( + static_cast(dtype), dimvec); if (t == nullptr) { status->status = @@ -630,5 +636,38 @@ TF_Tensor* TFE_AllocateHostTensor(TFE_Context* ctx, TF_DataType dtype, TFE_TensorHandle* TFE_NewTensorHandleFromTensor(TFE_Context* ctx, TF_Tensor* t, TF_Status* status) { - return new TFE_TensorHandle{ctx->context->CreateLocalHandle(t->tensor)}; + return tensorflow::wrap( + tensorflow::unwrap(ctx)->CreateLocalHandle(t->tensor)); +} + +TFE_TensorHandle* TFE_CreatePackedTensorHandle(TFE_Context* ctx, + TFE_TensorHandle** handles, + int* num_handles, + TF_Status* status) { + std::vector tensor_handles; + tensor_handles.reserve(*num_handles); + for (int i = 0; i < *num_handles; ++i) { + tensor_handles.push_back( + tensorflow::TensorHandleFromInterface(tensorflow::unwrap(handles[i]))); + } + tensorflow::EagerContext* context = + tensorflow::ContextFromInterface(tensorflow::unwrap(ctx)); + tensorflow::TensorHandle* handle = nullptr; + status->status = tensorflow::TensorHandle::CreatePackedHandle( + std::move(tensor_handles), context, &handle); + return 
tensorflow::wrap(handle);
+}
+
+void TFE_ContextSetSoftDevicePlacement(TFE_Context* ctx, unsigned char enable,
+ TF_Status* status) {
+ tensorflow::EagerContext* context =
+ tensorflow::ContextFromInterface(tensorflow::unwrap(ctx));
+ context->SetAllowSoftPlacement(enable);
+}
+
+void TFE_ContextSetLogDevicePlacement(TFE_Context* ctx, unsigned char enable,
+ TF_Status* status) {
+ tensorflow::EagerContext* context =
+ tensorflow::ContextFromInterface(tensorflow::unwrap(ctx));
+ context->SetLogDevicePlacement(enable);
 }
diff --git a/tensorflow/c/eager/c_api_experimental.h b/tensorflow/c/eager/c_api_experimental.h
index dc1f9eaade3..1b8efe61ee0 100644
--- a/tensorflow/c/eager/c_api_experimental.h
+++ b/tensorflow/c/eager/c_api_experimental.h
@@ -431,11 +431,9 @@ TF_CAPI_EXPORT extern void TFE_HostAddressSpace(TFE_Context* ctx,
 // A reference to an op's name -> attribute mapping
 typedef struct TFE_OpAttrs TFE_OpAttrs;
-// Fetch a struct with a reference to information about attributes of `op`.
-//
-// The `attrs` struct does not own any memory, and `op` must outlive it.
-TF_CAPI_EXPORT extern void TFE_OpGetAttrs(TFE_Op* op, TFE_OpAttrs* attrs);
-
+// Fetch a reference to `op`'s attributes. The returned reference is only valid
+// while `op` is alive.
+const TFE_OpAttrs* TFE_OpGetAttrs(TFE_Op* op);
 // Add attributes in `attrs` to `op`.
 //
 // Does not overwrite or update existing attributes, but adds new ones.
@@ -543,6 +541,26 @@ TF_CAPI_EXPORT extern TF_Tensor* TFE_AllocateHostTensor(TFE_Context* ctx,
 TF_CAPI_EXPORT TFE_TensorHandle* TFE_NewTensorHandleFromTensor(
 TFE_Context* ctx, TF_Tensor* t, TF_Status* status);
+// Create a packed TensorHandle with the given list of TensorHandles.
+// If `handles` are on the same device, assign the same device to the packed
+// handle; if `handles` are on different devices, assign a CompositeDevice to
+// it.
+TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_CreatePackedTensorHandle(
+ TFE_Context* ctx, TFE_TensorHandle** handles, int* num_handles,
+ TF_Status* status);
+
+// Configure soft device placement policy for the eager executor. Note this
+// policy is applied to any subsequent op executions.
+TF_CAPI_EXPORT void TFE_ContextSetSoftDevicePlacement(TFE_Context* ctx,
+ unsigned char enable,
+ TF_Status* status);
+
+// Configure device placement policy logging for the eager executor. Note this
+// policy is applied to any subsequent op executions.
+TF_CAPI_EXPORT void TFE_ContextSetLogDevicePlacement(TFE_Context* ctx,
+ unsigned char enable,
+ TF_Status* status);
+
 #ifdef __cplusplus
 } /* end extern "C" */
 #endif
diff --git a/tensorflow/c/eager/c_api_internal.h b/tensorflow/c/eager/c_api_internal.h
index 00798c367f0..4d9be0c2501 100644
--- a/tensorflow/c/eager/c_api_internal.h
+++ b/tensorflow/c/eager/c_api_internal.h
@@ -15,39 +15,17 @@ limitations under the License.
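// [Editor's note, not part of the patch] A condensed usage sketch for the
// experimental C APIs added or changed in c_api_experimental.h above. It
// assumes the eager C API headers are included and that the context, handles,
// ops and status object were created elsewhere (for example as in the tests in
// this patch); per-call error checking is omitted for brevity.
void ExampleUseOfNewExperimentalApis(TFE_Context* ctx, TFE_TensorHandle* h0,
                                     TFE_TensorHandle* h1, TFE_Op* op,
                                     TFE_Op* other_op, TF_Status* status) {
  // Toggle soft placement and device placement logging from the C API.
  TFE_ContextSetSoftDevicePlacement(ctx, /*enable=*/1, status);
  TFE_ContextSetLogDevicePlacement(ctx, /*enable=*/1, status);

  // Pack two handles into one; if they live on different devices the packed
  // handle is assigned a CompositeDevice.
  TFE_TensorHandle* handles[2] = {h0, h1};
  int num_handles = 2;
  TFE_TensorHandle* packed =
      TFE_CreatePackedTensorHandle(ctx, handles, &num_handles, status);

  // TFE_OpGetAttrs now returns a borrowed const pointer instead of filling an
  // out-parameter; it is only valid while `op` is alive.
  const TFE_OpAttrs* attrs = TFE_OpGetAttrs(op);
  TFE_OpAddAttrs(other_op, attrs);

  TFE_DeleteTensorHandle(packed);
}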
#ifndef TENSORFLOW_C_EAGER_C_API_INTERNAL_H_ #define TENSORFLOW_C_EAGER_C_API_INTERNAL_H_ -#include -#include -#include -#include -#include -#include -#include - -#include "tensorflow/c/c_api.h" #include "tensorflow/c/c_api_internal.h" #include "tensorflow/c/eager/c_api.h" #include "tensorflow/c/eager/c_api_experimental.h" -#include "tensorflow/c/eager/context_interface.h" -#include "tensorflow/c/eager/operation_interface.h" -#include "tensorflow/c/eager/tensor_handle_interface.h" -#include "tensorflow/core/common_runtime/device_factory.h" -#include "tensorflow/core/common_runtime/eager/attr_builder.h" -#include "tensorflow/core/common_runtime/eager/eager_executor.h" -#include "tensorflow/core/common_runtime/function.h" -#include "tensorflow/core/common_runtime/rendezvous_mgr.h" -#include "tensorflow/core/framework/cancellation.h" -#include "tensorflow/core/framework/rendezvous.h" -#include "tensorflow/core/lib/gtl/inlined_vector.h" -#include "tensorflow/core/lib/gtl/map_util.h" -#include "tensorflow/core/lib/monitoring/counter.h" -#include "tensorflow/core/lib/monitoring/gauge.h" -#include "tensorflow/core/lib/monitoring/sampler.h" -#include "tensorflow/core/platform/errors.h" -#include "tensorflow/core/platform/mutex.h" -#include "tensorflow/core/platform/stringpiece.h" -#include "tensorflow/core/platform/thread_annotations.h" -#include "tensorflow/core/public/version.h" +#include "tensorflow/c/eager/tfe_cancellation_manager_internal.h" // IWYU pragma: export +#include "tensorflow/c/eager/tfe_executor_internal.h" // IWYU pragma: export +#include "tensorflow/c/eager/tfe_monitoring_internal.h" // IWYU pragma: export +#include "tensorflow/c/eager/tfe_op_attrs_internal.h" // IWYU pragma: export +#include "tensorflow/c/eager/tfe_tensor_debug_info_internal.h" // IWYU pragma: export +// TODO(b/154564140): Move this to its own header. This requires splitting +// c_api_experimental.h struct TFE_ContextOptions { TF_SessionOptions session_options; // true if async execution is enabled. @@ -61,199 +39,4 @@ struct TFE_ContextOptions { bool use_tfrt = false; }; -// Wraps a pointer to a context implementation. -// -// WARNING: Since the underlying object could be ref-counted a user of this -// interface cannot destruct the underlying context object. Instead, call -// TFE_DeleteContext who calls Release() on the context pointer and deletes -// the TFE_Context structure. -struct TFE_Context { - tensorflow::AbstractContextInterface* context; -}; - -// Wraps a pointer to a tensor handle implementation. -// -// WARNING: Since the underlying object could be ref-counted a user of this -// interface cannot destruct the underlying handle object. Instead, call -// TFE_DeleteTensorHandle who calls Release() on the handle pointer and deletes -// the TFE_TensorHandle structure. -struct TFE_TensorHandle { - tensorflow::AbstractTensorHandleInterface* handle; -}; - -struct TFE_TensorDebugInfo { - explicit TFE_TensorDebugInfo(const std::vector& dims) - : dev_dims(dims) {} - - // Fully-padded, minor-to-major. - std::vector dev_dims; -}; - -// Wraps a pointer to an operation implementation. -// -// WARNING: Since the underlying object could be ref-counted a user of this -// interface cannot destruct the underlying operation object. Instead, call -// TFE_DeleteOp who calls Release() on the operation pointer and deletes -// the TFE_Op structure. 
-struct TFE_Op { - tensorflow::AbstractOperationInterface* operation; -}; - -struct TFE_MonitoringCounterCell { - tensorflow::monitoring::CounterCell cell; -}; - -template -struct TFE_MonitoringCounter { - template - TFE_MonitoringCounter(const char* name, const char* description, - LabelDesc&&... label) { - counter = absl::WrapUnique(tensorflow::monitoring::Counter::New( - name, description, label...)); - } - - std::unique_ptr> counter; -}; - -struct TFE_MonitoringCounter0 : TFE_MonitoringCounter<0> { - using TFE_MonitoringCounter::TFE_MonitoringCounter; -}; -struct TFE_MonitoringCounter1 : TFE_MonitoringCounter<1> { - using TFE_MonitoringCounter::TFE_MonitoringCounter; -}; -struct TFE_MonitoringCounter2 : TFE_MonitoringCounter<2> { - using TFE_MonitoringCounter::TFE_MonitoringCounter; -}; - -struct TFE_MonitoringIntGaugeCell { - tensorflow::monitoring::GaugeCell cell; -}; -struct TFE_MonitoringStringGaugeCell { - tensorflow::monitoring::GaugeCell cell; -}; -struct TFE_MonitoringBoolGaugeCell { - tensorflow::monitoring::GaugeCell cell; -}; - -template -struct TFE_MonitoringGauge { - template - TFE_MonitoringGauge(const char* name, const char* description, - LabelDesc&&... label) { - gauge = absl::WrapUnique( - tensorflow::monitoring::Gauge::New( - name, description, label...)); - } - - std::unique_ptr> gauge; -}; - -struct TFE_MonitoringIntGauge0 : TFE_MonitoringGauge { - using TFE_MonitoringGauge::TFE_MonitoringGauge; -}; -struct TFE_MonitoringIntGauge1 : TFE_MonitoringGauge { - using TFE_MonitoringGauge::TFE_MonitoringGauge; -}; -struct TFE_MonitoringIntGauge2 : TFE_MonitoringGauge { - using TFE_MonitoringGauge::TFE_MonitoringGauge; -}; - -struct TFE_MonitoringStringGauge0 : TFE_MonitoringGauge { - using TFE_MonitoringGauge::TFE_MonitoringGauge; -}; -struct TFE_MonitoringStringGauge1 : TFE_MonitoringGauge { - using TFE_MonitoringGauge::TFE_MonitoringGauge; -}; -struct TFE_MonitoringStringGauge2 : TFE_MonitoringGauge { - using TFE_MonitoringGauge::TFE_MonitoringGauge; -}; - -struct TFE_MonitoringBoolGauge0 : TFE_MonitoringGauge { - using TFE_MonitoringGauge::TFE_MonitoringGauge; -}; -struct TFE_MonitoringBoolGauge1 : TFE_MonitoringGauge { - using TFE_MonitoringGauge::TFE_MonitoringGauge; -}; -struct TFE_MonitoringBoolGauge2 : TFE_MonitoringGauge { - using TFE_MonitoringGauge::TFE_MonitoringGauge; -}; - -struct TFE_MonitoringBuckets { - explicit TFE_MonitoringBuckets( - std::function(void)> - fn) { - create_buckets = fn; - } - - std::function(void)> - create_buckets; -}; - -struct TFE_MonitoringSamplerCell { - tensorflow::monitoring::SamplerCell cell; -}; - -template -struct TFE_MonitoringSampler { - template - TFE_MonitoringSampler( - const char* name, - std::unique_ptr buckets, - const char* description, LabelDesc&&... label) { - sampler = absl::WrapUnique(tensorflow::monitoring::Sampler::New( - {name, description, label...}, std::move(buckets))); - } - - std::unique_ptr> sampler; -}; - -struct TFE_MonitoringSampler0 : TFE_MonitoringSampler<0> { - using TFE_MonitoringSampler::TFE_MonitoringSampler; -}; -struct TFE_MonitoringSampler1 : TFE_MonitoringSampler<1> { - using TFE_MonitoringSampler::TFE_MonitoringSampler; -}; -struct TFE_MonitoringSampler2 : TFE_MonitoringSampler<2> { - using TFE_MonitoringSampler::TFE_MonitoringSampler; -}; - -namespace tensorflow { -// Set an AttrValue on the op. Doesn't handle the list types. 
-void SetOpAttrValueScalar(TFE_Context* ctx, TFE_Op* op, - const tensorflow::AttrValue& default_value, - const char* attr_name, TF_Status* status); -} // namespace tensorflow - -struct TFE_CancellationManager { - tensorflow::CancellationManager cancellation_manager; -}; - -struct TFE_Executor { - explicit TFE_Executor(bool async) - : owned_executor(new tensorflow::EagerExecutor(async)) {} - - explicit TFE_Executor(tensorflow::EagerExecutor* executor) - : owned_executor(nullptr), unowned_executor(executor) {} - - tensorflow::EagerExecutor* executor() { - return owned_executor == nullptr ? unowned_executor : owned_executor.get(); - } - - std::unique_ptr owned_executor; - tensorflow::EagerExecutor* unowned_executor; -}; - -// An equivalent of a tensorflow::NameAttrList protocol buffer, but used in ways -// that sometimes do not require serialization. -struct TFE_OpAttrs { - explicit TFE_OpAttrs() : name(nullptr), attributes(nullptr) {} - - explicit TFE_OpAttrs(const tensorflow::AttrBuilder* value, - const char* op_name) - : name(op_name), attributes(value) {} - - const char* name; - const tensorflow::AttrBuilder* attributes; -}; - #endif // TENSORFLOW_C_EAGER_C_API_INTERNAL_H_ diff --git a/tensorflow/c/eager/c_api_remote_test.cc b/tensorflow/c/eager/c_api_remote_test.cc index 91d19280c4c..93d830d2c90 100644 --- a/tensorflow/c/eager/c_api_remote_test.cc +++ b/tensorflow/c/eager/c_api_remote_test.cc @@ -17,12 +17,18 @@ limitations under the License. #include "tensorflow/c/eager/c_api_experimental.h" #include "tensorflow/c/eager/c_api_internal.h" #include "tensorflow/c/eager/c_api_test_util.h" +#include "tensorflow/c/eager/tfe_tensorhandle_internal.h" #include "tensorflow/core/common_runtime/eager/eager_operation.h" +#include "tensorflow/core/common_runtime/function_optimization_registry.h" #include "tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/graph/graph.h" #include "tensorflow/core/platform/casts.h" +#include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/protobuf.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/protobuf/cluster.pb.h" +#include "tensorflow/core/protobuf/config.pb.h" #include "tensorflow/core/protobuf/tensorflow_server.pb.h" namespace { @@ -129,7 +135,49 @@ void TestRemoteExecute(bool async) { TEST(CAPI, RemoteExecute) { TestRemoteExecute(false); } TEST(CAPI, RemoteExecuteAsync) { TestRemoteExecute(true); } -void TestRemoteExecuteSilentCopies(bool async, bool remote) { +string MatMulFunction() { + tensorflow::FunctionDef def; + CHECK(tensorflow::protobuf::TextFormat::ParseFromString( + " signature {" + " name: 'MatMulFunction'" + " input_arg {" + " name: 'a'" + " type: DT_FLOAT" + " }" + " input_arg {" + " name: 'b'" + " type: DT_FLOAT" + " }" + " output_arg {" + " name: 'm'" + " type: DT_FLOAT" + " }" + " }" + " node_def {" + " name: 'matmul'" + " op: 'MatMul'" + " input: 'a'" + " input: 'b'" + " attr {" + " key: 'T'" + " value {" + " type: DT_FLOAT" + " }" + " }" + " }" + " ret {" + " key: 'm'" + " value: 'matmul:product'" + " }", + &def)); + return def.SerializeAsString(); +} + +// If heavy_load_on_streaming_rpc is true, send some rpc reqeusts before the one +// which creates a remote remote input, to simulate a scenario that the remote +// input is not ready when we start running an op or a function. 
+void TestRemoteExecuteSilentCopies(bool async, bool remote, bool func, + bool heavy_load_on_streaming_rpc) { tensorflow::ServerDef server_def = GetServerDef(3); // This server def has the task index set to 0. @@ -154,48 +202,87 @@ void TestRemoteExecuteSilentCopies(bool async, bool remote) { TFE_ContextOptionsSetAsync(opts, static_cast(async)); TFE_ContextOptionsSetDevicePlacementPolicy(opts, TFE_DEVICE_PLACEMENT_SILENT); TFE_Context* ctx = TFE_NewContext(opts, status); - EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); TFE_DeleteContextOptions(opts); TFE_ContextSetServerDef(ctx, 0, serialized.data(), serialized.size(), status); - EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); TFE_TensorHandle* h0_task0 = TestMatrixTensorHandle(ctx); TFE_TensorHandle* h1_task0 = TestMatrixTensorHandle(ctx); + std::vector handles_task0; + if (heavy_load_on_streaming_rpc) { + // Send 50 tensor copy requests to simulate that there have been some RPC + // requests been enqueued. + for (int i = 0; i < 50; ++i) { + handles_task0.push_back(TestMatrixTensorHandle(ctx)); + } + } const char task1_name[] = "/job:localhost/replica:0/task:1/device:CPU:0"; const char task2_name[] = "/job:localhost/replica:0/task:2/device:CPU:0"; + std::vector handles_task2; + for (auto* h_task0 : handles_task0) { + handles_task2.push_back( + TFE_TensorHandleCopyToDevice(h_task0, ctx, task2_name, status)); + ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + } + auto* h1_task2 = TFE_TensorHandleCopyToDevice(h1_task0, ctx, task2_name, status); - ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); - // Handles are on task0 (local), and task2, but op is on task1. - TFE_Op* matmul = MatMulOp(ctx, h0_task0, h1_task2); + TFE_Op* matmul = nullptr; + if (func) { + string function_def = MatMulFunction(); + TFE_ContextAddFunctionDef(ctx, function_def.data(), function_def.size(), + status); + CHECK_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + + matmul = TFE_NewOp(ctx, "MatMulFunction", status); + ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + TFE_OpAddInput(matmul, h0_task0, status); + ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + TFE_OpAddInput(matmul, h1_task2, status); + ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + } else { + // Handles are on task0 (local), and task2, but op is on task1. + matmul = MatMulOp(ctx, h0_task0, h1_task2); + } if (remote) { TFE_OpSetDevice(matmul, task1_name, status); + ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + } else if (!async) { + // Set the local device to CPU to easily validate mirroring + string cpu_device_name; + ASSERT_TRUE(GetDeviceName(ctx, &cpu_device_name, "CPU")); + TFE_OpSetDevice(matmul, cpu_device_name.c_str(), status); + EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + auto remote_arg = + tensorflow::TensorHandleFromInterface(tensorflow::unwrap(h1_task2)); + // The input handles should never change since they have been mirrored. 
+ ASSERT_FALSE(remote_arg->HasLocalMirror(nullptr)); } - EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); TFE_TensorHandle* retvals[1]; int num_retvals = 1; TFE_Execute(matmul, &retvals[0], &num_retvals, status); - EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); // TODO(gjn): Add support for waiting on async local mirrors - if (!async) { - auto remote_arg = tensorflow::TensorHandleFromInterface(h1_task2->handle); - tensorflow::EagerOperation* op = - tensorflow::OperationFromInterface(matmul->operation); + if (!remote && !async) { + auto remote_arg = + tensorflow::TensorHandleFromInterface(tensorflow::unwrap(h1_task2)); // The input handles should never change since they have been mirrored. - ASSERT_EQ(op->Inputs()[1], remote_arg); + ASSERT_TRUE(remote_arg->HasLocalMirror(nullptr)); } auto* retval_task0 = TFE_TensorHandleCopyToDevice( retvals[0], ctx, "/job:localhost/replica:0/task:0/device:CPU:0", status); - ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); TF_Tensor* t = TFE_TensorHandleResolve(retval_task0, status); - ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); TFE_DeleteTensorHandle(retval_task0); float product[4] = {0}; EXPECT_EQ(sizeof(product), TF_TensorByteSize(t)); @@ -210,13 +297,22 @@ void TestRemoteExecuteSilentCopies(bool async, bool remote) { TFE_DeleteTensorHandle(h1_task0); TFE_DeleteTensorHandle(h1_task2); TFE_DeleteTensorHandle(retvals[0]); + for (auto* h : handles_task0) { + TFE_DeleteTensorHandle(h); + } + for (auto* h : handles_task2) { + TFE_DeleteTensorHandle(h); + } TFE_DeleteOp(matmul); TFE_Executor* executor = TFE_ContextGetExecutorForThread(ctx); TFE_ExecutorWaitForAllPendingNodes(executor, status); - ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); TFE_DeleteExecutor(executor); + if (func) { + TFE_ContextRemoveFunction(ctx, "MatMulFunction", status); + } TFE_DeleteContext(ctx); TF_DeleteStatus(status); @@ -227,16 +323,435 @@ void TestRemoteExecuteSilentCopies(bool async, bool remote) { } TEST(CAPI, RemoteExecuteSilentCopies) { - TestRemoteExecuteSilentCopies(false, true); + TestRemoteExecuteSilentCopies(/*async=*/false, /*remote=*/true, + /*func=*/false, + /*heavy_load_on_streaming_rpc=*/false); } TEST(CAPI, RemoteExecuteSilentCopiesAsync) { - TestRemoteExecuteSilentCopies(true, true); + TestRemoteExecuteSilentCopies(/*async=*/true, /*remote=*/true, /*func=*/false, + /*heavy_load_on_streaming_rpc=*/false); +} +TEST(CAPI, RemoteExecuteSilentCopiesAsyncFunc) { + TestRemoteExecuteSilentCopies(/*async=*/true, /*remote=*/true, /*func=*/true, + /*heavy_load_on_streaming_rpc=*/false); } TEST(CAPI, RemoteExecuteSilentCopiesLocal) { - TestRemoteExecuteSilentCopies(false, false); + TestRemoteExecuteSilentCopies(/*async=*/false, /*remote=*/false, + /*func=*/false, + /*heavy_load_on_streaming_rpc=*/false); } TEST(CAPI, RemoteExecuteSilentCopiesLocalAsync) { - TestRemoteExecuteSilentCopies(true, false); + TestRemoteExecuteSilentCopies(/*async=*/true, /*remote=*/false, + /*func=*/false, + /*heavy_load_on_streaming_rpc=*/false); +} +TEST(CAPI, RemoteExecuteSilentCopiesLocalAsyncFunc) { + TestRemoteExecuteSilentCopies(/*async=*/true, /*remote=*/false, /*func=*/true, + /*heavy_load_on_streaming_rpc=*/false); +} +TEST(CAPI, RemoteExecuteSilentCopiesLocalAsyncFuncOrdering) { + // 
A remote input may be not ready when we start running a function. Test that + // the function execution should wait until the remote input is ready. + TestRemoteExecuteSilentCopies(/*async=*/true, /*remote=*/false, /*func=*/true, + /*heavy_load_on_streaming_rpc=*/true); +} + +// Add the values of three variables on three different tasks. +string AddVariablesFunction() { + tensorflow::FunctionDef def; + CHECK(tensorflow::protobuf::TextFormat::ParseFromString( + " signature {" + " name: 'AddVariablesFunction'" + " input_arg {" + " name: 'var'" + " type: DT_RESOURCE" + " }" + " output_arg {" + " name: 'sum'" + " type: DT_FLOAT" + " }" + " }" + " node_def {" + " name: 'read0'" + " op: 'ReadVariableOp'" + " input: 'var'" + " device: '/job:localhost/replica:0/task:0/device:CPU:0'" + " attr {" + " key: 'dtype'" + " value {" + " type: DT_FLOAT" + " }" + " }" + " }" + " node_def {" + " name: 'read1'" + " op: 'ReadVariableOp'" + " input: 'var'" + " device: '/job:localhost/replica:0/task:1/device:CPU:0'" + " attr {" + " key: 'dtype'" + " value {" + " type: DT_FLOAT" + " }" + " }" + " }" + " node_def {" + " name: 'read2'" + " op: 'ReadVariableOp'" + " input: 'var'" + " device: '/job:localhost/replica:0/task:2/device:CPU:0'" + " attr {" + " key: 'dtype'" + " value {" + " type: DT_FLOAT" + " }" + " }" + " }" + " node_def {" + " name: 'add1'" + " op: 'Add'" + " input: 'read0:value:0'" + " input: 'read1:value:0'" + " attr {" + " key: 'T'" + " value {" + " type: DT_FLOAT" + " }" + " }" + " }" + " node_def {" + " name: 'add2'" + " op: 'Add'" + " input: 'add1:z:0'" + " input: 'read2:value:0'" + " attr {" + " key: 'T'" + " value {" + " type: DT_FLOAT" + " }" + " }" + " }" + " ret {" + " key: 'sum'" + " value: 'add2:z:0'" + " }", + &def)); + return def.SerializeAsString(); +} + +void VarIsInitialized(TFE_Context* ctx, TFE_TensorHandle* var_handle) { + TF_Status* status = TF_NewStatus(); + TFE_Op* op = TFE_NewOp(ctx, "VarIsInitializedOp", status); + EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + TFE_OpAddInput(op, var_handle, status); + TFE_TensorHandle* is_initialized[1] = {nullptr}; + int num_retvals = 1; + TFE_Execute(op, &is_initialized[0], &num_retvals, status); + CHECK_EQ(1, num_retvals); + TF_Tensor* t = TFE_TensorHandleResolve(is_initialized[0], status); + bool initialized = false; + memcpy(&initialized, TF_TensorData(t), TF_TensorByteSize(t)); + EXPECT_EQ(initialized, true); + TF_DeleteTensor(t); + TFE_DeleteTensorHandle(is_initialized[0]); + TFE_DeleteOp(op); + delete status; +} + +void TestFunctionWithPackedInput(const bool remote) { + tensorflow::ServerDef server_def = GetServerDef(3); + + // This server def has the task index set to 0. 
+ string serialized = server_def.SerializeAsString(); + + server_def.set_task_index(1); + std::unique_ptr worker_server1; + ASSERT_TRUE(tensorflow::GrpcServer::Create( + server_def, tensorflow::Env::Default(), &worker_server1) + .ok()); + ASSERT_TRUE(worker_server1->Start().ok()); + + server_def.set_task_index(2); + std::unique_ptr worker_server2; + ASSERT_TRUE(tensorflow::GrpcServer::Create( + server_def, tensorflow::Env::Default(), &worker_server2) + .ok()); + ASSERT_TRUE(worker_server2->Start().ok()); + + TF_Status* status = TF_NewStatus(); + TFE_ContextOptions* opts = TFE_NewContextOptions(); + TFE_ContextOptionsSetAsync(opts, static_cast(/*enable=*/true)); + TFE_ContextOptionsSetDevicePlacementPolicy(opts, TFE_DEVICE_PLACEMENT_SILENT); + TFE_Context* ctx = TFE_NewContext(opts, status); + EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + TFE_DeleteContextOptions(opts); + + TFE_ContextSetServerDef(ctx, 0, serialized.data(), serialized.size(), status); + EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + + const char task0_name[] = "/job:localhost/replica:0/task:0/device:CPU:0"; + const char task1_name[] = "/job:localhost/replica:0/task:1/device:CPU:0"; + const char task2_name[] = "/job:localhost/replica:0/task:2/device:CPU:0"; + + // Create one variable per task. + TFE_TensorHandle* h0 = TestVariable(ctx, 1.0, task0_name); + TFE_TensorHandle* h1 = TestVariable(ctx, 2.0, task1_name); + TFE_TensorHandle* h2 = TestVariable(ctx, 3.0, task2_name); + + // Add a sync point in order to make sure that variables have been initialized + // before the function execution starts. + // TODO(b/155789951): Remove once b/155789951 is fixed. + VarIsInitialized(ctx, h1); + VarIsInitialized(ctx, h2); + + // Pack 3 variable handles into one TFE_TensorHandle. + int num_replicas = 3; + std::vector handles = {h0, h1, h2}; + TFE_TensorHandle* packed_handle = + TFE_CreatePackedTensorHandle(ctx, handles.data(), &num_replicas, status); + ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + EXPECT_EQ(TFE_TensorHandleDataType(packed_handle), TF_RESOURCE); + EXPECT_EQ(TFE_TensorHandleNumDims(packed_handle, status), 0); + EXPECT_EQ(TFE_TensorHandleNumElements(packed_handle, status), 1); + + const string composite_device_name = + "/job:localhost/replica:0/task:0/device:COMPOSITE:0"; + EXPECT_EQ(TFE_TensorHandleDeviceName(packed_handle, status), + composite_device_name); + EXPECT_EQ(TFE_TensorHandleBackingDeviceName(packed_handle, status), + composite_device_name); + ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + + // Register and run a function which returns the sum of 3 variables. 
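+  // Each replica of the packed handle resolves to a different per-task
+  // variable (1.0, 2.0 and 3.0), so the expected result below is 6.0.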
+ const string function_def = AddVariablesFunction(); + TFE_ContextAddFunctionDef(ctx, function_def.data(), function_def.size(), + status); + ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + + TFE_Op* func = TFE_NewOp(ctx, "AddVariablesFunction", status); + ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + TFE_OpAddInput(func, packed_handle, status); + ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + if (remote) { + TFE_OpSetDevice(func, task1_name, status); + ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + } + + TFE_TensorHandle* retvals[1] = {nullptr}; + int num_retvals = 1; + TFE_Execute(func, &retvals[0], &num_retvals, status); + EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + ASSERT_EQ(1, num_retvals); + TFE_DeleteOp(func); + TFE_DeleteTensorHandle(packed_handle); + TF_Tensor* t = TFE_TensorHandleResolve(retvals[0], status); + ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + TFE_DeleteTensorHandle(retvals[0]); + float sum = 0; + EXPECT_EQ(sizeof(sum), TF_TensorByteSize(t)); + memcpy(&sum, TF_TensorData(t), TF_TensorByteSize(t)); + TF_DeleteTensor(t); + EXPECT_EQ(sum, 6.0); + + TFE_DeleteTensorHandle(h0); + TFE_DeleteTensorHandle(h1); + TFE_DeleteTensorHandle(h2); + + TFE_Executor* executor = TFE_ContextGetExecutorForThread(ctx); + TFE_ExecutorWaitForAllPendingNodes(executor, status); + ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + TFE_DeleteExecutor(executor); + TFE_ContextRemoveFunction(ctx, "AddVariablesFunction", status); + TFE_DeleteContext(ctx); + + TF_DeleteStatus(status); + + // TODO(b/136478427): Figure out how to correctly shut the server down. + worker_server1.release(); + worker_server2.release(); +} + +TEST(CAPI, TestLocalFunctionWithPackedInput) { + TestFunctionWithPackedInput(/*remote=*/false); +} + +TEST(CAPI, TestRemoteFunctionWithPackedInput) { + TestFunctionWithPackedInput(/*remote=*/true); +} + +string VariableAddFunction() { + tensorflow::FunctionDef def; + CHECK(tensorflow::protobuf::TextFormat::ParseFromString( + " signature {" + " name: 'VariableAddFunction'" + " input_arg {" + " name: 'var0'" + " type: DT_RESOURCE" + " }" + " output_arg {" + " name: 'var0_value'" + " type: DT_FLOAT" + " }" + " }" + " node_def {" + " name: 'read0'" + " op: 'ReadVariableOp'" + " input: 'var0'" + " attr {" + " key: 'dtype'" + " value {" + " type: DT_FLOAT" + " }" + " }" + " }" + " node_def {" + " name: 'add'" + " op: 'Add'" + " input: 'read0:value:0'" + " input: 'read0:value:0'" + " device: '/job:localhost/task:1/device:CPU:0'" + " attr {" + " key: 'T'" + " value {" + " type: DT_FLOAT" + " }" + " }" + " }" + " node_def {" + " name: 'identity'" + " op: 'Identity'" + " input: 'add:z:0'" + " device: '/job:localhost/task:0/device:CPU:0'" + " attr {" + " key: 'T'" + " value {" + " type: DT_FLOAT" + " }" + " }" + " }" + " ret {" + " key: 'var0_value'" + " value: 'identity:output:0'" + " }", + &def)); + return def.SerializeAsString(); +} + +class FunctionErrorInjectionPass : public tensorflow::FunctionOptimizationPass { + public: + FunctionErrorInjectionPass(string error_node, string error_device) + : error_node_(error_node), error_device_(error_device) {} + tensorflow::Status Run(const tensorflow::DeviceSet& device_set, + const tensorflow::ConfigProto& config_proto, + std::unique_ptr* graph, + tensorflow::FunctionLibraryDefinition* flib_def, + std::vector* control_ret_node_names, + bool* control_rets_updated) override { + // Inject failure to function instantiation if finding a node that 
contains + // the given node name (error_node_) and requested device (error_device_). + for (const auto node : graph->get()->nodes()) { + if (node->name().find(error_node_) != string::npos && + node->requested_device() == error_device_) { + return tensorflow::errors::Internal("Injected graph pass error."); + } + } + return tensorflow::Status::OK(); + } + + private: + const string error_node_; + const string error_device_; +}; + +void TestDistributedFunctionCancellation(bool inject_error) { + tensorflow::ServerDef server_def = GetServerDef(3); + // This server def has the task index set to 0. + string serialized = server_def.SerializeAsString(); + + server_def.set_task_index(1); + std::unique_ptr worker_server1; + ASSERT_TRUE(tensorflow::GrpcServer::Create( + server_def, tensorflow::Env::Default(), &worker_server1) + .ok()); + ASSERT_TRUE(worker_server1->Start().ok()); + server_def.set_task_index(2); + std::unique_ptr worker_server2; + ASSERT_TRUE(tensorflow::GrpcServer::Create( + server_def, tensorflow::Env::Default(), &worker_server2) + .ok()); + ASSERT_TRUE(worker_server2->Start().ok()); + const char dev2_name[] = "/job:localhost/replica:0/task:2/device:CPU:0"; + + if (inject_error) { + // Inject a function optimization pass failure when it sees the 'read0' op + // having a requested device `dev2_name`. During execution: + // * task:0 processes the main function `VariableAddFunction` and places + // the read0 op on task:2 + // * task:0 partitions the main function with a subgraph containing read0 + // sent to task:2 + // * task:2 graph pass reports an error when it sees read0 with dev2_name + tensorflow::function_optimization_registration:: + FunctionOptimizationPassRegistration register_test_pass( + std::make_unique("read0", dev2_name)); + } + + TF_Status* status = TF_NewStatus(); + TFE_ContextOptions* opts = TFE_NewContextOptions(); + TFE_ContextOptionsSetDevicePlacementPolicy(opts, TFE_DEVICE_PLACEMENT_SILENT); + TFE_Context* ctx = TFE_NewContext(opts, status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteContextOptions(opts); + + TFE_ContextSetServerDef(ctx, 0, serialized.data(), serialized.size(), status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + TFE_TensorHandle* var_handle = TestVariable(ctx, 2.0, dev2_name); + EXPECT_NE(var_handle, nullptr); + + const string function_def = VariableAddFunction(); + TFE_ContextAddFunctionDef(ctx, function_def.data(), function_def.size(), + status); + ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + + TFE_Op* func = TFE_NewOp(ctx, "VariableAddFunction", status); + ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + TFE_OpAddInput(func, var_handle, status); + ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + TFE_TensorHandle* retvals[1] = {nullptr}; + int num_retvals = 1; + TFE_Execute(func, &retvals[0], &num_retvals, status); + + if (inject_error) { + ASSERT_EQ(TF_INTERNAL, TF_GetCode(status)) << TF_Message(status); + } else { + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + ASSERT_EQ(1, num_retvals); + TF_Tensor* t = TFE_TensorHandleResolve(retvals[0], status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteTensorHandle(retvals[0]); + float sum = 0; + ASSERT_EQ(sizeof(sum), TF_TensorByteSize(t)); + memcpy(&sum, TF_TensorData(t), TF_TensorByteSize(t)); + TF_DeleteTensor(t); + ASSERT_EQ(sum, 4.0); + } + + TFE_DeleteOp(func); + TFE_DeleteTensorHandle(var_handle); + TFE_DeleteContext(ctx); + TF_DeleteStatus(status); + + // 
TODO(b/136478427): Figure out how to correctly shut the server down. + worker_server1.release(); + worker_server2.release(); +} + +TEST(CAPI, DistributedFunctionNoError) { + TestDistributedFunctionCancellation(false); +} + +TEST(CAPI, DistributedFunctionCancelledOnError) { + TestDistributedFunctionCancellation(true); } void TestRemoteExecuteDeleteContextWithOutstandingRPC(bool async) { @@ -309,150 +824,4 @@ TEST(CAPI, RemoteExecuteDeleteContextWithOutstandingRPC) { TEST(CAPI, RemoteExecuteDeleteContextWithOutstandingRPCAsync) { TestRemoteExecuteDeleteContextWithOutstandingRPC(true); } - -void CheckTFE_TensorHandleHasFloats(TFE_TensorHandle* handle, - const std::vector& expected_values) { - std::unique_ptr status( - TF_NewStatus(), TF_DeleteStatus); - TF_Tensor* t = TFE_TensorHandleResolve(handle, status.get()); - ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); - std::unique_ptr actual_values(new float[expected_values.size()]); - EXPECT_EQ(sizeof(float) * expected_values.size(), TF_TensorByteSize(t)); - memcpy(actual_values.get(), TF_TensorData(t), TF_TensorByteSize(t)); - TF_DeleteTensor(t); - - for (int i = 0; i < expected_values.size(); i++) { - EXPECT_EQ(expected_values[i], actual_values[i]) - << "Mismatch in expected values at (zero-based) index " << i; - } -} - -void CheckRemoteMatMulExecutesOK(TFE_Context* ctx, - const char* remote_device_name, - const char* local_device_name) { - TF_Status* status = TF_NewStatus(); - TFE_TensorHandle* h0_task0 = TestMatrixTensorHandle(ctx); - - TFE_Op* matmul = MatMulOp(ctx, h0_task0, h0_task0); - TFE_OpSetDevice(matmul, remote_device_name, status); - EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - - TFE_TensorHandle* retvals[1]; - int num_retvals = 1; - TFE_Execute(matmul, &retvals[0], &num_retvals, status); - EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - - auto* retval_task0 = - TFE_TensorHandleCopyToDevice(retvals[0], ctx, local_device_name, status); - ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - - CheckTFE_TensorHandleHasFloats(retval_task0, {7, 10, 15, 22}); - - TFE_DeleteTensorHandle(retval_task0); - TFE_DeleteTensorHandle(h0_task0); - TFE_DeleteTensorHandle(retvals[0]); - - TFE_DeleteOp(matmul); - - TFE_Executor* executor = TFE_ContextGetExecutorForThread(ctx); - TFE_ExecutorWaitForAllPendingNodes(executor, status); - ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - TFE_DeleteExecutor(executor); - TF_DeleteStatus(status); -} - -void TestRemoteExecuteChangeServerDef(bool async) { - tensorflow::ServerDef server_def = GetServerDef(2); - - // This server def has the task index set to 0. 
- string serialized = server_def.SerializeAsString(); - - server_def.set_task_index(1); - - std::unique_ptr worker_server; - ASSERT_TRUE(tensorflow::GrpcServer::Create( - server_def, tensorflow::Env::Default(), &worker_server) - .ok()); - ASSERT_TRUE(worker_server->Start().ok()); - - TF_Status* status = TF_NewStatus(); - TFE_ContextOptions* opts = TFE_NewContextOptions(); - TFE_ContextOptionsSetAsync(opts, static_cast(async)); - TFE_ContextOptionsSetDevicePlacementPolicy(opts, TFE_DEVICE_PLACEMENT_SILENT); - TFE_Context* ctx = TFE_NewContext(opts, status); - EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - TFE_DeleteContextOptions(opts); - - TFE_ContextSetServerDef(ctx, 0, serialized.data(), serialized.size(), status); - EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - - const char remote_device_name[] = - "/job:localhost/replica:0/task:1/device:CPU:0"; - const char local_device_name[] = - "/job:localhost/replica:0/task:0/device:CPU:0"; - CheckRemoteMatMulExecutesOK(ctx, remote_device_name, local_device_name); - - TFE_Executor* executor = TFE_ContextGetExecutorForThread(ctx); - TFE_ExecutorWaitForAllPendingNodes(executor, status); - ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - - // TODO(b/136478427): Figure out how to correctly shut the server down. - worker_server.release(); - - // Update the server def with a new set of names (worker instead of - // localhost). - tensorflow::ServerDef updated_server_def = GetServerDef("worker", 2); - serialized = updated_server_def.SerializeAsString(); - - updated_server_def.set_task_index(1); - tensorflow::Status s = tensorflow::GrpcServer::Create( - updated_server_def, tensorflow::Env::Default(), &worker_server); - ASSERT_TRUE(s.ok()) << s.error_message(); - ASSERT_TRUE(worker_server->Start().ok()); - - TFE_ContextSetServerDef(ctx, 0, serialized.data(), serialized.size(), status); - EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - - // Create a new tensor_handle. - TFE_TensorHandle* h0_task0_new = TestMatrixTensorHandle(ctx); - - // Check that copying it to the old remote device (named localhost) fails. - TFE_TensorHandleCopyToDevice(h0_task0_new, ctx, remote_device_name, status); - EXPECT_NE(TF_OK, TF_GetCode(status)) << TF_Message(status); - - // Copying and executing on the new remote device works. - const char new_remote_device_name[] = - "/job:worker/replica:0/task:1/device:CPU:0"; - const char new_local_device_name[] = - "/job:worker/replica:0/task:0/device:CPU:0"; - - auto* h0_task1_new = TFE_TensorHandleCopyToDevice( - h0_task0_new, ctx, new_remote_device_name, status); - EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - - TFE_DeleteTensorHandle(h0_task0_new); - TFE_DeleteTensorHandle(h0_task1_new); - - CheckRemoteMatMulExecutesOK(ctx, new_remote_device_name, - new_local_device_name); - - TFE_ExecutorWaitForAllPendingNodes(executor, status); - ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - TFE_DeleteExecutor(executor); - - TF_DeleteStatus(status); - - TFE_DeleteContext(ctx); - - // TODO(b/136478427): Figure out how to correctly shut the server down. 
- worker_server.release(); -} - -TEST(CAPI, RemoteExecuteChangeServerDef) { - TestRemoteExecuteChangeServerDef(false); -} -TEST(CAPI, RemoteExecuteChangeServerDefAsync) { - TestRemoteExecuteChangeServerDef(true); -} - } // namespace diff --git a/tensorflow/c/eager/c_api_test.cc b/tensorflow/c/eager/c_api_test.cc index e61cf7ef040..724176505ba 100644 --- a/tensorflow/c/eager/c_api_test.cc +++ b/tensorflow/c/eager/c_api_test.cc @@ -27,6 +27,8 @@ limitations under the License. #include "tensorflow/c/eager/c_api_experimental.h" #include "tensorflow/c/eager/c_api_internal.h" #include "tensorflow/c/eager/c_api_test_util.h" +#include "tensorflow/c/eager/tfe_op_internal.h" +#include "tensorflow/c/eager/tfe_tensorhandle_internal.h" #include "tensorflow/core/common_runtime/eager/eager_operation.h" #include "tensorflow/core/common_runtime/eager/tensor_handle.h" #include "tensorflow/core/framework/function.pb.h" @@ -78,11 +80,18 @@ void BM_Execute(int iters, int async) { TFE_DeleteContextOptions(opts); TFE_TensorHandle* m = TestMatrixTensorHandle(ctx); - TFE_Op* matmul = MatMulOp(ctx, m, m); + TFE_Op* matmul = TFE_NewOp(ctx, "MatMul", status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); TFE_TensorHandle* retvals[1]; int num_retvals = 1; tensorflow::testing::StartTiming(); for (int i = 0; i < iters; ++i) { + TFE_OpReset(matmul, "MatMul", nullptr, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_OpAddInput(matmul, m, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_OpAddInput(matmul, m, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); TFE_Execute(matmul, &retvals[0], &num_retvals, status); CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); } @@ -113,11 +122,15 @@ void BM_Execute_Identity(int iters, int async) { TFE_DeleteContextOptions(opts); TFE_TensorHandle* m = TestMatrixTensorHandle(ctx); - TFE_Op* identity = IdentityOp(ctx, m); + TFE_Op* identity = TFE_NewOp(ctx, "Identity", status); TFE_TensorHandle* retvals[1]; int num_retvals = 1; tensorflow::testing::StartTiming(); for (int i = 0; i < iters; ++i) { + TFE_OpReset(identity, "Identity", nullptr, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_OpAddInput(identity, m, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); TFE_Execute(identity, &retvals[0], &num_retvals, status); CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); } @@ -405,6 +418,13 @@ void TensorHandleSilentCopy(bool async, hcpu, ctx, gpu_device_name.c_str(), status.get()); ASSERT_EQ(TF_GetCode(status.get()), TF_OK) << TF_Message(status.get()); + auto cpu_arg = + tensorflow::TensorHandleFromInterface(tensorflow::unwrap(hcpu)); + auto gpu_arg = + tensorflow::TensorHandleFromInterface(tensorflow::unwrap(hgpu)); + auto gpu_device = absl::get(gpu_arg->device()); + ASSERT_FALSE(cpu_arg->HasLocalMirror(gpu_device)); + TFE_Op* matmul = MatMulOp(ctx, hcpu, hgpu); if (cpu_op) { string cpu_device_name; @@ -420,15 +440,8 @@ void TensorHandleSilentCopy(bool async, TFE_Execute(matmul, &retvals[0], &num_retvals, status.get()); ASSERT_EQ(TF_GetCode(status.get()), TF_OK) << TF_Message(status.get()); - // Validate if the input was replaced with a different TensorHandle - auto arg0 = tensorflow::TensorHandleFromInterface(hcpu->handle); - auto arg1 = tensorflow::TensorHandleFromInterface(hgpu->handle); - tensorflow::EagerOperation* op = - tensorflow::OperationFromInterface(matmul->operation); - - // The input handles should never change since they have 
been mirrored. - EXPECT_EQ(op->Inputs()[0], arg0); - EXPECT_EQ(op->Inputs()[1], arg1); + // The CPU handle should have been copied and have a mirror on the GPU + ASSERT_TRUE(cpu_arg->HasLocalMirror(gpu_device)); TFE_DeleteOp(matmul); TFE_DeleteTensorHandle(retvals[0]); @@ -626,17 +639,6 @@ void ExecuteAdd(bool async, bool forward_input, bool tfrt) { } int num_retvals = 1; - - if (async) { - // Enqueue dummy ops so we backlog async execution & actually test async. - for (int i = 0; i < 10000; ++i) { - TFE_TensorHandle* dummy = nullptr; - TFE_Execute(add_op, &dummy, &num_retvals, status); - ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - TFE_DeleteTensorHandle(dummy); - } - } - TFE_TensorHandle* retval = nullptr; TFE_Execute(add_op, &retval, &num_retvals, status); EXPECT_EQ(1, num_retvals); @@ -1130,51 +1132,6 @@ void BM_ExecuteFunction(int iters, int async) { } BENCHMARK(BM_ExecuteFunction)->Arg(0)->Arg(1); -TFE_TensorHandle* CreateVariable(TFE_Context* ctx, float value, - TF_Status* status) { - // Create the variable handle. - TFE_Op* op = TFE_NewOp(ctx, "VarHandleOp", status); - if (TF_GetCode(status) != TF_OK) return nullptr; - TFE_OpSetAttrType(op, "dtype", TF_FLOAT); - TFE_OpSetAttrShape(op, "shape", {}, 0, status); - TFE_OpSetAttrString(op, "container", "", 0); - TFE_OpSetAttrString(op, "shared_name", "", 0); - if (TF_GetCode(status) != TF_OK) return nullptr; - TFE_TensorHandle* var_handle = nullptr; - int num_retvals = 1; - TFE_Execute(op, &var_handle, &num_retvals, status); - TFE_DeleteOp(op); - if (TF_GetCode(status) != TF_OK) return nullptr; - CHECK_EQ(1, num_retvals); - - // Assign 'value' to it. - op = TFE_NewOp(ctx, "AssignVariableOp", status); - if (TF_GetCode(status) != TF_OK) return nullptr; - TFE_OpSetAttrType(op, "dtype", TF_FLOAT); - TFE_OpAddInput(op, var_handle, status); - - // Convert 'value' to a TF_Tensor then a TFE_TensorHandle. - std::unique_ptr t( - TF_AllocateTensor(TF_FLOAT, nullptr, 0, sizeof(value)), TF_DeleteTensor); - memcpy(TF_TensorData(t.get()), &value, TF_TensorByteSize(t.get())); - - std::unique_ptr - value_handle(TFE_NewTensorHandle(t.get(), status), - TFE_DeleteTensorHandle); - if (TF_GetCode(status) != TF_OK) return nullptr; - - TFE_OpAddInput(op, value_handle.get(), status); - if (TF_GetCode(status) != TF_OK) return nullptr; - - num_retvals = 0; - TFE_Execute(op, nullptr, &num_retvals, status); - TFE_DeleteOp(op); - if (TF_GetCode(status) != TF_OK) return nullptr; - CHECK_EQ(0, num_retvals); - - return var_handle; -} - TEST(CAPI, Variables) { // Variables use resource handles, so this is really a test for resource // tensor handling. 
@@ -1184,7 +1141,7 @@ TEST(CAPI, Variables) { ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); TFE_DeleteContextOptions(opts); - TFE_TensorHandle* var_handle = CreateVariable(ctx, 12.0, status); + TFE_TensorHandle* var_handle = TestVariable(ctx, 12.0); ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); TFE_Op* op = TFE_NewOp(ctx, "ReadVariableOp", status); @@ -1225,7 +1182,7 @@ void BM_ReadVariable(int iters) { CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); TFE_DeleteContextOptions(opts); - TFE_TensorHandle* var_handle = CreateVariable(ctx, 5.0, status); + TFE_TensorHandle* var_handle = TestVariable(ctx, 5.0); CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); TFE_Op* op = TFE_NewOp(ctx, "ReadVariableOp", status); @@ -1246,6 +1203,8 @@ void BM_ReadVariable(int iters) { CHECK_EQ(0, TFE_TensorHandleNumDims(h, status)); CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); h = nullptr; + TFE_OpAddInput(op, var_handle, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); } tensorflow::testing::StopTiming(); TFE_DeleteOp(op); @@ -1348,7 +1307,7 @@ TEST(CAPI, TestTFE_TensorHandleCopySharingUnderlyingTensorHandle) { tensorflow::AttrValueMap ExtractAttrs(TFE_Op* op) { tensorflow::AttrValueMap attr_values; tensorflow::EagerOperation* operation = - tensorflow::OperationFromInterface(op->operation); + tensorflow::OperationFromInterface(tensorflow::unwrap(op)); operation->Attrs().FillAttrValueMap(&attr_values); return attr_values; } @@ -1484,10 +1443,10 @@ TEST(CAPI, TestTFE_OpAttrsInferenceDisabledWhenNotCallingOpAddInputList) { TFE_TensorHandle* inputs[] = {input1, input2}; TFE_OpAddInput(concatOp, dim, status); CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - CHECK(concatOp->operation->OpDef()); + CHECK(tensorflow::unwrap(concatOp)->OpDef()); TFE_OpAddInput(concatOp, inputs[0], status); CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - EXPECT_FALSE(concatOp->operation->OpDef()) + EXPECT_FALSE(tensorflow::unwrap(concatOp)->OpDef()) << "Inference context is still present"; TFE_OpAddInput(concatOp, inputs[1], status); CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); @@ -1579,7 +1538,7 @@ TEST(CAPI, TestTFE_OpGetInputAndOutputLengthsFailForUnknownArguments) { TFE_DeleteContext(ctx); } -TEST(CAPI, TestTFE_OpGetAttrs) { +TEST(CAPI, TestTFE_OpAddAttrs) { TF_Status* status = TF_NewStatus(); TFE_ContextOptions* opts = TFE_NewContextOptions(); TFE_Context* ctx = TFE_NewContext(opts, status); @@ -1589,12 +1548,11 @@ TEST(CAPI, TestTFE_OpGetAttrs) { TFE_Op* var_op = TFE_NewOp(ctx, "VarHandleOp", status); TFE_OpSetAttrType(var_op, "dtype", TF_INT64); TFE_OpSetAttrShape(var_op, "shape", {}, 0, status); - TFE_OpAttrs attributes; - TFE_OpGetAttrs(var_op, &attributes); + const TFE_OpAttrs* attributes = TFE_OpGetAttrs(var_op); TFE_Op* copy_op = TFE_NewOp(ctx, "VarHandleOp", status); TFE_OpSetAttrType(copy_op, "dtype", TF_FLOAT); - TFE_OpAddAttrs(copy_op, &attributes); + TFE_OpAddAttrs(copy_op, attributes); unsigned char is_list = 0; ASSERT_EQ(TF_ATTR_TYPE, TFE_OpGetAttrType(copy_op, "dtype", &is_list, status)); @@ -1605,7 +1563,7 @@ TEST(CAPI, TestTFE_OpGetAttrs) { tensorflow::AttrValueMap attr_values; tensorflow::EagerOperation* op = - tensorflow::OperationFromInterface(copy_op->operation); + tensorflow::OperationFromInterface(tensorflow::unwrap(copy_op)); op->Attrs().FillAttrValueMap(&attr_values); EXPECT_EQ(tensorflow::DT_FLOAT, attr_values.find("dtype")->second.type()); @@ -1626,11 +1584,10 @@ TEST(CAPI, 
TestTFE_OpAttrsSerialize) { CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); TFE_OpSetAttrType(var_op, "dtype", TF_INT64); TFE_OpSetAttrShape(var_op, "shape", {}, 0, status); - TFE_OpAttrs attributes; - TFE_OpGetAttrs(var_op, &attributes); + const TFE_OpAttrs* attributes = TFE_OpGetAttrs(var_op); TF_Buffer* serialized_attr_values = TF_NewBuffer(); - TFE_OpAttrsSerialize(&attributes, serialized_attr_values, status); + TFE_OpAttrsSerialize(attributes, serialized_attr_values, status); CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); tensorflow::NameAttrList name_and_attrs; ASSERT_TRUE(name_and_attrs.ParseFromArray(serialized_attr_values->data, @@ -1653,7 +1610,7 @@ TEST(CAPI, TestTFE_OpAttrsSerialize) { tensorflow::AttrValueMap attr_values; tensorflow::EagerOperation* op = - tensorflow::OperationFromInterface(var_op_2->operation); + tensorflow::OperationFromInterface(tensorflow::unwrap(var_op_2)); op->Attrs().FillAttrValueMap(&attr_values); EXPECT_EQ(tensorflow::DT_INT64, attr_values.find("dtype")->second.type()); diff --git a/tensorflow/c/eager/c_api_test_util.cc b/tensorflow/c/eager/c_api_test_util.cc index e67e17963b3..29b624b8537 100644 --- a/tensorflow/c/eager/c_api_test_util.cc +++ b/tensorflow/c/eager/c_api_test_util.cc @@ -133,6 +133,58 @@ TFE_TensorHandle* TestMatrixTensorHandle3X2(TFE_Context* ctx) { return th; } +TFE_TensorHandle* TestVariable(TFE_Context* ctx, float value, + const tensorflow::string& device_name) { + TF_Status* status = TF_NewStatus(); + // Create the variable handle. + TFE_Op* op = TFE_NewOp(ctx, "VarHandleOp", status); + if (TF_GetCode(status) != TF_OK) return nullptr; + TFE_OpSetAttrType(op, "dtype", TF_FLOAT); + TFE_OpSetAttrShape(op, "shape", {}, 0, status); + TFE_OpSetAttrString(op, "container", "", 0); + TFE_OpSetAttrString(op, "shared_name", "", 0); + if (!device_name.empty()) { + TFE_OpSetDevice(op, device_name.c_str(), status); + } + if (TF_GetCode(status) != TF_OK) return nullptr; + TFE_TensorHandle* var_handle = nullptr; + int num_retvals = 1; + TFE_Execute(op, &var_handle, &num_retvals, status); + if (TF_GetCode(status) != TF_OK) return nullptr; + TFE_DeleteOp(op); + if (TF_GetCode(status) != TF_OK) return nullptr; + CHECK_EQ(1, num_retvals); + + // Assign 'value' to it. + op = TFE_NewOp(ctx, "AssignVariableOp", status); + if (TF_GetCode(status) != TF_OK) return nullptr; + TFE_OpSetAttrType(op, "dtype", TF_FLOAT); + TFE_OpAddInput(op, var_handle, status); + + // Convert 'value' to a TF_Tensor then a TFE_TensorHandle. 
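+  // The resulting handle is added as the value input of the AssignVariableOp
+  // created above.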
+ std::unique_ptr t( + TF_AllocateTensor(TF_FLOAT, nullptr, 0, sizeof(value)), TF_DeleteTensor); + memcpy(TF_TensorData(t.get()), &value, TF_TensorByteSize(t.get())); + + std::unique_ptr + value_handle(TFE_NewTensorHandle(t.get(), status), + TFE_DeleteTensorHandle); + if (TF_GetCode(status) != TF_OK) return nullptr; + + TFE_OpAddInput(op, value_handle.get(), status); + if (TF_GetCode(status) != TF_OK) return nullptr; + + num_retvals = 0; + TFE_Execute(op, nullptr, &num_retvals, status); + TFE_DeleteOp(op); + if (TF_GetCode(status) != TF_OK) return nullptr; + CHECK_EQ(0, num_retvals); + + TF_DeleteStatus(status); + + return var_handle; +} + TFE_Op* AddOp(TFE_Context* ctx, TFE_TensorHandle* a, TFE_TensorHandle* b) { TF_Status* status = TF_NewStatus(); diff --git a/tensorflow/c/eager/c_api_test_util.h b/tensorflow/c/eager/c_api_test_util.h index 11ae6d1181b..4c43f8d5833 100644 --- a/tensorflow/c/eager/c_api_test_util.h +++ b/tensorflow/c/eager/c_api_test_util.h @@ -42,6 +42,11 @@ TFE_TensorHandle* DoubleTestMatrixTensorHandle3X2(TFE_Context* ctx); // Return a tensor handle containing a 3x2 matrix of floats TFE_TensorHandle* TestMatrixTensorHandle3X2(TFE_Context* ctx); +// Return a variable handle referring to a variable with the given initial value +// on the given device. +TFE_TensorHandle* TestVariable(TFE_Context* ctx, float value, + const tensorflow::string& device_name = ""); + // Return an add op multiplying `a` by `b`. TFE_Op* AddOp(TFE_Context* ctx, TFE_TensorHandle* a, TFE_TensorHandle* b); diff --git a/tensorflow/c/eager/c_api_unified_experimental.cc b/tensorflow/c/eager/c_api_unified_experimental.cc index 9c472551bc6..e5030a602b3 100644 --- a/tensorflow/c/eager/c_api_unified_experimental.cc +++ b/tensorflow/c/eager/c_api_unified_experimental.cc @@ -15,247 +15,151 @@ limitations under the License. 
#include "tensorflow/c/eager/c_api_unified_experimental.h" -#include "absl/types/variant.h" -#include "tensorflow/c/c_api.h" -#include "tensorflow/c/eager/c_api.h" -#include "tensorflow/c/eager/c_api_internal.h" -#include "tensorflow/c/tf_status_helper.h" -#include "tensorflow/core/common_runtime/device.h" -#include "tensorflow/core/lib/monitoring/counter.h" -#include "tensorflow/core/lib/monitoring/gauge.h" -#include "tensorflow/core/lib/monitoring/sampler.h" -#include "tensorflow/core/platform/casts.h" -#include "tensorflow/core/platform/mutex.h" -#include "tensorflow/core/platform/strcat.h" +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/strings/str_cat.h" +#include "tensorflow/c/eager/c_api_unified_experimental_internal.h" +#include "tensorflow/c/tf_datatype.h" +#include "tensorflow/c/tf_status.h" +#include "tensorflow/core/platform/types.h" using tensorflow::string; +using tensorflow::internal::OutputList; +using tensorflow::internal::unwrap; + +namespace tensorflow { +namespace internal { +typedef absl::flat_hash_map FactoriesMap; + +static FactoriesMap& GetFactories() { + static FactoriesMap* factories = new FactoriesMap; + return *factories; +} + +static const char* default_factory = ""; + +void RegisterTracingEngineFactory(const string& name, FactoryFunction factory) { + assert((!GetFactories().count(name)) || + (GetFactories()[name] == factory) && + "Duplicate tracing factory registration"); + GetFactories()[name] = factory; +} + +void SetDefaultTracingEngine(const char* name) { default_factory = name; } + +static ExecutionContext* CreateTracingExecutionContext(const char* fn_name, + TF_Status* s) { + auto entry = GetFactories().find(default_factory); + if (entry != GetFactories().end()) return entry->second(fn_name, s); + string msg = absl::StrCat( + "No tracing engine factory has been registered with the key '", + default_factory, "' (available: "); + // Ensure deterministic (sorted) order in the error message + std::set factories_sorted; + for (const auto& factory : GetFactories()) + factories_sorted.insert(factory.first); + const char* comma = ""; + for (const string& factory : factories_sorted) { + msg += comma + factory; + comma = ", "; + } + msg += ")"; + + TF_SetStatus(s, TF_INVALID_ARGUMENT, msg.c_str()); + return nullptr; +} + +} // end namespace internal +} // end namespace tensorflow // ============================================================================= -// Unified Execution APIs for Eager and tracing backends. +// Public C API entry points +// +// These are only the generic entry points for the C API. This file does not +// have any visibility into the graph/eager implementation and is only providing +// C bindings to the abstract classes defined in the +// c_api_unified_experimental_internal.h header. 
+// // ============================================================================= -typedef void (*ExecuteOperation)(TF_AbstractOp* op, int num_inputs, - TF_AbstractTensor* const* inputs, - TF_OutputList* o, TF_ExecutionContext* ctx, - TF_Status* s); -struct TF_ExecutionContext { - explicit TF_ExecutionContext() {} - absl::variant ctx; - ExecuteOperation execution_callback; -}; - -struct TF_AbstractTensor { - absl::variant t; -}; - -struct TF_AbstractOp { - string op_type; - string op_name; -}; - -TF_ExecutionContext* TF_NewExecutionContext() { - return new TF_ExecutionContext(); +void TF_SetTracingImplementation(const char* name) { + tensorflow::internal::SetDefaultTracingEngine(name); } -void TF_DeleteExecutionContext(TF_ExecutionContext* c) { delete c; } - -TF_AbstractOp* TF_NewAbstractOp() { - TF_AbstractOp* op = new TF_AbstractOp; - return op; +// Creates a new TensorFlow function, it is an execution context attached to a +// given tracing context. +TF_ExecutionContext* TF_CreateFunction(const char* fn_name, TF_Status* s) { + return wrap(tensorflow::internal::CreateTracingExecutionContext(fn_name, s)); } -void TF_DeleteAbstractOp(TF_AbstractOp* op) { delete op; } - -TF_AbstractTensor* TF_NewAbstractTensor() { - TF_AbstractTensor* t = new TF_AbstractTensor; - return t; +TF_AbstractFunction* TF_FinalizeFunction(TF_ExecutionContext* ctx, + TF_OutputList* outputs, TF_Status* s) { + auto* func = wrap(unwrap(ctx)->Finalize(unwrap(outputs), s)); + TF_DeleteExecutionContext(ctx); + return func; } -void TF_DeleteAbstractTensor(TF_AbstractTensor* t) { delete t; } - -struct TF_GraphContext { - TF_Graph* graph; - // TODO(srbs): Handle captures. -}; - -TF_GraphContext* TF_NewGraphContext(TF_Graph* g) { - auto ctx = new TF_GraphContext; - ctx->graph = g; - return ctx; +TF_AbstractTensor* TF_AddFunctionParameter(TF_ExecutionContext* func, + TF_DataType dtype, TF_Status* s) { + return wrap(unwrap(func)->AddParameter(dtype, s)); } -void TF_DeleteGraphContext(TF_GraphContext* ctx) { delete ctx; } +void TF_DeleteExecutionContext(TF_ExecutionContext* c) { delete unwrap(c); } -struct TF_GraphTensor { - TF_Output output; - TF_GraphContext* ctx; -}; -TF_GraphTensor* TF_NewGraphTensor(TF_GraphContext* ctx, TF_Output output, - TF_Status* s) { - TF_GraphTensor* t = new TF_GraphTensor; - t->output = output; - t->ctx = ctx; - return t; -} -TF_Output TF_GraphTensorToOutput(const TF_GraphTensor* const t, TF_Status* s) { - return t->output; -} -void TF_DeleteGraphTensor(TF_GraphTensor* t) { delete t; } -void TF_AbstractTensorSetEagerTensor(TF_AbstractTensor* at, TFE_TensorHandle* t, - TF_Status* s) { - at->t = t; -} -TFE_TensorHandle* TF_AbstractTensorGetEagerTensor(TF_AbstractTensor* at, - TF_Status* s) { - if (!absl::holds_alternative(at->t)) { - string msg = absl::StrCat("Not an eager tensor handle.", - reinterpret_cast(at)); - TF_SetStatus(s, TF_INVALID_ARGUMENT, msg.c_str()); - return nullptr; - } - return absl::get(at->t); -} -void TF_AbstractTensorSetGraphTensor(TF_AbstractTensor* at, TF_GraphTensor* t, - TF_Status* s) { - at->t = t; -} -TF_GraphTensor* TF_AbstractTensorGetGraphTensor(TF_AbstractTensor* at, - TF_Status* s) { - if (!absl::holds_alternative(at->t)) { - string msg = absl::StrCat("Not an graph tensor handle."); - TF_SetStatus(s, TF_INVALID_ARGUMENT, msg.c_str()); - return nullptr; - } - return absl::get(at->t); +TF_AbstractOp* TF_NewAbstractOp(TF_ExecutionContext* c) { + return wrap(unwrap(c)->CreateOperation()); } -bool IsEagerTensor(const TF_AbstractTensor* const t) { - return 
absl::holds_alternative(t->t); -} +void TF_DeleteAbstractOp(TF_AbstractOp* op) { delete unwrap(op); } -struct TF_OutputList { - std::vector outputs; - int expected_num_outputs = -1; -}; +void TF_DeleteAbstractTensor(TF_AbstractTensor* t) { delete unwrap(t); } -TF_OutputList* TF_NewOutputList() { return new TF_OutputList; } -void TF_DeleteOutputList(TF_OutputList* o) { delete o; } +TF_OutputList* TF_NewOutputList() { return wrap(new OutputList); } +void TF_DeleteOutputList(TF_OutputList* o) { delete unwrap(o); } void TF_OutputListSetNumOutputs(TF_OutputList* o, int num_outputs, TF_Status* s) { - o->expected_num_outputs = num_outputs; + unwrap(o)->expected_num_outputs = num_outputs; +} +int TF_OutputListNumOutputs(TF_OutputList* o) { + return unwrap(o)->outputs.size(); } -int TF_OutputListNumOutputs(TF_OutputList* o) { return o->outputs.size(); } TF_AbstractTensor* TF_OutputListGet(TF_OutputList* o, int i) { - return o->outputs[i]; + return wrap(unwrap(o)->outputs[i]); } - -void ExecuteOperationEager(TF_AbstractOp* op, int num_inputs, - TF_AbstractTensor* const* inputs, TF_OutputList* o, - TF_ExecutionContext* ctx, TF_Status* s) { - auto* tfe_op = - TFE_NewOp(absl::get(ctx->ctx), op->op_type.c_str(), s); - if (TF_GetCode(s) != TF_OK) return; - for (int i = 0; i < num_inputs; ++i) { - if (!IsEagerTensor(inputs[i])) { - TF_SetStatus(s, TF_INVALID_ARGUMENT, "Not an eager tensor."); - return; - } - TFE_OpAddInput(tfe_op, absl::get(inputs[i]->t), s); - if (TF_GetCode(s) != TF_OK) return; - } - if (o->expected_num_outputs == -1) { - string msg = - "The number of outputs must be provided in eager mode. Use " - "TF_OutputListSetNumOutputs."; - TF_SetStatus(s, TF_INVALID_ARGUMENT, msg.c_str()); - return; - } - tensorflow::gtl::InlinedVector retvals; - int num_retvals = o->expected_num_outputs; - retvals.resize(num_retvals); - TFE_Execute(tfe_op, retvals.data(), &num_retvals, s); - TFE_DeleteOp(tfe_op); - if (TF_GetCode(s) != TF_OK) { - return; - } - o->outputs.clear(); - o->outputs.reserve(num_retvals); - for (int i = 0; i < num_retvals; ++i) { - auto* t = TF_NewAbstractTensor(); - t->t = retvals[i]; - o->outputs.push_back(t); - } -} - -TF_GraphContext* GetGraphContext(TF_AbstractTensor const* t) { - return absl::get(t->t)->ctx; -} - -void ExecuteOperationGraph(TF_AbstractOp* op, int num_inputs, - TF_AbstractTensor* const* inputs, TF_OutputList* o, - TF_ExecutionContext* ctx, TF_Status* s) { - TF_GraphContext* graph_ctx = absl::get(ctx->ctx); - TF_Graph* g = graph_ctx->graph; - auto* tf_opdesc = - TF_NewOperation(g, op->op_type.c_str(), op->op_name.c_str()); - for (int i = 0; i < num_inputs; ++i) { - auto* input = inputs[i]; - if (IsEagerTensor(input)) { - TF_SetStatus(s, TF_INVALID_ARGUMENT, - "Capturing eager tensors is not supported yet."); - return; - } else { - if (GetGraphContext(input) != graph_ctx) { - TF_SetStatus( - s, TF_INVALID_ARGUMENT, - "Capturing tensors from other graphs is not supported yet."); - return; - } - TF_AddInput(tf_opdesc, absl::get(input->t)->output); - } - } - auto* operation = TF_FinishOperation(tf_opdesc, s); - if (TF_GetCode(s) != TF_OK) return; - int num_outputs = TF_OperationNumOutputs(operation); - o->outputs.clear(); - o->outputs.reserve(num_outputs); - for (int i = 0; i < num_outputs; ++i) { - auto* t = TF_NewAbstractTensor(); - TF_GraphTensor* output_t = TF_NewGraphTensor(graph_ctx, {operation, i}, s); - if (TF_GetCode(s) != TF_OK) { - return; - } - t->t = output_t; - o->outputs.push_back(t); - } -} - -void TF_ExecutionContextSetEagerContext(TF_ExecutionContext* 
context, - TFE_Context* eager_context, - TF_Status* s) { - context->ctx = eager_context; - context->execution_callback = &ExecuteOperationEager; -} - -void TF_ExecutionContextSetGraphContext(TF_ExecutionContext* context, - TF_GraphContext* graph_context, - TF_Status* s) { - context->ctx = graph_context; - context->execution_callback = &ExecuteOperationGraph; +void TF_OutputListPushBack(TF_OutputList* o, TF_AbstractTensor* tensor, + TF_Status* s) { + unwrap(o)->outputs.push_back(unwrap(tensor)); } void TF_AbstractOpSetOpType(TF_AbstractOp* op, const char* const op_type, TF_Status* s) { - op->op_type = op_type; + unwrap(op)->SetOpType(op_type, s); } void TF_AbstractOpSetOpName(TF_AbstractOp* op, const char* const op_name, TF_Status* s) { - op->op_name = op_name; + unwrap(op)->SetOpName(op_name, s); +} + +void TF_AbstractOpSetAttrType(TF_AbstractOp* op, const char* const attr_name, + TF_DataType value, TF_Status* s) { + unwrap(op)->SetAttrType(attr_name, value, s); } void TF_ExecuteOperation(TF_AbstractOp* op, int num_inputs, TF_AbstractTensor* const* inputs, TF_OutputList* o, TF_ExecutionContext* ctx, TF_Status* s) { - ctx->execution_callback(op, num_inputs, inputs, o, ctx, s); + unwrap(ctx)->ExecuteOperation(unwrap(op), num_inputs, &unwrap(*inputs), + unwrap(o), s); +} + +void TF_DeleteAbstractFunction(TF_AbstractFunction* func) { + delete unwrap(func); +} + +void TF_ExecutionContextRegisterFunction(TF_ExecutionContext* ctx, + TF_AbstractFunction* func, + TF_Status* s) { + unwrap(ctx)->RegisterFunction(unwrap(func), s); } diff --git a/tensorflow/c/eager/c_api_unified_experimental.h b/tensorflow/c/eager/c_api_unified_experimental.h index 6346ceaf26e..86c59a7f625 100644 --- a/tensorflow/c/eager/c_api_unified_experimental.h +++ b/tensorflow/c/eager/c_api_unified_experimental.h @@ -15,8 +15,9 @@ limitations under the License. #ifndef TENSORFLOW_C_EAGER_C_API_UNIFIED_EXPERIMENTAL_H_ #define TENSORFLOW_C_EAGER_C_API_UNIFIED_EXPERIMENTAL_H_ -#include "tensorflow/c/c_api.h" #include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/tf_datatype.h" +#include "tensorflow/c/tf_status.h" #ifdef __cplusplus extern "C" { @@ -34,39 +35,45 @@ extern "C" { // E.g. it could know whether we're in eager mode or in graph mode, keeps track // of gradient tapes, etc. typedef struct TF_ExecutionContext TF_ExecutionContext; + // A TF_AbstractTensor is an input to an operation. E.g. it could be a union -// type of eager and graph tensors. +// type of eager and graph tensors. It is also the result of executing an +// operation. typedef struct TF_AbstractTensor TF_AbstractTensor; + // A TF_AbstractOp is the metadata we need to execute an operation. E.g. this // could contain the op type and other attributes. typedef struct TF_AbstractOp TF_AbstractOp; -TF_ExecutionContext* TF_NewExecutionContext(); +// Stores a function representation that can be used for execution or for +// setting functional attributes of other composite ops e.g. control flow. +typedef struct TF_AbstractFunction TF_AbstractFunction; + +// This allows the client to swap the implementation of the tracing engine. +// Any future call to TF_CreateFunction will use the implementation defined +// here. +void TF_SetTracingImplementation(const char* name); + +// Creates a new TensorFlow function. A Function is an execution context, and as +// such it can trace operations through TF_ExecuteOperation. After completing +// tracing, a function can be obtained by TF_FinalizeFunction. 
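+// A rough usage sketch (editor's illustration only; error checks and attribute
+// handling are elided, `s` is a TF_Status* created elsewhere, and "Add" is just
+// an example op type):
+//   TF_SetTracingImplementation("graphdef");
+//   TF_ExecutionContext* fn_ctx = TF_CreateFunction("two_x", s);
+//   TF_AbstractTensor* x = TF_AddFunctionParameter(fn_ctx, TF_FLOAT, s);
+//   TF_AbstractOp* add = TF_NewAbstractOp(fn_ctx);
+//   TF_AbstractOpSetOpType(add, "Add", s);
+//   TF_AbstractOpSetOpName(add, "my_add", s);
+//   TF_AbstractTensor* inputs[] = {x, x};
+//   TF_OutputList* o = TF_NewOutputList();
+//   TF_ExecuteOperation(add, 2, inputs, o, fn_ctx, s);
+//   TF_AbstractFunction* f = TF_FinalizeFunction(fn_ctx, o, s);  // consumes fn_ctx
+//   // ... register/use f, then TF_DeleteAbstractFunction(f);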
+TF_ExecutionContext* TF_CreateFunction(const char* fn_name, TF_Status* status); + +// Creates a context for eager execution of operations. +TF_ExecutionContext* TF_NewEagerExecutionContext(TFE_ContextOptions*, + TF_Status* s); void TF_DeleteExecutionContext(TF_ExecutionContext*); -TF_AbstractOp* TF_NewAbstractOp(); +// Add a new parameter to a TensorFlow Function. +// TODO(aminim): what about shape? +TF_AbstractTensor* TF_AddFunctionParameter(TF_ExecutionContext* func, + TF_DataType dtype, TF_Status* s); + +// Create an operation suitable to use with the provided context. The operation +// requires its type (e.g. "AddV2") to be set independently. +TF_AbstractOp* TF_NewAbstractOp(TF_ExecutionContext* ctx); void TF_DeleteAbstractOp(TF_AbstractOp*); -TF_AbstractTensor* TF_NewAbstractTensor(); -void TF_DeleteAbstractTensor(TF_AbstractTensor*); - -// ----------------------------------------------------------------------------- -// APIs for Eager and graph modes -// ----------------------------------------------------------------------------- - -// Keeps track of the current graph and other state e.g. captures etc. -typedef struct TF_GraphContext TF_GraphContext; -TF_GraphContext* TF_NewGraphContext(TF_Graph*); -void TF_DeleteGraphContext(TF_GraphContext*); - -// `eager_context` must outlive `context`. -void TF_ExecutionContextSetEagerContext(TF_ExecutionContext* context, - TFE_Context* eager_context, TF_Status*); -// `graph_context` must outlive `context`. -void TF_ExecutionContextSetGraphContext(TF_ExecutionContext* context, - TF_GraphContext* graph_context, - TF_Status*); - // TODO(srbs): Add APIs for specifying attrs etc. // `op_type` must outlive `op`. void TF_AbstractOpSetOpType(TF_AbstractOp* op, const char* const op_type, @@ -74,44 +81,64 @@ void TF_AbstractOpSetOpType(TF_AbstractOp* op, const char* const op_type, // `op_name` must outlive `op`. void TF_AbstractOpSetOpName(TF_AbstractOp* op, const char* const op_name, TF_Status* s); +// `attr_name` must outlive `op`. +void TF_AbstractOpSetAttrType(TF_AbstractOp* op, const char* const attr_name, + TF_DataType value, TF_Status* s); -// Wrapper for TF_Output but contains a pointer to TF_GraphContext as well. -typedef struct TF_GraphTensor TF_GraphTensor; -TF_GraphTensor* TF_NewGraphTensor(TF_GraphContext* c, TF_Output t, - TF_Status* s); -TF_Output TF_GraphTensorToOutput(const TF_GraphTensor* const t, TF_Status* s); -void TF_DeleteGraphTensor(TF_GraphTensor* t); +void TF_DeleteAbstractTensor(TF_AbstractTensor*); -// `t` must outlive `at`. -void TF_AbstractTensorSetEagerTensor(TF_AbstractTensor* at, TFE_TensorHandle* t, - TF_Status* s); -TFE_TensorHandle* TF_AbstractTensorGetEagerTensor(TF_AbstractTensor* at, - TF_Status* s); - -// `t` must outlive `at`. -void TF_AbstractTensorSetGraphTensor(TF_AbstractTensor* at, TF_GraphTensor* t, - TF_Status* s); -TF_GraphTensor* TF_AbstractTensorGetGraphTensor(TF_AbstractTensor* at, - TF_Status* s); - -// TF_OutputList just lets us not specify the number of outputs of an operation -// beforehand. This forces a memory allocation in the runtime, which is bad, but -// it allows for generic code. +// TF_OutputList holds the list of TF_AbstractTensor that results from executing +// an operation, or provided to create a function. +// When executing an operation in an eager context, the expected number of +// outputs must be set beforehand with `TF_OutputListSetNumOutputs`. 
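+// For example, executing an op with a single result in eager mode looks
+// roughly like this (sketch; `op`, `input` and `eager_ctx` are assumed to
+// exist and `s` is a TF_Status*):
+//   TF_OutputList* o = TF_NewOutputList();
+//   TF_OutputListSetNumOutputs(o, 1, s);
+//   TF_ExecuteOperation(op, 1, &input, o, eager_ctx, s);
+//   TF_AbstractTensor* result = TF_OutputListGet(o, 0);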
typedef struct TF_OutputList TF_OutputList; TF_OutputList* TF_NewOutputList(); void TF_DeleteOutputList(TF_OutputList* o); -void TF_OutputListSetNumOutputs(TF_OutputList* o, int, TF_Status*); +// Prepare tracing to the expected number of output for an operation. +void TF_OutputListSetNumOutputs(TF_OutputList* o, int num_outputs, TF_Status*); +// Return the number of outputs in the list. int TF_OutputListNumOutputs(TF_OutputList* o); +// Return the `i`th output in the list. TF_AbstractTensor* TF_OutputListGet(TF_OutputList* o, int i); +// Append a tensor at the end of the output list, growing its size by one. +void TF_OutputListPushBack(TF_OutputList* o, TF_AbstractTensor* tensor, + TF_Status*); // TF_ExecuteOperation will, if in eager mode, execute, if in graph mode, maybe -// capture some inputs and then add a node in the graph, and after -// execution/node creation it'll go and record things that happened in any tape -// which happens to be active. +// capture some inputs and then add a node in the graph. The output tensors are +// returned through the provided TF_OutputList. +// Any active tape will observe the effects of this execution. void TF_ExecuteOperation(TF_AbstractOp* op, int num_inputs, TF_AbstractTensor* const* inputs, TF_OutputList* o, TF_ExecutionContext* ctx, TF_Status* s); +// Creates a new TF_AbstractFunction from the current tracing states in the +// context. The provided `ctx` is consumed by this API call and deleted. +// The returned TF_AbstractFunction must be deleted by the client, +// TODO(aminim): clarify the contract on the state of the context after this +// call. +TF_AbstractFunction* TF_FinalizeFunction(TF_ExecutionContext* ctx, + TF_OutputList*, TF_Status*); + +void TF_DeleteAbstractFunction(TF_AbstractFunction*); + +// Register the function with the given context. This is particularly useful for +// making a function available to an eager context. +void TF_ExecutionContextRegisterFunction(TF_ExecutionContext*, + TF_AbstractFunction*, TF_Status*); + +// ----------------------------------------------------------------------------- +// APIs specific to Eager modes +// ----------------------------------------------------------------------------- + +// Temporary APIs till we figure out how to create scalar valued Eager +// tensors and how to get value out of eager abstract tensors. +TF_AbstractTensor* TF_CreateAbstractTensorFromEagerTensor(TFE_TensorHandle* t, + TF_Status* s); +TFE_TensorHandle* TF_AbstractTensorGetEagerTensor(TF_AbstractTensor* at, + TF_Status* s); +TFE_Context* TF_ExecutionContextGetTFEContext(TF_ExecutionContext*); + #ifdef __cplusplus } /* end extern "C" */ #endif diff --git a/tensorflow/c/eager/c_api_unified_experimental_eager.cc b/tensorflow/c/eager/c_api_unified_experimental_eager.cc new file mode 100644 index 00000000000..cf8cf845834 --- /dev/null +++ b/tensorflow/c/eager/c_api_unified_experimental_eager.cc @@ -0,0 +1,194 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include + +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/eager/c_api_unified_experimental.h" +#include "tensorflow/c/eager/c_api_unified_experimental_internal.h" +#include "tensorflow/c/tf_datatype.h" +#include "tensorflow/c/tf_status.h" +#include "tensorflow/core/lib/gtl/inlined_vector.h" +#include "tensorflow/core/platform/casts.h" +#include "tensorflow/core/platform/strcat.h" +#include "tensorflow/core/platform/types.h" + +using tensorflow::string; + +namespace tensorflow { +namespace internal { + +// Simple wrapper over a TFE_TensorHandle +struct EagerTensor : public AbstractTensor { + TFE_TensorHandle* t = nullptr; + EagerTensor() : AbstractTensor(kKind) {} + explicit EagerTensor(TFE_TensorHandle* t) : AbstractTensor(kKind), t(t) {} + ~EagerTensor() override { TFE_DeleteTensorHandle(t); } + static constexpr AbstractTensorKind kKind = kEagerTensor; +}; + +// Simple wrapper over a TFE_Op +class EagerOp : public AbstractOp { + public: + explicit EagerOp(TFE_Context* ctx) : AbstractOp(kKind), ctx_(ctx) {} + void SetOpType(const char* const op_type, TF_Status* s) override { + op_ = TFE_NewOp(ctx_, op_type, s); + } + void SetOpName(const char* const op_name, TF_Status* s) override { + // Name is ignored in eager mode. + } + void SetAttrType(const char* const attr_name, TF_DataType value, + TF_Status* s) override { + if (op_ == nullptr) { + TF_SetStatus(s, TF_FAILED_PRECONDITION, + "op_type must be specified before specifying attrs."); + return; + } + TFE_OpSetAttrType(op_, attr_name, value); + } + + ~EagerOp() override { TFE_DeleteOp(op_); } + static constexpr AbstractOpKind kKind = kEagerOp; + + private: + friend class EagerContext; // For access to op_. + TFE_Op* op_ = nullptr; + TFE_Context* ctx_; +}; + +// Wraps a TFE_Context and dispatch EagerOp with EagerTensor inputs. +class EagerContext : public ExecutionContext { + public: + EagerContext() : ExecutionContext(kKind) {} + + void Build(TFE_ContextOptions* options, TF_Status* status) { + eager_ctx_ = TFE_NewContext(options, status); + } + + AbstractOp* CreateOperation() override { + // TODO(srbs): Should the lifetime of this op be tied to the context. + return new EagerOp(eager_ctx_); + } + + void ExecuteOperation(AbstractOp* op, int num_inputs, + AbstractTensor* const* inputs, OutputList* o, + TF_Status* s) override { + auto* eager_op = dyncast(op); + if (eager_op == nullptr) { + TF_SetStatus(s, TF_INVALID_ARGUMENT, + "Unable to cast AbstractOp to TF_EagerOp."); + return; + } + auto* tfe_op = eager_op->op_; + if (TF_GetCode(s) != TF_OK) return; + for (int i = 0; i < num_inputs; ++i) { + auto* eager_tensor = dyncast(inputs[i]); + if (!eager_tensor) { + TF_SetStatus(s, TF_INVALID_ARGUMENT, "Not an eager tensor."); + return; + } + TFE_OpAddInput(tfe_op, eager_tensor->t, s); + if (TF_GetCode(s) != TF_OK) return; + } + if (o->expected_num_outputs == -1) { + string msg = + "The number of outputs must be provided in eager mode. 
Use " + "TF_OutputListSetNumOutputs."; + TF_SetStatus(s, TF_INVALID_ARGUMENT, msg.c_str()); + return; + } + tensorflow::gtl::InlinedVector retvals; + int num_retvals = o->expected_num_outputs; + retvals.resize(num_retvals); + TFE_Execute(tfe_op, retvals.data(), &num_retvals, s); + if (TF_GetCode(s) != TF_OK) { + return; + } + o->outputs.clear(); + o->outputs.reserve(num_retvals); + for (int i = 0; i < num_retvals; ++i) { + o->outputs.push_back(new EagerTensor(retvals[i])); + } + } + + AbstractTensor* AddParameter(TF_DataType dtype, TF_Status* s) override { + TF_SetStatus(s, TF_INVALID_ARGUMENT, + "Can't add function parameter on an eager context."); + return nullptr; + } + AbstractFunction* Finalize(OutputList* outputs, TF_Status* s) override { + TF_SetStatus(s, TF_INVALID_ARGUMENT, + "Can't use finalize function on an eager context."); + return nullptr; + } + + void RegisterFunction(AbstractFunction* afunc, TF_Status* s) override { + auto* func = afunc->GetTfFunction(s); + if (!func) { + return; + } + TFE_ContextAddFunction(eager_ctx_, func, s); + } + + ~EagerContext() override { TFE_DeleteContext(eager_ctx_); } + + static constexpr ExecutionContextKind kKind = kEagerContext; + + private: + friend TFE_Context* ::TF_ExecutionContextGetTFEContext( + TF_ExecutionContext* ctx); + TFE_Context* eager_ctx_; +}; + +} // namespace internal +} // namespace tensorflow + +// ============================================================================= +// Public C API entry points +// These are only the entry points specific to the Eager API. +// ============================================================================= + +using tensorflow::internal::dyncast; +using tensorflow::internal::unwrap; + +TF_ExecutionContext* TF_NewEagerExecutionContext(TFE_ContextOptions* options, + TF_Status* s) { + auto* ctx = new tensorflow::internal::EagerContext(); + ctx->Build(options, s); + return wrap(ctx); +} + +TF_AbstractTensor* TF_CreateAbstractTensorFromEagerTensor(TFE_TensorHandle* t, + TF_Status* s) { + return wrap(new tensorflow::internal::EagerTensor(t)); +} + +TFE_TensorHandle* TF_AbstractTensorGetEagerTensor(TF_AbstractTensor* at, + TF_Status* s) { + auto* eager_tensor = dyncast(unwrap(at)); + if (!eager_tensor) { + string msg = tensorflow::strings::StrCat("Not an eager tensor handle.", + reinterpret_cast(at)); + TF_SetStatus(s, TF_INVALID_ARGUMENT, msg.c_str()); + return nullptr; + } + return eager_tensor->t; +} + +TFE_Context* TF_ExecutionContextGetTFEContext(TF_ExecutionContext* ctx) { + auto* eager_ctx = dyncast(unwrap(ctx)); + if (!eager_ctx) return nullptr; + return eager_ctx->eager_ctx_; +} diff --git a/tensorflow/c/eager/c_api_unified_experimental_graph.cc b/tensorflow/c/eager/c_api_unified_experimental_graph.cc new file mode 100644 index 00000000000..dd5a95b3526 --- /dev/null +++ b/tensorflow/c/eager/c_api_unified_experimental_graph.cc @@ -0,0 +1,235 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include +#include + +#include "absl/strings/str_cat.h" +#include "tensorflow/c/c_api.h" +#include "tensorflow/c/eager/c_api_internal.h" +#include "tensorflow/c/eager/c_api_unified_experimental.h" +#include "tensorflow/c/eager/c_api_unified_experimental_internal.h" +#include "tensorflow/c/tf_datatype.h" +#include "tensorflow/c/tf_status.h" +#include "tensorflow/core/platform/strcat.h" +#include "tensorflow/core/platform/types.h" + +using tensorflow::string; + +namespace tensorflow { +namespace internal { + +class GraphContext; + +// GraphTensor wraps a `TF_Output`, i.e. a pointer to TF_Operation and the index +// into the list of outputs for the operation. +struct GraphTensor : public AbstractTensor { + TF_Output output{}; + GraphContext* ctx = nullptr; + GraphTensor() : AbstractTensor(kKind) {} + GraphTensor(TF_Output output, GraphContext* ctx) + : AbstractTensor(kKind), output(output), ctx(ctx) {} + static constexpr AbstractTensorKind kKind = kGraphTensor; +}; + +// GraphOp wraps and populate a TF_OperationDescription. +class GraphOp : public AbstractOp { + public: + explicit GraphOp(TF_Graph* g) : AbstractOp(kKind), g_(g) {} + void SetOpType(const char* const op_type, TF_Status* s) override { + if (op_) { + TF_SetStatus( + s, TF_FAILED_PRECONDITION, + strings::StrCat("SetOpType called on already built op.").c_str()); + return; + } + if (op_name_ != nullptr) { + op_.reset(TF_NewOperation(g_, op_type, op_name_)); + op_name_ = nullptr; + } else { + op_type_ = op_type; + } + } + void SetOpName(const char* const op_name, TF_Status* s) override { + if (op_) { + TF_SetStatus( + s, TF_FAILED_PRECONDITION, + strings::StrCat("SetOpName called on already built op.").c_str()); + return; + } + if (op_type_ != nullptr) { + op_.reset(TF_NewOperation(g_, op_type_, op_name)); + op_type_ = nullptr; + } else { + op_name_ = op_name; + } + } + void SetAttrType(const char* const attr_name, TF_DataType value, + TF_Status* s) override { + if (!op_) { + TF_SetStatus( + s, TF_FAILED_PRECONDITION, + "op_type and op_name must be specified before specifying attrs."); + return; + } + TF_SetAttrType(op_.get(), attr_name, value); + } + ~GraphOp() override {} + + static constexpr AbstractOpKind kKind = kGraphOp; + + private: + friend class GraphContext; // For access to op_. + TF_Graph* g_; + std::unique_ptr op_; + // Hold `op_type` and `op_name` till both are available since we need both + // to build a graph operation. + const char* op_type_ = nullptr; + const char* op_name_ = nullptr; +}; + +// GraphFunction is a thin wrapper over a TF_Function. +struct GraphFunction : public AbstractFunction { + TF_Function* func = nullptr; + GraphFunction() : AbstractFunction(kKind) {} + explicit GraphFunction(TF_Function* func) + : AbstractFunction(kKind), func(func) {} + ~GraphFunction() override { + if (func) TF_DeleteFunction(func); + } + + TF_Function* GetTfFunction(TF_Status* s) override { return func; } + + static constexpr AbstractFunctionKind kKind = kGraphFunc; +}; + +// GraphContext wraps a TF_Graph modeling a single function and manages the +// "execution" of operation, i.e. adding them to the function. +class GraphContext : public ExecutionContext { + public: + explicit GraphContext(const char* name) + : ExecutionContext(kKind), + graph_(new TF_Graph(), TF_DeleteGraph), + name_(name) {} + + AbstractOp* CreateOperation() override { + // TODO(srbs): Should the lifetime of this op be tied to the context. 
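+    // The returned GraphOp lazily builds a TF_OperationDescription (once both
+    // the op type and name are set), which ExecuteOperation below hands off to
+    // TF_FinishOperation.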
+ return new GraphOp(graph_.get()); + } + + void ExecuteOperation(AbstractOp* op, int num_inputs, + AbstractTensor* const* inputs, OutputList* o, + TF_Status* s) override { + auto* graph_op = dyncast(op); + if (graph_op == nullptr) { + TF_SetStatus(s, TF_INVALID_ARGUMENT, + "Unable to cast AbstractOp to TF_GraphOp."); + return; + } + auto* tf_opdesc = graph_op->op_.release(); + if (tf_opdesc == nullptr) { + TF_SetStatus(s, TF_INVALID_ARGUMENT, "AbstractOp is incomplete."); + return; + } + for (int i = 0; i < num_inputs; ++i) { + auto* graph_tensor = dyncast(inputs[i]); + if (!graph_tensor) { + TF_SetStatus(s, TF_INVALID_ARGUMENT, + "Capturing eager tensors is not supported yet."); + return; + } else { + if (graph_tensor->ctx != this) { + TF_SetStatus( + s, TF_INVALID_ARGUMENT, + "Capturing tensors from other graphs is not supported yet."); + return; + } + TF_AddInput(tf_opdesc, graph_tensor->output); + } + } + auto* operation = TF_FinishOperation(tf_opdesc, s); + // TF_FinishOperation deletes `tf_opdesc` so clear its reference. + graph_op->op_ = nullptr; + if (TF_GetCode(s) != TF_OK) return; + int num_outputs = TF_OperationNumOutputs(operation); + o->outputs.clear(); + o->outputs.reserve(num_outputs); + for (int i = 0; i < num_outputs; ++i) { + o->outputs.push_back(new GraphTensor({operation, i}, this)); + } + } + + AbstractTensor* AddParameter(TF_DataType dtype, TF_Status* s) override { + TF_OperationDescription* opdesc = + TF_NewOperation(graph_.get(), "Placeholder", + absl::StrCat("_input_", inputs_.size()).c_str()); + TF_SetAttrType(opdesc, "dtype", dtype); + auto* operation = TF_FinishOperation(opdesc, s); + if (!s->status.ok()) return nullptr; + + inputs_.push_back(TF_Output{operation, 0}); + return new GraphTensor(inputs_.back(), this); + } + + AbstractFunction* Finalize(OutputList* outputs, TF_Status* s) override { + std::unique_ptr func(new GraphFunction); + std::vector graph_outputs; + graph_outputs.reserve(outputs->outputs.size()); + for (AbstractTensor* abstract_output : outputs->outputs) { + GraphTensor* output = dyncast(abstract_output); + if (!output) { + TF_SetStatus(s, TF_UNIMPLEMENTED, + "Returning a non-graph tensor from a function has not " + "been implemented yet."); + return nullptr; + } + graph_outputs.push_back(output->output); + } + + func->func = TF_GraphToFunction( + graph_.get(), name_, 0, -1, nullptr, inputs_.size(), inputs_.data(), + graph_outputs.size(), graph_outputs.data(), nullptr, nullptr, name_, s); + if (TF_GetCode(s) != TF_OK) return nullptr; + return func.release(); + } + + void RegisterFunction(AbstractFunction* func, TF_Status* s) override { + TF_SetStatus(s, TF_UNIMPLEMENTED, + "Registering graph functions has not been implemented yet."); + } + + ~GraphContext() override {} + + static constexpr ExecutionContextKind kKind = kGraphContext; + + private: + std::unique_ptr graph_; + std::vector inputs_; + const char* name_; +}; + +static ExecutionContext* GraphTracingFactory(const char* name, TF_Status* s) { + return new GraphContext(name); +} + +// Register the tracing implemented in this file as the default tracing engine. 
+static bool register_tracing = [] { + RegisterTracingEngineFactory("graphdef", GraphTracingFactory); + SetDefaultTracingEngine("graphdef"); + return true; +}(); + +} // namespace internal +} // namespace tensorflow diff --git a/tensorflow/c/eager/c_api_unified_experimental_internal.h b/tensorflow/c/eager/c_api_unified_experimental_internal.h new file mode 100644 index 00000000000..49212a230ee --- /dev/null +++ b/tensorflow/c/eager/c_api_unified_experimental_internal.h @@ -0,0 +1,201 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EAGER_C_API_UNIFIED_EXPERIMENTAL_INTERNAL_H_ +#define TENSORFLOW_C_EAGER_C_API_UNIFIED_EXPERIMENTAL_INTERNAL_H_ + +#include + +#include "tensorflow/c/c_api.h" +#include "tensorflow/c/eager/c_api_unified_experimental.h" +#include "tensorflow/c/tf_datatype.h" +#include "tensorflow/c/tf_status.h" +#include "tensorflow/core/platform/casts.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace internal { + +// ============================================================================= +// Implementation detail for the unified execution APIs for Eager and tracing +// backends (graph/MLIR). +// +// This defines a set of abstract classes that are intended to provide the +// functionality of the opaque C types exposed in the public APIs defined in the +// `c_api_unified_experimental.h` header. +// ============================================================================= + +// We can't depend on C++ rtti, but we still want to be able to have a safe +// dynamic_cast to provide diagnostics to the user when the API is misused. +// Instead we model RTTI by listing all the possible subclasses for each +// abstract base. Each subclass initializes the base class with the right +// `kind`, which allows an equivalent to `std::dynamic_cast` provided by this +// utility. +template +T* dyncast(S source) { + if (source->getKind() != T::kKind) { + return nullptr; + } + return tensorflow::down_cast(source); +} + +// Represents either an EagerTensor or a GraphTensor. +// This base class does not expose any public methods other than to distinguish +// which subclass it actually is. The user is responsible to use the right +// type of AbstractTensor in their context (do not pass an EagerTensor to a +// GraphContext and vice-versa). +class AbstractTensor { + protected: + enum AbstractTensorKind { kGraphTensor, kEagerTensor, kMLIRTensor }; + explicit AbstractTensor(AbstractTensorKind kind) : kind_(kind) {} + + public: + // Returns which subclass is this instance of. + AbstractTensorKind getKind() const { return kind_; } + virtual ~AbstractTensor() = default; + + private: + const AbstractTensorKind kind_; +}; + +// Represents the results of the execution of an operation. +struct OutputList { + std::vector outputs; + int expected_num_outputs = -1; +}; + +// Holds the result of tracing a function. 
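The `dyncast` helper above works because every concrete subclass advertises its tag through a public `kKind` constant that matches the value stored by the base-class constructor. As a standalone illustration of the pattern (hypothetical `Shape`/`Circle` names, with `static_cast` standing in for `tensorflow::down_cast`):

#include <cstdio>

// Illustrative sketch of the enum-tag "poor man's RTTI" used by
// AbstractTensor, AbstractOp, AbstractFunction and ExecutionContext.
class Shape {
 protected:
  enum ShapeKind { kCircle, kSquare };
  explicit Shape(ShapeKind kind) : kind_(kind) {}

 public:
  ShapeKind getKind() const { return kind_; }
  virtual ~Shape() = default;

 private:
  const ShapeKind kind_;
};

class Circle : public Shape {
 public:
  Circle() : Shape(kKind) {}
  static constexpr ShapeKind kKind = kCircle;
};

template <typename T, typename S>
T* dyncast(S source) {
  // Same shape as the helper above: compare the stored tag against the
  // target's advertised kind before downcasting.
  if (source->getKind() != T::kKind) return nullptr;
  return static_cast<T*>(source);
}

int main() {
  Circle c;
  Shape* s = &c;
  if (dyncast<Circle>(s) != nullptr) std::printf("cast to Circle: ok\n");
  return 0;
}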
+class AbstractFunction { + protected: + enum AbstractFunctionKind { kGraphFunc }; + explicit AbstractFunction(AbstractFunctionKind kind) : kind_(kind) {} + + public: + // Returns which subclass this is an instance of. + AbstractFunctionKind getKind() const { return kind_; } + virtual ~AbstractFunction() = default; + + // Temporary API till we figure out the right abstraction for AbstractFunction. + // At the moment both Eager and Graph need access to a "TF_Function" object. + virtual TF_Function* GetTfFunction(TF_Status* s) = 0; + + private: + const AbstractFunctionKind kind_; +}; + +// An abstract operation describes an operation by its type, name, and +// attributes. It can be "executed" by the context with some input tensors. +// It is allowed to reuse the same abstract operation for multiple executions +// on a given context, with the same or different input tensors. +class AbstractOp { + protected: + enum AbstractOpKind { kGraphOp, kEagerOp }; + explicit AbstractOp(AbstractOpKind kind) : kind_(kind) {} + + public: + // Returns which subclass this is an instance of. + AbstractOpKind getKind() const { return kind_; } + virtual ~AbstractOp() = default; + + // Sets the type of the operation (for example `AddV2`). + virtual void SetOpType(const char* op_type, TF_Status* s) = 0; + + // Sets the name of the operation: this is an optional identifier that is + // not intended to carry semantics and is preserved/propagated without + // guarantees. + virtual void SetOpName(const char* op_name, TF_Status* s) = 0; + + // Adds a `TypeAttribute` on the operation. + virtual void SetAttrType(const char* attr_name, TF_DataType value, + TF_Status* s) = 0; + + private: + const AbstractOpKind kind_; +}; + +// This holds the context for the execution: dispatching operations either to an +// eager implementation or to a graph implementation. +struct ExecutionContext { + protected: + enum ExecutionContextKind { kGraphContext, kEagerContext }; + explicit ExecutionContext(ExecutionContextKind kind) : k(kind) {} + + public: + // Returns which subclass this is an instance of. + ExecutionContextKind getKind() const { return k; } + virtual ~ExecutionContext() = default; + + // Executes the operation on the provided inputs and populates the OutputList + // with the results. The input tensors must match the current context. + // The effect of "executing" an operation depends on the context: in an Eager + // context it will dispatch it to the runtime for execution, while in a + // tracing context it will add the operation to the current function. + virtual void ExecuteOperation(AbstractOp* op, int num_inputs, + AbstractTensor* const* inputs, OutputList* o, + TF_Status* s) = 0; + + // Creates an empty AbstractOperation suitable for use with this context. + virtual AbstractOp* CreateOperation() = 0; + + // Adds a function parameter and returns the corresponding tensor. + // This is only valid with an ExecutionContext obtained from a TracingContext; + // it will always error out with an eager context. + virtual AbstractTensor* AddParameter(TF_DataType dtype, TF_Status* s) = 0; + + // Finalizes this context and makes a function out of it. The context is in an + // invalid state after this call and must be destroyed. + // This is only valid with an ExecutionContext obtained from a TracingContext; + // it will always error out with an eager context.

+ virtual AbstractFunction* Finalize(OutputList* outputs, TF_Status* s) = 0; + + // Registers a functions with this context, after this the function is + // available to be called/referenced by its name in this context. + virtual void RegisterFunction(AbstractFunction* func, TF_Status* s) = 0; + + private: + const ExecutionContextKind k; +}; + +typedef ExecutionContext* (*FactoryFunction)(const char* fn_name, TF_Status*); +void SetDefaultTracingEngine(const char* name); +void RegisterTracingEngineFactory(const ::tensorflow::string& name, + FactoryFunction factory); + +// Create utilities to wrap/unwrap: this convert from the C opaque types to the +// C++ implementation, and back. +#define MAKE_WRAP_UNWRAP(C_TYPEDEF, CPP_CLASS) \ + static inline CPP_CLASS* const& unwrap(C_TYPEDEF* const& o) { \ + return reinterpret_cast(o); \ + } \ + static inline const CPP_CLASS* const& unwrap(const C_TYPEDEF* const& o) { \ + return reinterpret_cast(o); \ + } \ + static inline C_TYPEDEF* const& wrap(CPP_CLASS* const& o) { \ + return reinterpret_cast(o); \ + } \ + static inline const C_TYPEDEF* const& wrap(const CPP_CLASS* const& o) { \ + return reinterpret_cast(o); \ + } + +MAKE_WRAP_UNWRAP(TF_ExecutionContext, ExecutionContext) +MAKE_WRAP_UNWRAP(TF_AbstractFunction, AbstractFunction) +MAKE_WRAP_UNWRAP(TF_AbstractTensor, AbstractTensor) +MAKE_WRAP_UNWRAP(TF_AbstractOp, AbstractOp) +MAKE_WRAP_UNWRAP(TF_OutputList, OutputList) + +} // namespace internal +} // namespace tensorflow + +#endif // TENSORFLOW_C_EAGER_C_API_UNIFIED_EXPERIMENTAL_INTERNAL_H_ diff --git a/tensorflow/c/eager/c_api_unified_experimental_test.cc b/tensorflow/c/eager/c_api_unified_experimental_test.cc index 104ede9ebbd..9776b4d13ed 100644 --- a/tensorflow/c/eager/c_api_unified_experimental_test.cc +++ b/tensorflow/c/eager/c_api_unified_experimental_test.cc @@ -15,44 +15,44 @@ limitations under the License. #include "tensorflow/c/eager/c_api_unified_experimental.h" -#include +#include #include "tensorflow/c/eager/c_api.h" #include "tensorflow/c/eager/c_api_test_util.h" -#include "tensorflow/cc/profiler/profiler.h" -#include "tensorflow/core/lib/monitoring/collection_registry.h" -#include "tensorflow/core/platform/logging.h" -#include "tensorflow/core/platform/protobuf.h" -#include "tensorflow/core/platform/str_util.h" +#include "tensorflow/c/tf_datatype.h" +#include "tensorflow/c/tf_status.h" +#include "tensorflow/c/tf_tensor.h" #include "tensorflow/core/platform/test.h" -#include "tensorflow/core/platform/test_benchmark.h" using tensorflow::string; namespace tensorflow { namespace { -TEST(UnifedCAPI, TestBasicEager) { - TF_ExecutionContext* ctx = TF_NewExecutionContext(); +class UnifiedCAPI : public ::testing::TestWithParam { + protected: + void SetUp() override { TF_SetTracingImplementation(GetParam()); } +}; + +TEST_P(UnifiedCAPI, TestBasicEager) { std::unique_ptr status( TF_NewStatus(), TF_DeleteStatus); TFE_ContextOptions* opts = TFE_NewContextOptions(); - TFE_Context* eager_ctx = TFE_NewContext(opts, status.get()); + TF_ExecutionContext* ctx = TF_NewEagerExecutionContext(opts, status.get()); ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); TFE_DeleteContextOptions(opts); - // Enter the eager context. - TF_ExecutionContextSetEagerContext(ctx, eager_ctx, status.get()); ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); // Build an abstract input tensor. 
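One note on the MAKE_WRAP_UNWRAP macro in c_api_unified_experimental_internal.h above: for a single pair such as (TF_AbstractTensor, AbstractTensor) it expands to four inline casting helpers along the following lines. The reinterpret_cast target types are spelled out here for illustration; they mirror the return types and do not change the pointer value.

// Approximate expansion of MAKE_WRAP_UNWRAP(TF_AbstractTensor, AbstractTensor).
static inline AbstractTensor* const& unwrap(TF_AbstractTensor* const& o) {
  return reinterpret_cast<AbstractTensor* const&>(o);
}
static inline const AbstractTensor* const& unwrap(
    const TF_AbstractTensor* const& o) {
  return reinterpret_cast<const AbstractTensor* const&>(o);
}
static inline TF_AbstractTensor* const& wrap(AbstractTensor* const& o) {
  return reinterpret_cast<TF_AbstractTensor* const&>(o);
}
static inline const TF_AbstractTensor* const& wrap(
    const AbstractTensor* const& o) {
  return reinterpret_cast<const TF_AbstractTensor* const&>(o);
}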
+ TFE_Context* eager_ctx = TF_ExecutionContextGetTFEContext(ctx); TFE_TensorHandle* t = TestScalarTensorHandle(eager_ctx, 2.0f); - TF_AbstractTensor* at = TF_NewAbstractTensor(); - TF_AbstractTensorSetEagerTensor(at, t, status.get()); + TF_AbstractTensor* at = + TF_CreateAbstractTensorFromEagerTensor(t, status.get()); ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); // Build an abstract operation. - auto* op = TF_NewAbstractOp(); + auto* op = TF_NewAbstractOp(ctx); TF_AbstractOpSetOpType(op, "Add", status.get()); ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); @@ -69,7 +69,6 @@ TEST(UnifedCAPI, TestBasicEager) { // Clean up operation and inputs. TF_DeleteAbstractOp(op); TF_DeleteAbstractTensor(at); - TFE_DeleteTensorHandle(t); // Verify the results. ASSERT_EQ(1, TF_OutputListNumOutputs(o)); @@ -83,100 +82,75 @@ TEST(UnifedCAPI, TestBasicEager) { TF_DeleteTensor(result_tensor); TF_DeleteAbstractTensor(result); - TFE_DeleteTensorHandle(result_t); TF_DeleteOutputList(o); - TFE_DeleteContext(eager_ctx); TF_DeleteExecutionContext(ctx); } -TEST(UnifedCAPI, TestBasicGraph) { - TF_ExecutionContext* ctx = TF_NewExecutionContext(); +TEST_P(UnifiedCAPI, TestBasicGraph) { std::unique_ptr status( TF_NewStatus(), TF_DeleteStatus); - - // Enter a graph context. - TF_Graph* g = TF_NewGraph(); - TF_GraphContext* graph_context = TF_NewGraphContext(g); - TF_ExecutionContextSetGraphContext(ctx, graph_context, status.get()); + // Start a new function / execution context. + string fn_name = "double"; + TF_ExecutionContext* graph_ctx = + TF_CreateFunction(fn_name.c_str(), status.get()); ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); - // Add a placeholder to the graph. - auto* placeholder_op = TF_NewOperation(g, "Placeholder", "Placeholder"); - TF_SetAttrType(placeholder_op, "dtype", TF_FLOAT); - auto* operation = TF_FinishOperation(placeholder_op, status.get()); - ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); - TF_Output placeholder_t = {operation, 0}; - TF_GraphTensor* graph_t = - TF_NewGraphTensor(graph_context, placeholder_t, status.get()); - ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); - TF_AbstractTensor* t = TF_NewAbstractTensor(); - TF_AbstractTensorSetGraphTensor(t, graph_t, status.get()); + auto* placeholder_t = + TF_AddFunctionParameter(graph_ctx, TF_FLOAT, status.get()); ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); // Build an abstract operation. - auto* op = TF_NewAbstractOp(); - TF_AbstractOpSetOpType(op, "Add", status.get()); + auto* add_op = TF_NewAbstractOp(graph_ctx); + TF_AbstractOpSetOpType(add_op, "Add", status.get()); ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); - TF_AbstractOpSetOpName(op, "my_add", status.get()); + TF_AbstractOpSetOpName(add_op, "my_add", status.get()); ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); // Build inputs and outputs. - TF_AbstractTensor* inputs[2] = {t, t}; - TF_OutputList* o = TF_NewOutputList(); + TF_AbstractTensor* inputs[2] = {placeholder_t, placeholder_t}; + TF_OutputList* add_outputs = TF_NewOutputList(); // Execute. - TF_ExecuteOperation(op, 2, inputs, o, ctx, status.get()); + TF_ExecuteOperation(add_op, 2, inputs, add_outputs, graph_ctx, status.get()); ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); // Clean up operation and inputs. 
- TF_DeleteAbstractOp(op); - TF_DeleteAbstractTensor(t); - TF_DeleteGraphTensor(graph_t); + TF_DeleteAbstractOp(add_op); - TF_AbstractTensor* result = TF_OutputListGet(o, 0); - TF_GraphTensor* result_graph_tensor = - TF_AbstractTensorGetGraphTensor(result, status.get()); - TF_DeleteAbstractTensor(result); - ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); - TF_Output result_output = - TF_GraphTensorToOutput(result_graph_tensor, status.get()); - TF_DeleteGraphTensor(result_graph_tensor); - ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); - string fn_name = "double"; - TF_Function* f = TF_GraphToFunction( - g, fn_name.c_str(), 0, -1, nullptr, 1, &placeholder_t, 1, &result_output, - nullptr, nullptr, fn_name.c_str(), status.get()); + TF_AbstractFunction* func = + TF_FinalizeFunction(graph_ctx, add_outputs, status.get()); ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); - // Build an eager context to run the function. + // Build eager context. TFE_ContextOptions* opts = TFE_NewContextOptions(); - TFE_Context* eager_ctx = TFE_NewContext(opts, status.get()); + TF_ExecutionContext* eager_execution_ctx = + TF_NewEagerExecutionContext(opts, status.get()); ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); TFE_DeleteContextOptions(opts); - // Build the abstract op to run the function. - TFE_ContextAddFunction(eager_ctx, f, status.get()); + TF_ExecutionContextRegisterFunction(eager_execution_ctx, func, status.get()); ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); - TF_AbstractOp* fn_op = TF_NewAbstractOp(); + // Build the abstract op to run the function. + TF_AbstractOp* fn_op = TF_NewAbstractOp(eager_execution_ctx); TF_AbstractOpSetOpType(fn_op, fn_name.c_str(), status.get()); ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); // Build an abstract input tensor. + TFE_Context* eager_ctx = + TF_ExecutionContextGetTFEContext(eager_execution_ctx); TFE_TensorHandle* input_eager = TestScalarTensorHandle(eager_ctx, 2.0f); - TF_AbstractTensor* input_t = TF_NewAbstractTensor(); - TF_AbstractTensorSetEagerTensor(input_t, input_eager, status.get()); + TF_AbstractTensor* input_t = + TF_CreateAbstractTensorFromEagerTensor(input_eager, status.get()); ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); - // Enter the eager context. 
- TF_ExecutionContextSetEagerContext(ctx, eager_ctx, status.get()); + TF_OutputListSetNumOutputs(add_outputs, 1, status.get()); ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); - TF_OutputListSetNumOutputs(o, 1, status.get()); - ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); - TF_ExecuteOperation(fn_op, 1, &input_t, o, ctx, status.get()); + TF_ExecuteOperation(fn_op, 1, &input_t, add_outputs, eager_execution_ctx, + status.get()); ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); - ASSERT_EQ(1, TF_OutputListNumOutputs(o)); - TF_AbstractTensor* final_result = TF_OutputListGet(o, 0); + ASSERT_EQ(1, TF_OutputListNumOutputs(add_outputs)); + TF_AbstractTensor* final_result = TF_OutputListGet(add_outputs, 0); TFE_TensorHandle* final = TF_AbstractTensorGetEagerTensor(final_result, status.get()); ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); @@ -185,20 +159,325 @@ TEST(UnifedCAPI, TestBasicGraph) { float* f_value = static_cast(TF_TensorData(f_t)); ASSERT_EQ(*f_value, 4.0); - TF_DeleteOutputList(o); + TF_DeleteOutputList(add_outputs); TF_DeleteAbstractOp(fn_op); TF_DeleteAbstractTensor(input_t); - TFE_DeleteTensorHandle(input_eager); TF_DeleteAbstractTensor(final_result); - TFE_DeleteTensorHandle(final); TF_DeleteTensor(f_t); - TF_DeleteFunction(f); + TF_DeleteAbstractFunction(func); - TF_DeleteGraphContext(graph_context); - TF_DeleteGraph(g); - TFE_DeleteContext(eager_ctx); - TF_DeleteExecutionContext(ctx); + TF_DeleteExecutionContext(eager_execution_ctx); } +TEST_P(UnifiedCAPI, TestMultiOutputGraph) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + TF_Status* s = status.get(); + + // Start a new function / execution context. + string fn_name = "two_adds"; + TF_ExecutionContext* graph_ctx = TF_CreateFunction(fn_name.c_str(), s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + + auto* arg0 = TF_AddFunctionParameter(graph_ctx, TF_FLOAT, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + auto* arg1 = TF_AddFunctionParameter(graph_ctx, TF_FLOAT, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + + // Create a first "Add" computing `arg0 + arg1`. + TF_AbstractTensor* add_output1; + { + // Build an abstract operation, inputs and output. + auto* add_op = TF_NewAbstractOp(graph_ctx); + TF_AbstractOpSetOpType(add_op, "Add", s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + TF_AbstractOpSetOpName(add_op, "my_add1", s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + TF_AbstractTensor* inputs[2] = {arg0, arg1}; + TF_OutputList* add_outputs = TF_NewOutputList(); + // Trace the operation now (create a node in the graph). + TF_ExecuteOperation(add_op, 2, inputs, add_outputs, graph_ctx, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + TF_DeleteAbstractOp(add_op); + // Extract the resulting tensor. + add_output1 = TF_OutputListGet(add_outputs, 0); + TF_DeleteOutputList(add_outputs); + } + + // Same with a second "Add" computing `arg1 + arg1`. + TF_AbstractTensor* add_output2; + { + // Build an abstract operation, inputs and output. + auto* add_op = TF_NewAbstractOp(graph_ctx); + TF_AbstractOpSetOpType(add_op, "Add", s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + TF_AbstractOpSetOpName(add_op, "my_add2", s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + TF_AbstractTensor* inputs[2] = {arg1, arg1}; + TF_OutputList* add_outputs = TF_NewOutputList(); + // Trace the operation now (create a node in the graph). 
+ TF_ExecuteOperation(add_op, 2, inputs, add_outputs, graph_ctx, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + TF_DeleteAbstractOp(add_op); + // Extract the resulting tensor. + add_output2 = TF_OutputListGet(add_outputs, 0); + TF_DeleteOutputList(add_outputs); + } + + // Finalize the function by providing the returned values. + TF_AbstractFunction* func; + { + // We want to return the output of both add operations, create a new list + // and populate it. + TF_OutputList* func_outputs = TF_NewOutputList(); + TF_OutputListPushBack(func_outputs, add_output1, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + TF_OutputListPushBack(func_outputs, add_output2, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + func = TF_FinalizeFunction(graph_ctx, func_outputs, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + TF_DeleteOutputList(func_outputs); + } + + /** + * We traced so far this function: + * + * def two_adds(a, b): + * my_add1 = a + b + * my_add2 = b + b + * return my_add1, my_add2 + * + * Now we will execute this function with an eager context: + * + * output1, output2 = two_adds(2.0, 3.0) + * + * and check that we got 5.0 and 6.0 as results. + */ + + // Build eager context. + TFE_ContextOptions* opts = TFE_NewContextOptions(); + TF_ExecutionContext* eager_execution_ctx = + TF_NewEagerExecutionContext(opts, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + TFE_DeleteContextOptions(opts); + + TF_ExecutionContextRegisterFunction(eager_execution_ctx, func, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + + // Build the abstract op to run the function. + TF_AbstractOp* fn_op = TF_NewAbstractOp(eager_execution_ctx); + TF_AbstractOpSetOpType(fn_op, fn_name.c_str(), s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + + // Build two abstract input tensors as function arguments. 
+ std::vector func_args; + { + TFE_Context* eager_ctx = + TF_ExecutionContextGetTFEContext(eager_execution_ctx); + TFE_TensorHandle* input_eager = TestScalarTensorHandle(eager_ctx, 2.0f); + func_args.push_back(TF_CreateAbstractTensorFromEagerTensor(input_eager, s)); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + input_eager = TestScalarTensorHandle(eager_ctx, 3.0f); + func_args.push_back(TF_CreateAbstractTensorFromEagerTensor(input_eager, s)); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + } + + TF_OutputList* func_outputs = TF_NewOutputList(); + TF_OutputListSetNumOutputs(func_outputs, 2, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + TF_ExecuteOperation(fn_op, func_args.size(), func_args.data(), func_outputs, + eager_execution_ctx, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + TF_DeleteAbstractOp(fn_op); + for (TF_AbstractTensor* t : func_args) TF_DeleteAbstractTensor(t); + + ASSERT_EQ(2, TF_OutputListNumOutputs(func_outputs)); + float results[2]; + for (int idx = 0; idx < 2; ++idx) { + TF_AbstractTensor* result = TF_OutputListGet(func_outputs, idx); + TFE_TensorHandle* handle = TF_AbstractTensorGetEagerTensor(result, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + TF_Tensor* f_t = TFE_TensorHandleResolve(handle, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + results[idx] = *static_cast(TF_TensorData(f_t)); + TF_DeleteTensor(f_t); + } + ASSERT_EQ(results[0], 5.0); + ASSERT_EQ(results[1], 6.0); + + for (int idx = 0; idx < 2; ++idx) { + TF_AbstractTensor* result = TF_OutputListGet(func_outputs, idx); + TF_DeleteAbstractTensor(result); + } + TF_DeleteOutputList(func_outputs); + TF_DeleteExecutionContext(eager_execution_ctx); + TF_DeleteAbstractFunction(func); +} + +TEST(UnifiedCAPI, TF_ExecutionContextToFunctionWithEagerContextRaises) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + TFE_ContextOptions* opts = TFE_NewContextOptions(); + TF_ExecutionContext* ctx = TF_NewEagerExecutionContext(opts, status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + TFE_DeleteContextOptions(opts); + + TF_AbstractFunction* func = TF_FinalizeFunction(ctx, nullptr, status.get()); + ASSERT_EQ(nullptr, func); + ASSERT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(status.get())); +} + +TEST_P(UnifiedCAPI, TF_CallingSetOpTypeAfterFinishingOpBuildingRaises) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + TF_ExecutionContext* graph_ctx = TF_CreateFunction("some_func", status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + + // Add a placeholder to the graph. + auto* placeholder_op = TF_NewAbstractOp(graph_ctx); + TF_AbstractOpSetOpType(placeholder_op, "Placeholder", status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + TF_AbstractOpSetOpName(placeholder_op, "my_ph", status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + + // This should fail. + TF_AbstractOpSetOpType(placeholder_op, "Placeholder", status.get()); + ASSERT_EQ(TF_FAILED_PRECONDITION, TF_GetCode(status.get())); + + TF_DeleteAbstractOp(placeholder_op); + TF_DeleteExecutionContext(graph_ctx); +} + +TEST_P(UnifiedCAPI, TF_CallingSetOpNameAfterFinishingOpBuildingRaises) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + TF_ExecutionContext* graph_ctx = TF_CreateFunction("some_func", status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + + // Add a placeholder to the graph. 
+ auto* placeholder_op = TF_NewAbstractOp(graph_ctx); + TF_AbstractOpSetOpType(placeholder_op, "Placeholder", status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + TF_AbstractOpSetOpName(placeholder_op, "my_ph", status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + + // This should fail. + TF_AbstractOpSetOpName(placeholder_op, "my_ph", status.get()); + ASSERT_EQ(TF_FAILED_PRECONDITION, TF_GetCode(status.get())); + + TF_DeleteAbstractOp(placeholder_op); + TF_DeleteExecutionContext(graph_ctx); +} + +TEST_P(UnifiedCAPI, TestExecutingEagerOpInGraphModeRaises) { + // Build an Eager context. + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + TFE_ContextOptions* opts = TFE_NewContextOptions(); + TF_ExecutionContext* ctx = TF_NewEagerExecutionContext(opts, status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + TFE_DeleteContextOptions(opts); + + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + + // Build an Eager operation. + auto* op = TF_NewAbstractOp(ctx); + TF_AbstractOpSetOpType(op, "Add", status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + + // Build an abstract input tensor. + TFE_Context* eager_ctx = TF_ExecutionContextGetTFEContext(ctx); + TFE_TensorHandle* t = TestScalarTensorHandle(eager_ctx, 2.0f); + TF_AbstractTensor* at = + TF_CreateAbstractTensorFromEagerTensor(t, status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + + // Build inputs and outputs. + TF_AbstractTensor* inputs[2] = {at, at}; + TF_OutputList* o = TF_NewOutputList(); + TF_OutputListSetNumOutputs(o, 1, status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + + // Build a Graph context. + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + TF_ExecutionContext* graph_ctx = TF_CreateFunction("some_func", status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + + // Execute eager op using graph context. + TF_ExecuteOperation(op, 2, inputs, o, graph_ctx, status.get()); + ASSERT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(status.get())); + + // Clean up operation and inputs. + TF_DeleteAbstractOp(op); + TF_DeleteAbstractTensor(at); + + TF_DeleteOutputList(o); + TF_DeleteExecutionContext(ctx); + TF_DeleteExecutionContext(graph_ctx); +} + +TEST_P(UnifiedCAPI, TestExecutingGraphOpInEagerModeRaises) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + TF_ExecutionContext* graph_ctx = TF_CreateFunction("some_func", status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + + // Add a placeholder to the graph. + auto* placeholder_op = TF_NewAbstractOp(graph_ctx); + TF_AbstractOpSetOpType(placeholder_op, "Placeholder", status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + TF_AbstractOpSetOpName(placeholder_op, "my_ph", status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + TF_AbstractOpSetAttrType(placeholder_op, "dtype", TF_FLOAT, status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + + // Build inputs and outputs. + TF_OutputList* placeholder_outputs = TF_NewOutputList(); + + // Execute. 
+ TF_ExecuteOperation(placeholder_op, 0, nullptr, placeholder_outputs, + graph_ctx, status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + ASSERT_EQ(1, TF_OutputListNumOutputs(placeholder_outputs)); + TF_AbstractTensor* placeholder_t = TF_OutputListGet(placeholder_outputs, 0); + + // Delete placeholder op. + TF_DeleteAbstractOp(placeholder_op); + + // Build an abstract operation. + auto* add_op = TF_NewAbstractOp(graph_ctx); + TF_AbstractOpSetOpType(add_op, "Add", status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + TF_AbstractOpSetOpName(add_op, "my_add", status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + + // Build inputs and outputs. + TF_AbstractTensor* inputs[2] = {placeholder_t, placeholder_t}; + TF_OutputList* add_outputs = TF_NewOutputList(); + + // Build eager context. + TFE_ContextOptions* opts = TFE_NewContextOptions(); + TF_ExecutionContext* eager_execution_ctx = + TF_NewEagerExecutionContext(opts, status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + TFE_DeleteContextOptions(opts); + + // Execute. + TF_ExecuteOperation(add_op, 2, inputs, add_outputs, eager_execution_ctx, + status.get()); + ASSERT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(status.get())); + + // Clean up operation and inputs. + TF_DeleteAbstractTensor(placeholder_t); + TF_DeleteAbstractOp(add_op); + TF_DeleteOutputList(add_outputs); + TF_DeleteOutputList(placeholder_outputs); + TF_DeleteExecutionContext(graph_ctx); + TF_DeleteExecutionContext(eager_execution_ctx); +} + +INSTANTIATE_TEST_SUITE_P(Tracing, UnifiedCAPI, ::testing::Values("graphdef")); + } // namespace } // namespace tensorflow diff --git a/tensorflow/c/eager/context_interface.h b/tensorflow/c/eager/context_interface.h index 157f10c7fec..2861fa43b66 100644 --- a/tensorflow/c/eager/context_interface.h +++ b/tensorflow/c/eager/context_interface.h @@ -17,9 +17,11 @@ limitations under the License. #include +#include "absl/types/optional.h" #include "absl/types/span.h" #include "tensorflow/c/eager/operation_interface.h" #include "tensorflow/c/eager/tensor_handle_interface.h" +#include "tensorflow/c/experimental/saved_model/core/saved_model_api.h" #include "tensorflow/c/tensor_interface.h" #include "tensorflow/core/framework/numeric_types.h" #include "tensorflow/core/framework/types.pb.h" @@ -57,16 +59,51 @@ class AbstractContextInterface { virtual AbstractTensorInterface* CreateTensor( DataType dtype, absl::Span dim_sizes) = 0; + typedef void (*MemoryReleaser)(void* data, size_t len, void* arg); + + // Create a tensor instance from the given data buffer and description. + // `memory_releaser` will be called on destruction, and it's responsible for + // cleaning up the underlying buffer. `convert_string` indicates whether it + // has to handle tstring conversion. Expected to be removed once tstring + // migration is done. + virtual AbstractTensorInterface* CreateTensor(DataType dtype, + const int64_t* dims, + int num_dims, void* data, + size_t len, bool convert_string, + MemoryReleaser memory_releaser, + void* memory_releaser_arg) = 0; + // Create a handle to wrap and manage a Tensor virtual AbstractTensorHandleInterface* CreateLocalHandle( AbstractTensorInterface* t) = 0; + // Copy the handle to another device. 
+ virtual AbstractTensorHandleInterface* CopyTensorHandleToDevice( + AbstractTensorHandleInterface* handle, const char* device_name, + Status* status) = 0; // Create an operation to perform op execution virtual AbstractOperationInterface* CreateOperation() = 0; + // Load a SavedModelAPI object from the given directory and tags + virtual std::unique_ptr LoadSavedModelAPI( + const std::string& directory, + const absl::optional>& tags, + tensorflow::Status* status) = 0; + // List attributes of available devices virtual void ListDevices(std::vector* devices) = 0; + virtual void ClearCachesAndThreadExecutors() = 0; + + // Initialize the step resource container for a training step. This is used + // in current TF runtime. For tfrt, it is used by fallback op handler. + virtual void StartStep() = 0; + // Destroy the step resource container for a training step. + virtual void EndStep() = 0; + + // Block until all pending nodes are finished. + virtual Status AsyncWait() = 0; + protected: virtual ~AbstractContextInterface() {} }; diff --git a/tensorflow/c/eager/custom_device_test.cc b/tensorflow/c/eager/custom_device_test.cc index 1ec9e9bd99a..1c078d4f42c 100644 --- a/tensorflow/c/eager/custom_device_test.cc +++ b/tensorflow/c/eager/custom_device_test.cc @@ -16,6 +16,7 @@ limitations under the License. // A simple logging device to test custom device registration. #include +#include "absl/strings/match.h" #include "tensorflow/c/c_api.h" #include "tensorflow/c/eager/c_api.h" #include "tensorflow/c/eager/c_api_experimental.h" @@ -25,7 +26,6 @@ limitations under the License. #include "tensorflow/core/lib/gtl/cleanup.h" #include "tensorflow/core/platform/test.h" - TEST(CUSTOM_DEVICE, RegisterSimpleDevice) { std::unique_ptr status( TF_NewStatus(), TF_DeleteStatus); @@ -176,7 +176,7 @@ TEST(CUSTOM_DEVICE, MakeVariable) { ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); } -TEST(CUSTOM_DEVICE, AccessVariableOnWrongDevice) { +TEST(CUSTOM_DEVICE, AccessVariableOnCustomDevice) { std::unique_ptr status( TF_NewStatus(), TF_DeleteStatus); std::unique_ptr opts( @@ -226,16 +226,21 @@ TEST(CUSTOM_DEVICE, AccessVariableOnWrongDevice) { // Read the variable's value. op.reset(TFE_NewOp(context.get(), "ReadVariableOp", status.get())); - TFE_OpAddInput(op.get(), var_handle, status.get()); - TFE_OpSetAttrType(op.get(), "dtype", TF_FLOAT); ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + TFE_OpAddInput(op.get(), var_handle, status.get()); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + TFE_OpSetAttrType(op.get(), "dtype", TF_FLOAT); executed = false; num_retvals = 1; TFE_TensorHandle* var_value = nullptr; TFE_Execute(op.get(), &var_value, &num_retvals, status.get()); - EXPECT_FALSE(TF_GetCode(status.get()) == TF_OK) - << "Execution should fail because the variable is being used on the " - "wrong device."; + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + ASSERT_TRUE(executed); + ASSERT_EQ( + tensorflow::string(name), + tensorflow::string(TFE_TensorHandleDeviceName(var_value, status.get()))); + TFE_DeleteTensorHandle(var_value); + // Free the backing buffer for the variable. 
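Regarding the new buffer-based CreateTensor overload on AbstractContextInterface above: the MemoryReleaser is invoked once the runtime is done with the buffer, so the caller keeps allocation and deallocation symmetric. A hedged sketch of a call site follows; the WrapBuffer helper, shape, and values are made up for illustration, and only the CreateTensor signature comes from this change.

#include <cstddef>
#include <cstdint>
#include <cstdlib>

#include "tensorflow/c/eager/context_interface.h"

namespace {
// Called by the runtime when the tensor no longer needs the buffer.
void FreeBuffer(void* data, size_t /*len*/, void* /*arg*/) { std::free(data); }
}  // namespace

// Wraps a caller-owned float buffer into a 2x3 tensor without copying.
tensorflow::AbstractTensorInterface* WrapBuffer(
    tensorflow::AbstractContextInterface* ctx) {
  constexpr int64_t dims[] = {2, 3};
  float* data = static_cast<float*>(std::malloc(6 * sizeof(float)));
  for (int i = 0; i < 6; ++i) data[i] = static_cast<float>(i);
  return ctx->CreateTensor(tensorflow::DT_FLOAT, dims, /*num_dims=*/2, data,
                           /*len=*/6 * sizeof(float), /*convert_string=*/false,
                           /*memory_releaser=*/&FreeBuffer,
                           /*memory_releaser_arg=*/nullptr);
}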
op.reset(TFE_NewOp(context.get(), "DestroyResourceOp", status.get())); TFE_OpAddInput(op.get(), var_handle, status.get()); @@ -246,6 +251,79 @@ TEST(CUSTOM_DEVICE, AccessVariableOnWrongDevice) { ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); } +TEST(CUSTOM_DEVICE, InputBasedPlacement) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + std::unique_ptr opts( + TFE_NewContextOptions(), TFE_DeleteContextOptions); + std::unique_ptr context( + TFE_NewContext(opts.get(), status.get()), TFE_DeleteContext); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + + const char* custom0 = "/job:localhost/replica:0/task:0/device:CUSTOM:0"; + const char* custom1 = "/job:localhost/replica:0/task:0/device:CUSTOM:1"; + bool arrived = false; + bool executed = false; + RegisterLoggingDevice(context.get(), custom0, &arrived, &executed, + status.get()); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + RegisterLoggingDevice(context.get(), custom1, &arrived, &executed, + status.get()); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + + std::unique_ptr hcpu( + TestMatrixTensorHandle(context.get()), TFE_DeleteTensorHandle); + ASSERT_FALSE(arrived); + std::unique_ptr hcustom0( + TFE_TensorHandleCopyToDevice(hcpu.get(), context.get(), custom0, + status.get()), + TFE_DeleteTensorHandle); + ASSERT_TRUE(arrived); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + arrived = false; + std::unique_ptr hcustom1( + TFE_TensorHandleCopyToDevice(hcpu.get(), context.get(), custom1, + status.get()), + TFE_DeleteTensorHandle); + ASSERT_TRUE(arrived); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + + // Base case: two CPU inputs executes fine. + std::unique_ptr matmul( + MatMulOp(context.get(), hcpu.get(), hcpu.get()), TFE_DeleteOp); + TFE_TensorHandle* retval; + int num_retvals = 1; + TFE_Execute(matmul.get(), &retval, &num_retvals, status.get()); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + TFE_DeleteTensorHandle(retval); + + // Custom device: inputs in same custom device works. + matmul.reset(MatMulOp(context.get(), hcustom0.get(), hcustom0.get())); + num_retvals = 1; + executed = false; + TFE_Execute(matmul.get(), &retval, &num_retvals, status.get()); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + ASSERT_TRUE(executed); + TFE_DeleteTensorHandle(retval); + + // Custom device: inputs in different custom devices fails. + matmul.reset(MatMulOp(context.get(), hcustom0.get(), hcustom1.get())); + num_retvals = 1; + TFE_Execute(matmul.get(), &retval, &num_retvals, status.get()); + ASSERT_NE(TF_OK, TF_GetCode(status.get())); + ASSERT_TRUE(absl::StrContains(TF_Message(status.get()), custom0)); + ASSERT_TRUE(absl::StrContains(TF_Message(status.get()), custom1)); + + // Custom device: mix of custom/physical fails. 
+ matmul.reset(MatMulOp(context.get(), hcustom0.get(), hcpu.get())); + num_retvals = 1; + TFE_Execute(matmul.get(), &retval, &num_retvals, status.get()); + ASSERT_NE(TF_OK, TF_GetCode(status.get())); + ASSERT_TRUE(absl::StrContains(TF_Message(status.get()), custom0)); + ASSERT_TRUE( + absl::StrContains(TF_Message(status.get()), "[]")); // kVariantDeviceNull +} + TEST(CUSTOM_DEVICE, InvalidRegistrationError) { std::unique_ptr status( TF_NewStatus(), TF_DeleteStatus); diff --git a/tensorflow/c/eager/dlpack.cc b/tensorflow/c/eager/dlpack.cc index 9f9bd85eba2..a0d6fe914c2 100644 --- a/tensorflow/c/eager/dlpack.cc +++ b/tensorflow/c/eager/dlpack.cc @@ -16,8 +16,10 @@ limitations under the License. #include "tensorflow/c/eager/dlpack.h" #include "include/dlpack/dlpack.h" // from @dlpack -#include "tensorflow/c/eager/c_api_internal.h" -#include "tensorflow/c/tf_status_helper.h" +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/eager/tfe_tensorhandle_internal.h" +#include "tensorflow/c/tf_status_internal.h" #include "tensorflow/core/common_runtime/eager/tensor_handle.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_reference.h" @@ -41,15 +43,15 @@ struct TfDlManagedTensorCtx { // Gets tensor from eager tensor handle. const Tensor* GetTensorFromHandle(TFE_TensorHandle* h, TF_Status* status) { - if (h == nullptr || h->handle == nullptr) { + if (h == nullptr) { status->status = tensorflow::errors::InvalidArgument("Invalid handle"); return nullptr; } tensorflow::TensorHandle* handle = - tensorflow::TensorHandleFromInterface(h->handle); - if (handle->IsRemote()) { + tensorflow::TensorHandleFromInterface(tensorflow::unwrap(h)); + if (handle->Type() != TensorHandle::LOCAL) { status->status = tensorflow::errors::InvalidArgument( - "DLPack doesn't support remote tensor"); + "DLPack doesn't support ", handle->TypeString(), " tensor"); return nullptr; } const tensorflow::Tensor* tensor; @@ -107,7 +109,7 @@ DLDataType GetDlDataType(TF_DataType data_type, TF_Status* status) { // Gets DLPack's DLContext from eager tensor handle. DLContext GetDlContext(TFE_TensorHandle* h, TF_Status* status) { DLContext ctx; - const char* device_name = h->handle->DeviceName(&status->status); + const char* device_name = tensorflow::unwrap(h)->DeviceName(&status->status); DeviceNameUtils::ParsedName parsed_name; tensorflow::DeviceNameUtils::ParseFullName(device_name, &parsed_name); std::string device_type = parsed_name.type; diff --git a/tensorflow/c/eager/operation_interface.h b/tensorflow/c/eager/operation_interface.h index 4651d45ec04..844ba6c14bd 100644 --- a/tensorflow/c/eager/operation_interface.h +++ b/tensorflow/c/eager/operation_interface.h @@ -42,7 +42,28 @@ class AbstractOperationInterface { virtual Status Reset(const char* op, const char* raw_device_name) = 0; virtual const string& Name() const = 0; + + // Returns the operation's device name. + // + // The value returned may be different from the one set by SetDeviceName, but + // it will be compatible with it: the name will be updated by device placement + // logic to refer to the specific device chosen. + // + // Example: If one calls `op->SetDeviceName("/device:GPU")`, the value + // returned by DeviceName should be "/device:GPU:*" until a particular GPU is + // chosen for the operation by the device placement logic in the + // executor. 
After that, the value returned by DeviceName will be a full + // device name such as "/job:localhost/replica:0/task:0/device:GPU:1". virtual const string& DeviceName() const = 0; + + // Sets the operation device name. + // + // The given `name` must be parseable by DeviceNameUtils::ParseFullName, and + // the result will be used as a constraint for device placement. See the + // documentation for DeviceName for more details. + // + // The value will override the previous value - that is, no "merging" of + // existing and given constraints will be performed. virtual Status SetDeviceName(const char* name) = 0; virtual Status AddInput(AbstractTensorHandleInterface* input) = 0; diff --git a/tensorflow/c/eager/parallel_device/BUILD b/tensorflow/c/eager/parallel_device/BUILD index 9d787d26433..3b2640e14d1 100644 --- a/tensorflow/c/eager/parallel_device/BUILD +++ b/tensorflow/c/eager/parallel_device/BUILD @@ -7,10 +7,27 @@ package( licenses = ["notice"], # Apache 2.0 ) +# Currently pybind extension shared objects must use only C API headers since +# the C API has static initializers duplicated in the Python bindings. So we +# need a second rule that omits .cc files, in +# tensorflow/python:_pywrap_parallel_device. +filegroup( + name = "headers", + srcs = ["parallel_device.h"], + visibility = ["//tensorflow/python:__pkg__"], +) + +filegroup( + name = "sources", + srcs = ["parallel_device.cc"], + visibility = ["//tensorflow/python:__pkg__"], +) + cc_library( name = "parallel_device", - srcs = ["parallel_device.cc"], - hdrs = ["parallel_device.h"], + srcs = [":sources"], + hdrs = [":headers"], + visibility = ["//tensorflow:internal"], deps = [ "//tensorflow/c:c_api", "//tensorflow/c/eager:c_api", @@ -27,6 +44,7 @@ tf_cc_test( srcs = ["parallel_device_test.cc"], deps = [ ":parallel_device", + ":parallel_device_ops", "//tensorflow/c:c_api", "//tensorflow/c:c_api_experimental", "//tensorflow/c/eager:c_api", @@ -36,3 +54,19 @@ tf_cc_test( "//tensorflow/core:test_main", ], ) + +# Note: ParallelDevice-specific ops are experimental and not currently linked in +# to TensorFlow by default, just used in a few tests. +filegroup( + name = "parallel_device_ops_srcs", + srcs = ["parallel_device_ops.cc"], + visibility = ["//tensorflow/python/distribute/parallel_device:__pkg__"], +) + +cc_library( + name = "parallel_device_ops", + srcs = [":parallel_device_ops_srcs"], + visibility = ["//tensorflow:internal"], + deps = ["//tensorflow/core:framework"], + alwayslink = 1, +) diff --git a/tensorflow/c/eager/parallel_device/parallel_device.cc b/tensorflow/c/eager/parallel_device/parallel_device.cc index bd5d8e777f2..27c2699c4c2 100644 --- a/tensorflow/c/eager/parallel_device/parallel_device.cc +++ b/tensorflow/c/eager/parallel_device/parallel_device.cc @@ -92,6 +92,10 @@ class ParallelDevice { TFE_TensorHandle* tensor, TF_Status* status) const; + // A parallel tensor with scalar integers numbering component devices. + std::unique_ptr DeviceIDs(TFE_Context* context, + TF_Status* status) const; + // Takes a description of a single operation being executed on the // ParallelDevice, and in turn runs one operation per component device with // its corresponding inputs from the input ParallelTensors (or @@ -208,6 +212,46 @@ std::unique_ptr ParallelDevice::CopyToParallelDevice( status); } +std::unique_ptr ParallelDevice::DeviceIDs( + TFE_Context* context, TF_Status* status) const { + // TODO(allenl): We could cache DeviceIDs (keyed by context). 
+ std::vector components; + components.reserve(underlying_devices_.size()); + for (int device_index = 0; device_index < underlying_devices_.size(); + ++device_index) { + int64_t* device_id = new int64_t; + *device_id = device_index; + std::unique_ptr tensor( + TF_NewTensor( + TF_INT64, /*dims=*/nullptr, /*num_dims=*/0, device_id, + sizeof(int64_t), + [](void* data, size_t, void* arg) { + delete reinterpret_cast(data); + }, + nullptr), + TF_DeleteTensor); + // TODO(allenl): Here and when executing regular operations, we could hold + // on to one TFE_Op per device and just call TFE_ResetOp to avoid parsing + // device names repeatedly. + OpPtr const_op(TFE_NewOp(context, "Const", status)); + if (TF_GetCode(status) != TF_OK) return nullptr; + TFE_OpSetDevice(const_op.get(), underlying_devices_[device_index].c_str(), + status); + if (TF_GetCode(status) != TF_OK) return nullptr; + TFE_OpSetAttrTensor(const_op.get(), "value", tensor.get(), status); + if (TF_GetCode(status) != TF_OK) return nullptr; + TFE_OpSetAttrType(const_op.get(), "dtype", TF_INT64); + TFE_TensorHandle* device_handle; + int num_outputs = 1; + TFE_Execute(const_op.get(), &device_handle, &num_outputs, status); + if (TF_GetCode(status) != TF_OK) return nullptr; + components.emplace_back(device_handle); + if (TF_GetCode(status) != TF_OK) return nullptr; + } + return ParallelTensor::FromTensorHandles(*this, std::move(components), + status); +} + absl::optional> ParallelDevice::Execute( TFE_Context* context, std::vector inputs, const char* operation_name, const TFE_OpAttrs* attributes, @@ -282,6 +326,13 @@ absl::optional> ParallelDevice::Execute( } result.emplace(std::move(outputs)); return result; + } else if (operation_name == std::string("DeviceID")) { + std::vector result_content; + result_content.reserve(1); + result_content.push_back(DeviceIDs(context, status)); + if (TF_GetCode(status) != TF_OK) return result; + result.emplace(std::move(result_content)); + return result; } absl::optional>> maybe_parallel_results( @@ -574,23 +625,21 @@ void DeleteParallelDevice(void* device_info) { } // namespace -void RegisterParallelDevice(TFE_Context* context, const char* device_name, - const char** underlying_devices, - int num_underlying_devices, TF_Status* status) { - TFE_CustomDevice custom_device; - custom_device.copy_tensor_to_device = &CopyToParallelDevice; - custom_device.copy_tensor_from_device = &CopyTensorFromParallelDevice; - custom_device.delete_device = &DeleteParallelDevice; - custom_device.execute = &ParallelDeviceExecute; +void AllocateParallelDevice(const char* device_name, + const char* const* underlying_devices, + int num_underlying_devices, + TFE_CustomDevice* device, void** device_info) { + device->copy_tensor_to_device = &CopyToParallelDevice; + device->copy_tensor_from_device = &CopyTensorFromParallelDevice; + device->delete_device = &DeleteParallelDevice; + device->execute = &ParallelDeviceExecute; std::vector underlying_devices_vector; underlying_devices_vector.reserve(num_underlying_devices); for (int device_index = 0; device_index < num_underlying_devices; ++device_index) { underlying_devices_vector.push_back(underlying_devices[device_index]); } - ParallelDevice* d = - new ParallelDevice(device_name, underlying_devices_vector); - TFE_RegisterCustomDevice(context, custom_device, device_name, d, status); + *device_info = new ParallelDevice(device_name, underlying_devices_vector); } } // namespace eager diff --git a/tensorflow/c/eager/parallel_device/parallel_device.h 
b/tensorflow/c/eager/parallel_device/parallel_device.h index b106524401f..f448a4c5b83 100644 --- a/tensorflow/c/eager/parallel_device/parallel_device.h +++ b/tensorflow/c/eager/parallel_device/parallel_device.h @@ -16,12 +16,14 @@ limitations under the License. #ifndef TENSORFLOW_C_EAGER_PARALLEL_DEVICE_PARALLEL_DEVICE_H_ #define TENSORFLOW_C_EAGER_PARALLEL_DEVICE_PARALLEL_DEVICE_H_ +#include "tensorflow/c/c_api.h" #include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/eager/c_api_experimental.h" namespace tensorflow { namespace eager { -// Register a parallel device named `device_name` which forwards operations to +// Allocate a parallel device named `device_name` which forwards operations to // `underlying_devices`, maintaining "parallel tensors" with components placed // on each underlying device. // @@ -50,11 +52,12 @@ namespace eager { // TPUReplicatedOutput(input=x, num_replicas=2)` un-packs the parallel tensor // into its components. // -// `context` owns the parallel device. `underlying_devices` must stay valid -// while the parallel device is in use. -void RegisterParallelDevice(TFE_Context* context, const char* device_name, - const char** underlying_devices, - int num_underlying_devices, TF_Status* status); +// The filled `device` struct and the allocated `device_info` struct may be +// passed to TFE_RegisterCustomDevice. The `device_name` arguments must match. +void AllocateParallelDevice(const char* device_name, + const char* const* underlying_devices, + int num_underlying_devices, + TFE_CustomDevice* device, void** device_info); } // namespace eager } // namespace tensorflow diff --git a/tensorflow/c/eager/parallel_device/parallel_device_ops.cc b/tensorflow/c/eager/parallel_device/parallel_device_ops.cc new file mode 100644 index 00000000000..1decffca047 --- /dev/null +++ b/tensorflow/c/eager/parallel_device/parallel_device_ops.cc @@ -0,0 +1,26 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/op.h" + +// TODO(allenl): Figure out if we need this op, and if so whether we should move +// it to core TF. Right now the eager C API does some checking of op +// registrations before calling into custom devices, but we may be able to avoid +// that. +REGISTER_OP("DeviceID") + .Output("device_id: int64") + .SetIsStateful() + .SetShapeFn(tensorflow::shape_inference::ScalarShape); diff --git a/tensorflow/c/eager/parallel_device/parallel_device_test.cc b/tensorflow/c/eager/parallel_device/parallel_device_test.cc index 41c7d64e231..fdc140407df 100644 --- a/tensorflow/c/eager/parallel_device/parallel_device_test.cc +++ b/tensorflow/c/eager/parallel_device/parallel_device_test.cc @@ -278,14 +278,28 @@ TensorHandlePtr Multiply(TFE_Context* context, TFE_TensorHandle* first, } // Assert that `handle` is equal to `expected_value`. 
-void AssertScalarFloatEq(TFE_TensorHandle* handle, float expected_value) { +template +void ExpectScalarEq(TFE_TensorHandle* handle, value_type expected_value) { std::unique_ptr status( TF_NewStatus(), TF_DeleteStatus); std::unique_ptr value_zero( TFE_TensorHandleResolve(handle, status.get()), TF_DeleteTensor); ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); - ASSERT_EQ(expected_value, - *static_cast(TF_TensorData(value_zero.get()))); + EXPECT_EQ(expected_value, + *static_cast(TF_TensorData(value_zero.get()))); +} + +template +void RegisterParallelDevice( + TFE_Context* context, const char* device_name, + const std::array& underlying_devices, + TF_Status* status) { + TFE_CustomDevice device; + void* device_info; + tensorflow::eager::AllocateParallelDevice( + device_name, underlying_devices.data(), underlying_devices.size(), + &device, &device_info); + TFE_RegisterCustomDevice(context, device, device_name, device_info, status); } // Create and modify a variable placed on a parallel device which composes @@ -297,9 +311,8 @@ void BasicTestsForTwoDevices(TFE_Context* context, const char* first_device, TF_NewStatus(), TF_DeleteStatus); const char* device_name = "/job:localhost/replica:0/task:0/device:CUSTOM:0"; std::array underlying_devices{first_device, second_device}; - tensorflow::eager::RegisterParallelDevice( - context, device_name, underlying_devices.data(), - underlying_devices.size(), status.get()); + RegisterParallelDevice(context, device_name, underlying_devices, + status.get()); ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); // Create a variable handle (uninitialized to start) placed on the parallel @@ -331,8 +344,8 @@ void BasicTestsForTwoDevices(TFE_Context* context, const char* first_device, ExtractPerDeviceValues(context, read.get(), &components, status.get()); ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); - AssertScalarFloatEq(components[0].get(), 20.); - AssertScalarFloatEq(components[1].get(), 20.); + ExpectScalarEq(components[0].get(), 20.); + ExpectScalarEq(components[1].get(), 20.); std::string first_device = TFE_TensorHandleBackingDeviceName(components[0].get(), status.get()); @@ -361,8 +374,8 @@ void BasicTestsForTwoDevices(TFE_Context* context, const char* first_device, ExtractPerDeviceValues(context, read.get(), &components, status.get()); ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); - AssertScalarFloatEq(components[0].get(), 23.); - AssertScalarFloatEq(components[1].get(), 18.); + ExpectScalarEq(components[0].get(), 23.); + ExpectScalarEq(components[1].get(), 18.); std::string first_device = TFE_TensorHandleBackingDeviceName(components[0].get(), status.get()); @@ -371,6 +384,32 @@ void BasicTestsForTwoDevices(TFE_Context* context, const char* first_device, TFE_TensorHandleBackingDeviceName(components[1].get(), status.get()); ASSERT_EQ(underlying_devices[1], second_device); } + // Compute the device ID twice and verify the result + for (int i = 0; i < 2; ++i) { + std::unique_ptr op( + TFE_NewOp(context, "DeviceID", status.get()), TFE_DeleteOp); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + TFE_OpSetDevice(op.get(), device_name, status.get()); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + + TFE_TensorHandle* result_handle; + int num_retvals = 1; + TFE_Execute(op.get(), &result_handle, &num_retvals, status.get()); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << 
TF_Message(status.get()); + std::array components; + ExtractPerDeviceValues(context, result_handle, &components, status.get()); + TFE_DeleteTensorHandle(result_handle); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + + ExpectScalarEq(components[0].get(), 0); + ExpectScalarEq(components[1].get(), 1); + std::string first_device = + TFE_TensorHandleBackingDeviceName(components[0].get(), status.get()); + ASSERT_EQ(underlying_devices[0], first_device); + std::string second_device = + TFE_TensorHandleBackingDeviceName(components[1].get(), status.get()); + ASSERT_EQ(underlying_devices[1], second_device); + } } TEST(PARALLEL_DEVICE, TestBasicCPU) { @@ -456,16 +495,14 @@ TEST(PARALLEL_DEVICE, TestExplicitCopies) { ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); const char* device_name = "/job:localhost/replica:0/task:0/device:CUSTOM:0"; - std::vector underlying_devices; const char* first_device_name = "/job:localhost/replica:0/task:0/device:CPU:0"; - underlying_devices.push_back(first_device_name); const char* second_device_name = "/job:localhost/replica:0/task:0/device:CPU:1"; - underlying_devices.push_back(second_device_name); - tensorflow::eager::RegisterParallelDevice( - context.get(), device_name, underlying_devices.data(), - underlying_devices.size(), status.get()); + std::array underlying_devices{first_device_name, + second_device_name}; + RegisterParallelDevice(context.get(), device_name, underlying_devices, + status.get()); ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); TensorHandlePtr cpu_value(FloatTensorHandle(3., status.get())); @@ -488,8 +525,8 @@ TEST(PARALLEL_DEVICE, TestExplicitCopies) { ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); // The value of the original tensor is replicated on each device. - AssertScalarFloatEq(components[0].get(), 3.); - AssertScalarFloatEq(components[1].get(), 3.); + ExpectScalarEq(components[0].get(), 3.); + ExpectScalarEq(components[1].get(), 3.); // Verify that the mirrors are placed on the component devices. 
std::string first_device = @@ -524,12 +561,11 @@ TEST(PARALLEL_DEVICE, TestDifferentShapes) { ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); const char* device_name = "/job:localhost/replica:0/task:0/device:CUSTOM:0"; - std::vector underlying_devices; - underlying_devices.push_back("/job:localhost/replica:0/task:0/device:CPU:0"); - underlying_devices.push_back("/job:localhost/replica:0/task:0/device:CPU:1"); - tensorflow::eager::RegisterParallelDevice( - context.get(), device_name, underlying_devices.data(), - underlying_devices.size(), status.get()); + std::array underlying_devices{ + "/job:localhost/replica:0/task:0/device:CPU:0", + "/job:localhost/replica:0/task:0/device:CPU:1"}; + RegisterParallelDevice(context.get(), device_name, underlying_devices, + status.get()); ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); // Create two vectors with different lengths @@ -570,24 +606,22 @@ TEST(PARALLEL_DEVICE, TestNestedParallelDevices) { // Create a parallel device with two CPUs const char* first_device_name = "/job:localhost/replica:0/task:0/device:CUSTOM:0"; - std::vector first_underlying_devices{ + std::array first_underlying_devices{ "/job:localhost/replica:0/task:0/device:CPU:0", "/job:localhost/replica:0/task:0/device:CPU:1"}; - tensorflow::eager::RegisterParallelDevice( - context.get(), first_device_name, first_underlying_devices.data(), - first_underlying_devices.size(), status.get()); + RegisterParallelDevice(context.get(), first_device_name, + first_underlying_devices, status.get()); ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); // Create a second parallel device with the first parallel device and one // additional CPU. const char* second_device_name = "/job:localhost/replica:0/task:0/device:CUSTOM:1"; - std::vector second_underlying_devices{ + std::array second_underlying_devices{ "/job:localhost/replica:0/task:0/device:CUSTOM:0", "/job:localhost/replica:0/task:0/device:CPU:2"}; - tensorflow::eager::RegisterParallelDevice( - context.get(), second_device_name, second_underlying_devices.data(), - second_underlying_devices.size(), status.get()); + RegisterParallelDevice(context.get(), second_device_name, + second_underlying_devices, status.get()); ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); // Create a tensor on the first parallel device @@ -623,7 +657,7 @@ TEST(PARALLEL_DEVICE, TestNestedParallelDevices) { &second_components, status.get()); ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); - AssertScalarFloatEq(second_components[1].get(), 9.); + ExpectScalarEq(second_components[1].get(), 9.); // Verify that the mirrors are placed on the component devices. 
std::string first_device = TFE_TensorHandleBackingDeviceName( @@ -637,8 +671,8 @@ TEST(PARALLEL_DEVICE, TestNestedParallelDevices) { std::array first_components; ExtractPerDeviceValues(context.get(), second_components[0].get(), &first_components, status.get()); - AssertScalarFloatEq(first_components[0].get(), 3.); - AssertScalarFloatEq(first_components[1].get(), 6.); + ExpectScalarEq(first_components[0].get(), 3.); + ExpectScalarEq(first_components[1].get(), 6.); first_device = TFE_TensorHandleBackingDeviceName(first_components[0].get(), status.get()); @@ -656,11 +690,10 @@ TEST(PARALLEL_DEVICE, TestInvalidPacking) { std::unique_ptr context( TFE_NewContext(opts.get(), status.get()), TFE_DeleteContext); const char* device_name = "/job:localhost/replica:0/task:0/device:CUSTOM:0"; - std::vector underlying_devices; - underlying_devices.push_back("/job:localhost/replica:0/task:0/device:CPU:0"); - tensorflow::eager::RegisterParallelDevice( - context.get(), device_name, underlying_devices.data(), - underlying_devices.size(), status.get()); + std::array underlying_devices{ + "/job:localhost/replica:0/task:0/device:CPU:0"}; + RegisterParallelDevice(context.get(), device_name, underlying_devices, + status.get()); ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); TensorHandlePtr value_one(FloatTensorHandle(1., status.get())); @@ -775,12 +808,11 @@ TEST(PARALLEL_DEVICE, TestCollective) { ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); const char* device_name = "/job:localhost/replica:0/task:0/device:CUSTOM:0"; - std::vector underlying_devices; - underlying_devices.push_back("/job:localhost/replica:0/task:0/device:CPU:0"); - underlying_devices.push_back("/job:localhost/replica:0/task:0/device:CPU:1"); - tensorflow::eager::RegisterParallelDevice( - context.get(), device_name, underlying_devices.data(), - underlying_devices.size(), status.get()); + std::array underlying_devices{ + "/job:localhost/replica:0/task:0/device:CPU:0", + "/job:localhost/replica:0/task:0/device:CPU:1"}; + RegisterParallelDevice(context.get(), device_name, underlying_devices, + status.get()); ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); // Create a tensor on the parallel device @@ -801,8 +833,8 @@ TEST(PARALLEL_DEVICE, TestCollective) { ExtractPerDeviceValues(context.get(), reduced.get(), &result_components, status.get()); ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); - AssertScalarFloatEq(result_components[0].get(), 3.); - AssertScalarFloatEq(result_components[1].get(), 3.); + ExpectScalarEq(result_components[0].get(), 3.); + ExpectScalarEq(result_components[1].get(), 3.); } void RegisterCollectiveMulFunction(TFE_Context* context, @@ -867,12 +899,11 @@ TEST(PARALLEL_DEVICE, TestFunction) { ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); const char* device_name = "/job:localhost/replica:0/task:0/device:CUSTOM:0"; - std::vector underlying_devices; - underlying_devices.push_back("/job:localhost/replica:0/task:0/device:CPU:0"); - underlying_devices.push_back("/job:localhost/replica:0/task:0/device:CPU:1"); - tensorflow::eager::RegisterParallelDevice( - context.get(), device_name, underlying_devices.data(), - underlying_devices.size(), status.get()); + std::array underlying_devices{ + "/job:localhost/replica:0/task:0/device:CPU:0", + "/job:localhost/replica:0/task:0/device:CPU:1"}; + RegisterParallelDevice(context.get(), device_name, underlying_devices, + status.get()); 
ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); const char* function_name = "test_reduce_mul"; @@ -905,8 +936,8 @@ TEST(PARALLEL_DEVICE, TestFunction) { ExtractPerDeviceValues(context.get(), reduced.get(), &result_components, status.get()); ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); - AssertScalarFloatEq(result_components[0].get(), 7. * 9.); - AssertScalarFloatEq(result_components[1].get(), 7. * 9.); + ExpectScalarEq(result_components[0].get(), 7. * 9.); + ExpectScalarEq(result_components[1].get(), 7. * 9.); std::string first_device = TFE_TensorHandleBackingDeviceName( result_components[0].get(), status.get()); diff --git a/tensorflow/c/eager/tfe_cancellation_manager_internal.h b/tensorflow/c/eager/tfe_cancellation_manager_internal.h new file mode 100644 index 00000000000..7d500c874e6 --- /dev/null +++ b/tensorflow/c/eager/tfe_cancellation_manager_internal.h @@ -0,0 +1,24 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EAGER_TFE_CANCELLATION_MANAGER_INTERNAL_H_ +#define TENSORFLOW_C_EAGER_TFE_CANCELLATION_MANAGER_INTERNAL_H_ + +#include "tensorflow/core/framework/cancellation.h" + +struct TFE_CancellationManager { + tensorflow::CancellationManager cancellation_manager; +}; + +#endif // TENSORFLOW_C_EAGER_TFE_CANCELLATION_MANAGER_INTERNAL_H_ diff --git a/tensorflow/c/eager/tfe_context_internal.h b/tensorflow/c/eager/tfe_context_internal.h new file mode 100644 index 00000000000..1d29bee9ee3 --- /dev/null +++ b/tensorflow/c/eager/tfe_context_internal.h @@ -0,0 +1,35 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EAGER_TFE_CONTEXT_INTERNAL_H_ +#define TENSORFLOW_C_EAGER_TFE_CONTEXT_INTERNAL_H_ + +#include "tensorflow/c/conversion_macros.h" +#include "tensorflow/c/eager/context_interface.h" + +// Wraps a pointer to a context implementation. +// +// WARNING: Since the underlying object could be ref-counted a user of this +// interface cannot destruct the underlying context object. Instead, call +// TFE_DeleteContext who calls Release() on the context pointer and deletes +// the TFE_Context structure. 
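The `DEFINE_CONVERSION_FUNCTIONS` invocation that follows generates `tensorflow::wrap`/`tensorflow::unwrap` helpers for moving between the opaque `TFE_Context*` handle and the underlying `tensorflow::AbstractContextInterface*`; the same pattern is reused by the other `tfe_*_internal.h` headers in this diff. A minimal sketch of how implementation code uses the generated helpers (the function name is hypothetical, for illustration only):

```cpp
// Sketch only: `SomeCApiCall` is a hypothetical function used for illustration.
#include "tensorflow/c/eager/tfe_context_internal.h"

void SomeCApiCall(TFE_Context* ctx) {
  // Unwrap the opaque C handle to reach the C++ context interface.
  tensorflow::AbstractContextInterface* context = tensorflow::unwrap(ctx);
  // Wrap a C++ pointer back into the opaque handle type expected by C callers.
  TFE_Context* same_handle = tensorflow::wrap(context);
  (void)same_handle;
}
```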
+typedef struct TFE_Context TFE_Context; + +namespace tensorflow { + +DEFINE_CONVERSION_FUNCTIONS(tensorflow::AbstractContextInterface, TFE_Context); + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EAGER_TFE_CONTEXT_INTERNAL_H_ diff --git a/tensorflow/c/eager/tfe_executor_internal.h b/tensorflow/c/eager/tfe_executor_internal.h new file mode 100644 index 00000000000..442103fcae3 --- /dev/null +++ b/tensorflow/c/eager/tfe_executor_internal.h @@ -0,0 +1,37 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EAGER_TFE_EXECUTOR_INTERNAL_H_ +#define TENSORFLOW_C_EAGER_TFE_EXECUTOR_INTERNAL_H_ + +#include + +#include "tensorflow/core/common_runtime/eager/eager_executor.h" + +struct TFE_Executor { + explicit TFE_Executor(bool async) + : owned_executor(new tensorflow::EagerExecutor(async)) {} + + explicit TFE_Executor(tensorflow::EagerExecutor* executor) + : owned_executor(nullptr), unowned_executor(executor) {} + + tensorflow::EagerExecutor* executor() { + return owned_executor == nullptr ? unowned_executor : owned_executor.get(); + } + + std::unique_ptr owned_executor; + tensorflow::EagerExecutor* unowned_executor; +}; + +#endif // TENSORFLOW_C_EAGER_TFE_EXECUTOR_INTERNAL_H_ diff --git a/tensorflow/c/eager/tfe_monitoring_internal.h b/tensorflow/c/eager/tfe_monitoring_internal.h new file mode 100644 index 00000000000..d8226855e9e --- /dev/null +++ b/tensorflow/c/eager/tfe_monitoring_internal.h @@ -0,0 +1,146 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EAGER_TFE_MONITORING_INTERNAL_H_ +#define TENSORFLOW_C_EAGER_TFE_MONITORING_INTERNAL_H_ + +#include +#include +#include + +#include "absl/memory/memory.h" +#include "tensorflow/core/lib/monitoring/counter.h" +#include "tensorflow/core/lib/monitoring/gauge.h" +#include "tensorflow/core/lib/monitoring/sampler.h" +#include "tensorflow/core/platform/types.h" + +struct TFE_MonitoringCounterCell { + tensorflow::monitoring::CounterCell cell; +}; + +template +struct TFE_MonitoringCounter { + template + TFE_MonitoringCounter(const char* name, const char* description, + LabelDesc&&... 
label) { + counter = absl::WrapUnique(tensorflow::monitoring::Counter::New( + name, description, label...)); + } + + std::unique_ptr> counter; +}; + +struct TFE_MonitoringCounter0 : TFE_MonitoringCounter<0> { + using TFE_MonitoringCounter::TFE_MonitoringCounter; +}; +struct TFE_MonitoringCounter1 : TFE_MonitoringCounter<1> { + using TFE_MonitoringCounter::TFE_MonitoringCounter; +}; +struct TFE_MonitoringCounter2 : TFE_MonitoringCounter<2> { + using TFE_MonitoringCounter::TFE_MonitoringCounter; +}; + +struct TFE_MonitoringIntGaugeCell { + tensorflow::monitoring::GaugeCell cell; +}; +struct TFE_MonitoringStringGaugeCell { + tensorflow::monitoring::GaugeCell cell; +}; +struct TFE_MonitoringBoolGaugeCell { + tensorflow::monitoring::GaugeCell cell; +}; + +template +struct TFE_MonitoringGauge { + template + TFE_MonitoringGauge(const char* name, const char* description, + LabelDesc&&... label) { + gauge = absl::WrapUnique( + tensorflow::monitoring::Gauge::New( + name, description, label...)); + } + + std::unique_ptr> gauge; +}; + +struct TFE_MonitoringIntGauge0 : TFE_MonitoringGauge { + using TFE_MonitoringGauge::TFE_MonitoringGauge; +}; +struct TFE_MonitoringIntGauge1 : TFE_MonitoringGauge { + using TFE_MonitoringGauge::TFE_MonitoringGauge; +}; +struct TFE_MonitoringIntGauge2 : TFE_MonitoringGauge { + using TFE_MonitoringGauge::TFE_MonitoringGauge; +}; + +struct TFE_MonitoringStringGauge0 : TFE_MonitoringGauge { + using TFE_MonitoringGauge::TFE_MonitoringGauge; +}; +struct TFE_MonitoringStringGauge1 : TFE_MonitoringGauge { + using TFE_MonitoringGauge::TFE_MonitoringGauge; +}; +struct TFE_MonitoringStringGauge2 : TFE_MonitoringGauge { + using TFE_MonitoringGauge::TFE_MonitoringGauge; +}; + +struct TFE_MonitoringBoolGauge0 : TFE_MonitoringGauge { + using TFE_MonitoringGauge::TFE_MonitoringGauge; +}; +struct TFE_MonitoringBoolGauge1 : TFE_MonitoringGauge { + using TFE_MonitoringGauge::TFE_MonitoringGauge; +}; +struct TFE_MonitoringBoolGauge2 : TFE_MonitoringGauge { + using TFE_MonitoringGauge::TFE_MonitoringGauge; +}; + +struct TFE_MonitoringBuckets { + explicit TFE_MonitoringBuckets( + std::function(void)> + fn) { + create_buckets = fn; + } + + std::function(void)> + create_buckets; +}; + +struct TFE_MonitoringSamplerCell { + tensorflow::monitoring::SamplerCell cell; +}; + +template +struct TFE_MonitoringSampler { + template + TFE_MonitoringSampler( + const char* name, + std::unique_ptr buckets, + const char* description, LabelDesc&&... label) { + sampler = absl::WrapUnique(tensorflow::monitoring::Sampler::New( + {name, description, label...}, std::move(buckets))); + } + + std::unique_ptr> sampler; +}; + +struct TFE_MonitoringSampler0 : TFE_MonitoringSampler<0> { + using TFE_MonitoringSampler::TFE_MonitoringSampler; +}; +struct TFE_MonitoringSampler1 : TFE_MonitoringSampler<1> { + using TFE_MonitoringSampler::TFE_MonitoringSampler; +}; +struct TFE_MonitoringSampler2 : TFE_MonitoringSampler<2> { + using TFE_MonitoringSampler::TFE_MonitoringSampler; +}; + +#endif // TENSORFLOW_C_EAGER_TFE_MONITORING_INTERNAL_H_ diff --git a/tensorflow/c/eager/tfe_op_attrs_internal.h b/tensorflow/c/eager/tfe_op_attrs_internal.h new file mode 100644 index 00000000000..0287502dea6 --- /dev/null +++ b/tensorflow/c/eager/tfe_op_attrs_internal.h @@ -0,0 +1,39 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EAGER_TFE_OP_ATTRS_INTERNAL_H_ +#define TENSORFLOW_C_EAGER_TFE_OP_ATTRS_INTERNAL_H_ + +#include "tensorflow/c/conversion_macros.h" +#include "tensorflow/c/tf_status.h" +#include "tensorflow/core/common_runtime/eager/attr_builder.h" +#include "tensorflow/core/framework/attr_value.pb.h" + +// An equivalent of a tensorflow::NameAttrList protocol buffer, but used in ways +// that sometimes do not require serialization. +typedef struct TFE_OpAttrs TFE_OpAttrs; + +typedef struct TFE_Context TFE_Context; +typedef struct TFE_Op TFE_Op; + +namespace tensorflow { +DEFINE_CONVERSION_FUNCTIONS(tensorflow::AttrBuilder, TFE_OpAttrs); + +// Set an AttrValue on the op. Doesn't handle the list types. +void SetOpAttrValueScalar(TFE_Context* ctx, TFE_Op* op, + const tensorflow::AttrValue& default_value, + const char* attr_name, TF_Status* status); +} // namespace tensorflow + +#endif // TENSORFLOW_C_EAGER_TFE_OP_ATTRS_INTERNAL_H_ diff --git a/tensorflow/c/eager/tfe_op_internal.h b/tensorflow/c/eager/tfe_op_internal.h new file mode 100644 index 00000000000..6ca7f741d16 --- /dev/null +++ b/tensorflow/c/eager/tfe_op_internal.h @@ -0,0 +1,36 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EAGER_TFE_OP_INTERNAL_H_ +#define TENSORFLOW_C_EAGER_TFE_OP_INTERNAL_H_ + +#include "tensorflow/c/conversion_macros.h" +#include "tensorflow/c/eager/operation_interface.h" + +// Wraps a pointer to an operation implementation. +// +// WARNING: Since the underlying object could be ref-counted a user of this +// interface cannot destruct the underlying operation object. Instead, call +// TFE_DeleteOp who calls Release() on the operation pointer and deletes +// the TFE_Op structure. +typedef struct TFE_Op TFE_Op; + +namespace tensorflow { + +DEFINE_CONVERSION_FUNCTIONS(tensorflow::AbstractOperationInterface, TFE_Op); +DEFINE_CONVERSION_FUNCTIONS(tensorflow::AbstractOperationInterface*, TFE_Op*); + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EAGER_TFE_OP_INTERNAL_H_ diff --git a/tensorflow/c/eager/tfe_tensor_debug_info_internal.h b/tensorflow/c/eager/tfe_tensor_debug_info_internal.h new file mode 100644 index 00000000000..a9cf12a588f --- /dev/null +++ b/tensorflow/c/eager/tfe_tensor_debug_info_internal.h @@ -0,0 +1,30 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EAGER_TFE_TENSOR_DEBUG_INFO_INTERNAL_H_ +#define TENSORFLOW_C_EAGER_TFE_TENSOR_DEBUG_INFO_INTERNAL_H_ + +#include + +#include "tensorflow/core/platform/types.h" + +struct TFE_TensorDebugInfo { + explicit TFE_TensorDebugInfo(const std::vector& dims) + : dev_dims(dims) {} + + // Fully-padded, minor-to-major. + std::vector dev_dims; +}; + +#endif // TENSORFLOW_C_EAGER_TFE_TENSOR_DEBUG_INFO_INTERNAL_H_ diff --git a/tensorflow/c/eager/tfe_tensorhandle_internal.h b/tensorflow/c/eager/tfe_tensorhandle_internal.h new file mode 100644 index 00000000000..543e5f1d932 --- /dev/null +++ b/tensorflow/c/eager/tfe_tensorhandle_internal.h @@ -0,0 +1,38 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EAGER_TFE_TENSORHANDLE_INTERNAL_H_ +#define TENSORFLOW_C_EAGER_TFE_TENSORHANDLE_INTERNAL_H_ + +#include "tensorflow/c/conversion_macros.h" +#include "tensorflow/c/eager/tensor_handle_interface.h" + +// Wraps a pointer to a tensor handle implementation. +// +// WARNING: Since the underlying object could be ref-counted a user of this +// interface cannot destruct the underlying handle object. Instead, call +// TFE_DeleteTensorHandle who calls Release() on the handle pointer and deletes +// the TFE_TensorHandle structure. 
+typedef struct TFE_TensorHandle TFE_TensorHandle; + +namespace tensorflow { + +DEFINE_CONVERSION_FUNCTIONS(tensorflow::AbstractTensorHandleInterface, + TFE_TensorHandle); +DEFINE_CONVERSION_FUNCTIONS(tensorflow::AbstractTensorHandleInterface*, + TFE_TensorHandle*); + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EAGER_TFE_TENSORHANDLE_INTERNAL_H_ diff --git a/tensorflow/c/experimental/filesystem/modular_filesystem_test.cc b/tensorflow/c/experimental/filesystem/modular_filesystem_test.cc index 53e247cd038..8ee47da01dd 100644 --- a/tensorflow/c/experimental/filesystem/modular_filesystem_test.cc +++ b/tensorflow/c/experimental/filesystem/modular_filesystem_test.cc @@ -85,17 +85,36 @@ class ModularFileSystemTest : public ::testing::TestWithParam { const std::string test_name = tensorflow::str_util::StringReplace( ::testing::UnitTest::GetInstance()->current_test_info()->name(), "/", "_", /*replace_all=*/true); - root_dir_ = tensorflow::io::JoinPath( - ::testing::TempDir(), - tensorflow::strings::StrCat("tf_fs_", rng_val_, "_", test_name)); + if (!cloud_path_.empty()) { + // We have to join path for non-local filesystem manually to make sure + // that this test will run on Windows since `tensorflow::io::JoinPath` + // behaves differently on Windows. `tmp_dir` should be something like + // `path/to/tmp/dir/`. After joining path, we will have + // /path/to/tmp/dir/tf_fs_rng_name/` + root_dir_ = tensorflow::strings::StrCat( + "/", tmp_dir_, + tensorflow::strings::StrCat("tf_fs_", rng_val_, "_", test_name), "/"); + } else { + root_dir_ = tensorflow::io::JoinPath( + tmp_dir_, + tensorflow::strings::StrCat("tf_fs_", rng_val_, "_", test_name)); + } + if (!GetParam().empty()) { + root_dir_ = tensorflow::strings::StrCat(GetParam(), "://", cloud_path_, + root_dir_); + } env_ = Env::Default(); } void SetUp() override { - if (mkdir(root_dir_.c_str(), 0755) != 0) { - int error_code = errno; - GTEST_SKIP() << "Cannot create working directory: " - << tensorflow::IOError(root_dir_, error_code); + FileSystem* fs = nullptr; + Status s = env_->GetFileSystemForFile(root_dir_, &fs); + if (fs == nullptr || !s.ok()) + GTEST_SKIP() << "No filesystem registered: " << s; + + s = fs->CreateDir(root_dir_); + if (!s.ok()) { + GTEST_SKIP() << "Cannot create working directory: " << s; } } @@ -115,9 +134,10 @@ class ModularFileSystemTest : public ::testing::TestWithParam { std::string GetURIForPath(StringPiece path) { const std::string translated_name = tensorflow::io::JoinPath(root_dir_, path); - if (GetParam().empty()) return translated_name; - - return tensorflow::strings::StrCat(GetParam(), "://", translated_name); + // We have already checked `GetParam().empty()` in + // `ModularFileSystemTest()`. root_dir_ should contain `GetParam() + "://"` + // if it isn't empty. + return translated_name; } // Converts absolute paths to paths relative to root_dir_. @@ -133,15 +153,28 @@ class ModularFileSystemTest : public ::testing::TestWithParam { rng_val_ = distribution(gen); } + static void SetCloudPath(const std::string& cloud_path) { + cloud_path_ = cloud_path; + if (cloud_path_.back() == '/') cloud_path_.pop_back(); + } + + static void SetTmpDir(const std::string& tmp_dir) { + tmp_dir_ = tmp_dir.empty() ? 
::testing::TempDir() : tmp_dir; + } + protected: Env* env_; private: std::string root_dir_; static int rng_val_; + static std::string cloud_path_; + static std::string tmp_dir_; }; int ModularFileSystemTest::rng_val_; +std::string ModularFileSystemTest::cloud_path_; +std::string ModularFileSystemTest::tmp_dir_; // As some of the implementations might be missing, the tests should still pass // if the returned `Status` signals the unimplemented state. @@ -1729,6 +1762,20 @@ static bool GetURIScheme(const std::string& scheme) { return true; } +// This function is used for cloud filesystem +// `S3` and `GCS` require the `root_dir_` to have bucket name +// `HDFS` requires the `root_dir` to have namenode +// `root_dir_ = scheme + "://" cloud_path_ + root_dir_` +static bool SetCloudPath(const std::string& cloud_path_) { + ModularFileSystemTest::SetCloudPath(cloud_path_); + return true; +} + +static bool SetTmpDir(const std::string& tmp_dir_) { + ModularFileSystemTest::SetTmpDir(tmp_dir_); + return true; +} + } // namespace } // namespace tensorflow @@ -1741,7 +1788,12 @@ GTEST_API_ int main(int argc, char** argv) { tensorflow::Flag("dso", tensorflow::LoadDSO, "", "Path to shared object to load"), tensorflow::Flag("scheme", tensorflow::GetURIScheme, "", - "URI scheme to test")}; + "URI scheme to test"), + tensorflow::Flag("cloud_path", tensorflow::SetCloudPath, "", + "Path for cloud filesystem (namenode for hdfs, " + "bucketname for s3/gcs)"), + tensorflow::Flag("tmp_dir", tensorflow::SetTmpDir, "", + "Temporary directory to store test data.")}; if (!tensorflow::Flags::Parse(&argc, argv, flag_list)) { std::cout << tensorflow::Flags::Usage(argv[0], flag_list); return -1; diff --git a/tensorflow/c/experimental/saved_model/README.md b/tensorflow/c/experimental/saved_model/README.md new file mode 100644 index 00000000000..2fdb8137598 --- /dev/null +++ b/tensorflow/c/experimental/saved_model/README.md @@ -0,0 +1,66 @@ +# Tensorflow C SavedModel API + +## Overview + +These are the new experimental C SavedModel APIs for loading and running +SavedModels in a TF2-idiomatic fashion. See +[RFC 207](https://github.com/tensorflow/community/pull/207) for additional +context. + +The directory structure is as follows: + +```none +saved_model/ + + public/ + + internal/ + + core/ + +``` + +## saved_model/public + +`saved_model/public` is intended to house *only the public headers* of the +SavedModel C API. + +These headers: + +1. declare opaque C types (like `TF_SavedModel`), + +2. declare the functions that operate on these types (like `TF_LoadSavedModel`). + +Once they leave experimental, these APIs should be considered stable for use +by external clients. + +These headers are in a separate directory to make it obvious to clients which +headers they should depend on, and which headers are implementation details. +Separating these public headers by directory also allow future programmatic +checks to ensure that TF public headers only `#include` other public TF headers. + +## saved_model/internal + +`saved_model/internal` is the "glue" between the C API and the internal C++ +implementation. + +Its role is to: + +1. implement the C API functions declared in `saved_model/public` + +2. define the C API types declared in `saved_model/public` + +The files fulfilling 1. are named `*.cc` (eg: `concrete_function.cc`), while +the files fulfilling 2. are `*type.h` (eg: `concrete_function_type.h`). + +The headers exposing the internal implementation of the opaque C types are only +visible to other implementors of the C API. 
This is similar to how other +TF C API implementations use `tf_status_internal.h` (to extract the underlying +`tensorflow::Status`). All other targets in this directory are private. + +## saved_model/core + +`saved_model/core` contains pure C++ "Classes" underlying the C API types +in `saved_model/public/`. These are implementation +details subject to change, and have limited visibility to implementors only. +This is the bottom-most layer of the `C++ -> C -> C++` sandwich. diff --git a/tensorflow/c/experimental/saved_model/core/BUILD b/tensorflow/c/experimental/saved_model/core/BUILD new file mode 100644 index 00000000000..8cebdd08170 --- /dev/null +++ b/tensorflow/c/experimental/saved_model/core/BUILD @@ -0,0 +1,85 @@ +# Experimental SavedModel C APIs for TensorFlow. See RFC +# https://github.com/tensorflow/community/pull/207 +# Targets in this directory are pure C++ "Classes" underlying the C API types +# under tf/c/experimental/saved_model/public/. They are subject to change and +# have visibility limited to Tensorflow's implementation only. + +package( + default_visibility = [ + "//tensorflow/c:__subpackages__", + "//tensorflow/c/experimental/saved_model/internal:__pkg__", + "//tensorflow/core:__subpackages__", + ], + licenses = ["notice"], # Apache 2.0 +) + +cc_library( + name = "concrete_function", + srcs = [ + "concrete_function.cc", + ], + hdrs = [ + "concrete_function.h", + ], + deps = [ + ":function_metadata", + "//tensorflow/c/eager:operation_interface", + "//tensorflow/c/eager:tensor_handle_interface", + "//tensorflow/core:protos_all_cc", + ], +) + +cc_library( + name = "function_metadata", + hdrs = [ + "function_metadata.h", + ], +) + +cc_library( + name = "saved_model_api", + hdrs = [ + "saved_model_api.h", + ], + deps = [ + ":concrete_function", + "//tensorflow/core:lib", + ], +) + +cc_library( + name = "tf_saved_model_impl", + srcs = [ + "tf_saved_model_impl.cc", + ], + hdrs = ["tf_saved_model_impl.h"], + deps = [ + ":concrete_function", + ":saved_model_api", + "//tensorflow/core:lib", + "@com_google_absl//absl/types:optional", + ], +) + +cc_library( + name = "pywrap_required_hdrs", + textual_hdrs = [ + "concrete_function.h", + "function_metadata.h", + "saved_model_api.h", + ], + visibility = ["//tensorflow/python:__pkg__"], +) + +filegroup( + name = "mobile_srcs_only_runtime", + srcs = [ + "concrete_function.cc", + "concrete_function.h", + "function_metadata.h", + "saved_model_api.h", + "tf_saved_model_impl.cc", + "tf_saved_model_impl.h", + ], + visibility = ["//tensorflow/core:__pkg__"], +) diff --git a/tensorflow/c/experimental/saved_model/core/concrete_function.cc b/tensorflow/c/experimental/saved_model/core/concrete_function.cc new file mode 100644 index 00000000000..d5da2ca9bf4 --- /dev/null +++ b/tensorflow/c/experimental/saved_model/core/concrete_function.cc @@ -0,0 +1,32 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/c/experimental/saved_model/core/concrete_function.h" + +#include "tensorflow/c/eager/tensor_handle_interface.h" +#include "tensorflow/c/experimental/saved_model/core/function_metadata.h" + +namespace tensorflow { + +const std::vector& +ConcreteFunction::GetCaptures() const { + return captures_; +} + +const FunctionMetadata& ConcreteFunction::GetFunctionMetadata() const { + return metadata_; +} + +} // namespace tensorflow diff --git a/tensorflow/c/experimental/saved_model/core/concrete_function.h b/tensorflow/c/experimental/saved_model/core/concrete_function.h new file mode 100644 index 00000000000..6f8a5375277 --- /dev/null +++ b/tensorflow/c/experimental/saved_model/core/concrete_function.h @@ -0,0 +1,55 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_CONCRETE_FUNCTION_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_CONCRETE_FUNCTION_H_ + +#include + +#include "tensorflow/c/eager/operation_interface.h" +#include "tensorflow/c/eager/tensor_handle_interface.h" +#include "tensorflow/c/experimental/saved_model/core/function_metadata.h" +#include "tensorflow/core/framework/function.pb.h" + +namespace tensorflow { + +// Note that ConcreteFunctions's lifetimes are effectively bound +// to the SavedModel they are loaded from, since they retain pointers +// to the TensorHandles owned by the SavedModel, and the FunctionDef +// of the SavedModel. +// Note(bmzhao): This class is only TEMPORARILY virtual, as a way to unblock +// TFRT integration with TF Serving. Do not add more virtual implementations of +// this class. Eventually we want to remove this virtual base class indirection +// and have only a single implementation. +class ConcreteFunction { + public: + virtual ~ConcreteFunction() = 0; + + // This method returns the "Call" Op used to execute the function. + virtual AbstractOperationInterface* GetCallOp() = 0; + + const std::vector& GetCaptures() + const; + const FunctionMetadata& GetFunctionMetadata() const; + + private: + FunctionMetadata metadata_; + std::vector captures_; + FunctionDef* function_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_CONCRETE_FUNCTION_H_ diff --git a/tensorflow/c/experimental/saved_model/core/function_metadata.h b/tensorflow/c/experimental/saved_model/core/function_metadata.h new file mode 100644 index 00000000000..8499288f032 --- /dev/null +++ b/tensorflow/c/experimental/saved_model/core/function_metadata.h @@ -0,0 +1,27 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_FUNCTION_METADATA_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_FUNCTION_METADATA_H_ + +namespace tensorflow { + +class FunctionMetadata { + // TODO(bmzhao): Fill in with fields as necessary +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_FUNCTION_METADATA_H_ diff --git a/tensorflow/c/experimental/saved_model/core/saved_model_api.h b/tensorflow/c/experimental/saved_model/core/saved_model_api.h new file mode 100644 index 00000000000..5d0ed63a765 --- /dev/null +++ b/tensorflow/c/experimental/saved_model/core/saved_model_api.h @@ -0,0 +1,55 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_SAVED_MODEL_API_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_SAVED_MODEL_API_H_ + +#include +#include +#include +#include + +#include "tensorflow/c/experimental/saved_model/core/concrete_function.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { + +// Note(bmzhao): This class is only TEMPORARILY virtual, as a way to unblock +// TFRT integration with TF Serving. Do not add more virtual implementations of +// this class. Eventually we want to remove this virtual base class indirection +// and have only a single implementation. +class SavedModelAPI { + public: + // Retrieve a function from the TF2 SavedModel, using the "path" to a function + // in a TF2 savedmodel. + // Note: `function` is a double pointer, so that implementations are + // able to return a pointer to an internal member. 
+ virtual Status GetFunction(const std::string& function_path, + ConcreteFunction** function) = 0; + + // Retrieve a function from a SavedModel, using the key of the + // SignatureDef map: + // https://github.com/tensorflow/tensorflow/blob/69b08900b1e991d84bce31f3b404f5ed768f339f/tensorflow/core/protobuf/meta_graph.proto#L89 + virtual Status GetSignatureDefFunction(const std::string& signature_def_key, + ConcreteFunction** function) = 0; + + virtual std::vector ListFunctions() = 0; + + virtual ~SavedModelAPI() = default; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_SAVED_MODEL_API_H_ diff --git a/tensorflow/c/experimental/saved_model/core/tf_saved_model_impl.cc b/tensorflow/c/experimental/saved_model/core/tf_saved_model_impl.cc new file mode 100644 index 00000000000..d1b71214d02 --- /dev/null +++ b/tensorflow/c/experimental/saved_model/core/tf_saved_model_impl.cc @@ -0,0 +1,60 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/c/experimental/saved_model/core/tf_saved_model_impl.h" + +#include +#include +#include + +#include "absl/types/optional.h" +#include "tensorflow/c/experimental/saved_model/core/concrete_function.h" +#include "tensorflow/core/platform/errors.h" + +namespace tensorflow { + +Status TFSavedModelAPIImpl::GetFunction(const std::string& function_path, + ConcreteFunction** function) { + // TODO(bmzhao): Add support for retrieving a function. + return errors::Unimplemented( + "Retrieving functions is unimplemented currently"); +} + +Status TFSavedModelAPIImpl::GetSignatureDefFunction( + const std::string& signature_def_key, ConcreteFunction** function) { + // TODO(bmzhao): Add support for retrieving a signaturedef function. + return errors::Unimplemented( + "Retrieving functions is unimplemented currently"); +} + +std::vector TFSavedModelAPIImpl::ListFunctions() { + std::vector result; + result.reserve(functions_.size()); + for (ConcreteFunction& function : functions_) { + result.push_back(&function); + } + return result; +} + +Status TFSavedModelAPIImpl::Load( + const std::string& directory, + const absl::optional>& tags, + TFSavedModelAPIImpl* out) { + // TODO(bmzhao): Add support for loading a TFSavedModelImpl. + return errors::Unimplemented( + "TFSavedModelAPIImpl loading is unimplemented currently"); +} + +} // namespace tensorflow diff --git a/tensorflow/c/experimental/saved_model/core/tf_saved_model_impl.h b/tensorflow/c/experimental/saved_model/core/tf_saved_model_impl.h new file mode 100644 index 00000000000..f45dd22f773 --- /dev/null +++ b/tensorflow/c/experimental/saved_model/core/tf_saved_model_impl.h @@ -0,0 +1,55 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_TF_SAVED_MODEL_IMPL_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_TF_SAVED_MODEL_IMPL_H_ + +#include +#include +#include + +#include "absl/types/optional.h" +#include "tensorflow/c/experimental/saved_model/core/concrete_function.h" +#include "tensorflow/c/experimental/saved_model/core/saved_model_api.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { + +class TFSavedModelAPIImpl : public SavedModelAPI { + public: + TFSavedModelAPIImpl() = default; + + Status GetFunction(const std::string& function_path, + ConcreteFunction** function) override; + + Status GetSignatureDefFunction(const std::string& signature_def_key, + ConcreteFunction** function) override; + + static Status Load( + const std::string& directory, + const absl::optional>& tags, + TFSavedModelAPIImpl* out); + + std::vector ListFunctions() override; + + ~TFSavedModelAPIImpl() override = default; + + private: + std::vector functions_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_TF_SAVED_MODEL_IMPL_H_ diff --git a/tensorflow/c/experimental/saved_model/internal/BUILD b/tensorflow/c/experimental/saved_model/internal/BUILD new file mode 100644 index 00000000000..5c51e26f925 --- /dev/null +++ b/tensorflow/c/experimental/saved_model/internal/BUILD @@ -0,0 +1,212 @@ +# Experimental Implementation of SavedModel C APIs for TensorFlow. See RFC +# https://github.com/tensorflow/community/pull/207 +# External clients should not worry about this directory; all contents are implementation details. +# Code in this directory is intended to form the glue between the C API and the internal C++ +# implementation by +# 1. mapping C API calls onto correponding methods of C++ objects +# 2. mapping opaque C types onto C++ classes + +# Note(bmzhao): The *.cc files in this directory form the direct implementation of the +# C API functions exposed in tf/c/experimental/saved_model/public/. + +# Note(bmzhao): All *type.h files in this directory are the internal definitions of +# the opaque C types. These headers should only be visible to internal tensorflow +# implementors. 
+load( + "//tensorflow:tensorflow.bzl", + "tf_cc_test", + "tf_copts", +) + +package( + licenses = ["notice"], # Apache 2.0 +) + +cc_library( + name = "concrete_function", + srcs = [ + "concrete_function.cc", + ], + hdrs = [ + "//tensorflow/c/experimental/saved_model/public:concrete_function.h", + ], + copts = tf_copts(), + visibility = [ + "//tensorflow/c/experimental/saved_model/public:__pkg__", + ], + deps = [ + ":concrete_function_type", + ":function_metadata", + ":function_metadata_type", + ":tensorhandle_list", + ":tensorhandle_list_type", + "//tensorflow/c:c_api_macros", + "//tensorflow/c/eager:c_api", + "//tensorflow/c/eager:c_api_internal", + "//tensorflow/c/eager:tfe_op_internal", + "//tensorflow/c/experimental/saved_model/core:concrete_function", + "//tensorflow/c/experimental/saved_model/core:function_metadata", + ], +) + +cc_library( + name = "concrete_function_list", + srcs = [ + "concrete_function_list.cc", + ], + hdrs = [ + "//tensorflow/c/experimental/saved_model/public:concrete_function_list.h", + ], + copts = tf_copts(), + visibility = [ + "//tensorflow/c/experimental/saved_model/public:__pkg__", + ], + deps = [ + ":concrete_function", + ":concrete_function_list_type", + ":concrete_function_type", + "//tensorflow/c:c_api_macros", + "//tensorflow/c/experimental/saved_model/core:concrete_function", + ], +) + +cc_library( + name = "concrete_function_list_type", + hdrs = [ + "concrete_function_list_type.h", + ], + deps = [ + "//tensorflow/c/experimental/saved_model/core:concrete_function", + ], +) + +cc_library( + name = "concrete_function_type", + hdrs = [ + "concrete_function_type.h", + ], + deps = [ + "//tensorflow/c:conversion_macros", + "//tensorflow/c/experimental/saved_model/core:concrete_function", + ], +) + +cc_library( + name = "function_metadata", + srcs = [ + "function_metadata.cc", + ], + hdrs = [ + "//tensorflow/c/experimental/saved_model/public:function_metadata.h", + ], + copts = tf_copts(), + visibility = [ + "//tensorflow/c/experimental/saved_model/public:__pkg__", + ], + deps = [ + ":function_metadata_type", + "//tensorflow/c:c_api_macros", + "//tensorflow/c/experimental/saved_model/core:function_metadata", + ], +) + +cc_library( + name = "function_metadata_type", + hdrs = [ + "function_metadata_type.h", + ], + deps = [ + "//tensorflow/c:conversion_macros", + "//tensorflow/c/experimental/saved_model/core:function_metadata", + ], +) + +cc_library( + name = "saved_model_api", + srcs = [ + "saved_model_api.cc", + ], + hdrs = [ + "//tensorflow/c/experimental/saved_model/public:saved_model_api.h", + ], + copts = tf_copts(), + visibility = [ + "//tensorflow/c/experimental/saved_model/public:__pkg__", + ], + deps = [ + ":concrete_function", + ":concrete_function_list", + ":concrete_function_list_type", + ":concrete_function_type", + ":saved_model_api_type", + "//tensorflow/c:c_api_macros", + "//tensorflow/c:tf_status", + "//tensorflow/c:tf_status_internal", + "//tensorflow/c/eager:tfe_context_internal", + "//tensorflow/c/experimental/saved_model/core:saved_model_api", + "//tensorflow/core:lib", + "@com_google_absl//absl/types:optional", + ], +) + +cc_library( + name = "saved_model_api_type", + hdrs = [ + "saved_model_api_type.h", + ], + deps = [ + "//tensorflow/c/experimental/saved_model/core:saved_model_api", + ], +) + +cc_library( + name = "tensorhandle_list", + srcs = [ + "tensorhandle_list.cc", + ], + hdrs = [ + "//tensorflow/c/experimental/saved_model/public:tensorhandle_list.h", + ], + copts = tf_copts(), + visibility = [ + 
"//tensorflow/c/experimental/saved_model/public:__pkg__", + ], + deps = [ + ":tensorhandle_list_type", + "//tensorflow/c:c_api_macros", + "//tensorflow/c/eager:c_api", + "//tensorflow/c/eager:tensor_handle_interface", + "//tensorflow/c/eager:tfe_tensorhandle_internal", + ], +) + +cc_library( + name = "tensorhandle_list_type", + hdrs = [ + "tensorhandle_list_type.h", + ], + deps = [ + "//tensorflow/c:conversion_macros", + "//tensorflow/c/eager:tensor_handle_interface", + ], +) + +tf_cc_test( + name = "saved_model_api_test", + size = "small", + srcs = [ + "saved_model_api_test.cc", + ], + data = [ + "//tensorflow/cc/saved_model:saved_model_half_plus_two", + ], + deps = [ + "//tensorflow/c:tf_status", + "//tensorflow/c/eager:c_api", + "//tensorflow/c/eager:c_api_experimental", + "//tensorflow/c/experimental/saved_model/public:saved_model_api", + "//tensorflow/core:lib", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + ], +) diff --git a/tensorflow/c/experimental/saved_model/internal/concrete_function.cc b/tensorflow/c/experimental/saved_model/internal/concrete_function.cc new file mode 100644 index 00000000000..dd54416ddf9 --- /dev/null +++ b/tensorflow/c/experimental/saved_model/internal/concrete_function.cc @@ -0,0 +1,41 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/c/experimental/saved_model/public/concrete_function.h" + +#include "tensorflow/c/eager/tfe_op_internal.h" +#include "tensorflow/c/experimental/saved_model/core/concrete_function.h" +#include "tensorflow/c/experimental/saved_model/core/function_metadata.h" +#include "tensorflow/c/experimental/saved_model/internal/concrete_function_type.h" +#include "tensorflow/c/experimental/saved_model/internal/function_metadata_type.h" +#include "tensorflow/c/experimental/saved_model/internal/tensorhandle_list_type.h" + +extern "C" { + +TF_FunctionMetadata* TF_ConcreteFunctionGetMetadata(TF_ConcreteFunction* func) { + return tensorflow::wrap(const_cast( + &tensorflow::unwrap(func)->GetFunctionMetadata())); +} + +const TF_TensorHandleList* TF_ConcreteFunctionGetCaptures( + TF_ConcreteFunction* func) { + return tensorflow::wrap(&tensorflow::unwrap(func)->GetCaptures()); +} + +TFE_Op* TF_ConcreteFunctionGetCallOp(TF_ConcreteFunction* func) { + return tensorflow::wrap(tensorflow::unwrap(func)->GetCallOp()); +} + +} // end extern "C" diff --git a/tensorflow/c/experimental/saved_model/internal/concrete_function_list.cc b/tensorflow/c/experimental/saved_model/internal/concrete_function_list.cc new file mode 100644 index 00000000000..85b6dc6183c --- /dev/null +++ b/tensorflow/c/experimental/saved_model/internal/concrete_function_list.cc @@ -0,0 +1,37 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include + +#include "tensorflow/c/experimental/saved_model/core/concrete_function.h" +#include "tensorflow/c/experimental/saved_model/internal/concrete_function_list_type.h" +#include "tensorflow/c/experimental/saved_model/internal/concrete_function_type.h" + +extern "C" { + +size_t TF_ConcreteFunctionListNumOutputs(TF_ConcreteFunctionList* list) { + return list->list.size(); +} + +TF_ConcreteFunction* TF_ConcreteFunctionListGet(TF_ConcreteFunctionList* list, + int i) { + return tensorflow::wrap(list->list[i]); +} + +void TF_DeleteConcreteFunctionList(TF_ConcreteFunctionList* list) { + delete list; +} + +} // end extern "C" diff --git a/tensorflow/c/experimental/saved_model/internal/concrete_function_list_type.h b/tensorflow/c/experimental/saved_model/internal/concrete_function_list_type.h new file mode 100644 index 00000000000..66e0a8f97d7 --- /dev/null +++ b/tensorflow/c/experimental/saved_model/internal/concrete_function_list_type.h @@ -0,0 +1,30 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_CONCRETE_FUNCTION_LIST_TYPE_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_CONCRETE_FUNCTION_LIST_TYPE_H_ + +#include + +#include "tensorflow/c/experimental/saved_model/core/concrete_function.h" + +// Internal structures used by the SavedModel C API. These are likely to change +// and should not be depended on. + +struct TF_ConcreteFunctionList { + std::vector list; +}; + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_CONCRETE_FUNCTION_LIST_TYPE_H_ diff --git a/tensorflow/c/experimental/saved_model/internal/concrete_function_type.h b/tensorflow/c/experimental/saved_model/internal/concrete_function_type.h new file mode 100644 index 00000000000..bc36b0c6f08 --- /dev/null +++ b/tensorflow/c/experimental/saved_model/internal/concrete_function_type.h @@ -0,0 +1,36 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_CONCRETE_FUNCTION_TYPE_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_CONCRETE_FUNCTION_TYPE_H_ + +#include "tensorflow/c/conversion_macros.h" +#include "tensorflow/c/experimental/saved_model/core/concrete_function.h" + +// Internal structures used by the SavedModel C API. These are likely to change +// and should not be depended on. + +// It doesn't make sense to wrap tensorflow::ConcreteFunction* in a separate +// struct, since the lifetime of the struct and the raw pointer it wraps would +// be different. Therefore TF_ConcreteFunction* = tensorflow::ConcreteFunction*. +typedef struct TF_ConcreteFunction TF_ConcreteFunction; + +namespace tensorflow { + +DEFINE_CONVERSION_FUNCTIONS(tensorflow::ConcreteFunction, TF_ConcreteFunction) + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_CONCRETE_FUNCTION_TYPE_H_ diff --git a/tensorflow/lite/experimental/kernels/hashtable_ops.i b/tensorflow/c/experimental/saved_model/internal/function_metadata.cc similarity index 75% rename from tensorflow/lite/experimental/kernels/hashtable_ops.i rename to tensorflow/c/experimental/saved_model/internal/function_metadata.cc index fa2e6facc75..4cf31e1abe1 100644 --- a/tensorflow/lite/experimental/kernels/hashtable_ops.i +++ b/tensorflow/c/experimental/saved_model/internal/function_metadata.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -%{ -#include "tensorflow/lite/experimental/kernels/hashtable_ops.h" -%} +#include "tensorflow/c/experimental/saved_model/public/function_metadata.h" -%include "tensorflow/lite/experimental/kernels/hashtable_ops.h" +#include "tensorflow/c/experimental/saved_model/internal/function_metadata_type.h" + +// TODO(bmzhao): Add getter functions here as necessary. diff --git a/tensorflow/c/experimental/saved_model/internal/function_metadata_type.h b/tensorflow/c/experimental/saved_model/internal/function_metadata_type.h new file mode 100644 index 00000000000..40f05f9117d --- /dev/null +++ b/tensorflow/c/experimental/saved_model/internal/function_metadata_type.h @@ -0,0 +1,30 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_FUNCTION_METADATA_TYPE_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_FUNCTION_METADATA_TYPE_H_ + +#include "tensorflow/c/conversion_macros.h" +#include "tensorflow/c/experimental/saved_model/core/function_metadata.h" + +typedef struct TF_FunctionMetadata TF_FunctionMetadata; + +namespace tensorflow { + +DEFINE_CONVERSION_FUNCTIONS(tensorflow::FunctionMetadata, TF_FunctionMetadata) + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_FUNCTION_METADATA_TYPE_H_ diff --git a/tensorflow/c/experimental/saved_model/internal/saved_model_api.cc b/tensorflow/c/experimental/saved_model/internal/saved_model_api.cc new file mode 100644 index 00000000000..629610dbe29 --- /dev/null +++ b/tensorflow/c/experimental/saved_model/internal/saved_model_api.cc @@ -0,0 +1,97 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/c/experimental/saved_model/public/saved_model_api.h" + +#include +#include +#include + +#include "absl/types/optional.h" +#include "tensorflow/c/eager/tfe_context_internal.h" +#include "tensorflow/c/experimental/saved_model/core/saved_model_api.h" +#include "tensorflow/c/experimental/saved_model/internal/concrete_function_list_type.h" +#include "tensorflow/c/experimental/saved_model/internal/concrete_function_type.h" +#include "tensorflow/c/experimental/saved_model/internal/saved_model_api_type.h" +#include "tensorflow/c/tf_status.h" +#include "tensorflow/c/tf_status_internal.h" +#include "tensorflow/core/platform/status.h" + +extern "C" { + +TF_SavedModel* TF_LoadSavedModel(const char* dirname, TFE_Context* ctx, + TF_Status* status) { + std::string saved_model_dir(dirname); + + std::unique_ptr result = + tensorflow::unwrap(ctx)->LoadSavedModelAPI(dirname, absl::nullopt, + &status->status); + if (!status->status.ok()) { + return nullptr; + } + return new TF_SavedModel{std::move(result)}; +} + +TF_SavedModel* TF_LoadSavedModelWithTags(const char* dirname, TFE_Context* ctx, + const char* const* tags, int tags_len, + TF_Status* status) { + std::string saved_model_dir(dirname); + + std::unordered_set tagset; + for (int i = 0; i < tags_len; ++i) { + tagset.insert(std::string(tags[i])); + } + + std::unique_ptr result = + tensorflow::unwrap(ctx)->LoadSavedModelAPI(dirname, std::move(tagset), + &status->status); + if (!status->status.ok()) { + return nullptr; + } + return new TF_SavedModel{std::move(result)}; +} + +void TF_DeleteSavedModel(TF_SavedModel* model) { delete model; } + +TF_ConcreteFunction* TF_GetSavedModelConcreteFunction(TF_SavedModel* model, + const char* function_path, + TF_Status* status) { + tensorflow::ConcreteFunction* result = nullptr; + tensorflow::Status get_function_status = + model->saved_model->GetFunction(function_path, &result); + 
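+  // Propagate the underlying tensorflow::Status into the C API TF_Status so
+  // that callers can inspect the error through TF_GetCode / TF_Message.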
status->status.Update(get_function_status); + if (!get_function_status.ok()) { + return nullptr; + } + return tensorflow::wrap(result); +} + +TF_CAPI_EXPORT extern TF_ConcreteFunction* TF_GetSavedModelSignatureDefFunction( + TF_SavedModel* model, const char* signature_def_key, TF_Status* status) { + tensorflow::ConcreteFunction* result = nullptr; + tensorflow::Status get_function_status = + model->saved_model->GetSignatureDefFunction(signature_def_key, &result); + status->status.Update(get_function_status); + if (!get_function_status.ok()) { + return nullptr; + } + return tensorflow::wrap(result); +} + +TF_ConcreteFunctionList* TF_ListSavedModelFunctions(TF_SavedModel* model) { + return new TF_ConcreteFunctionList{model->saved_model->ListFunctions()}; +} + +} // end extern "C" diff --git a/tensorflow/c/experimental/saved_model/internal/saved_model_api_test.cc b/tensorflow/c/experimental/saved_model/internal/saved_model_api_test.cc new file mode 100644 index 00000000000..aa0b00ab847 --- /dev/null +++ b/tensorflow/c/experimental/saved_model/internal/saved_model_api_test.cc @@ -0,0 +1,109 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/c/experimental/saved_model/public/saved_model_api.h" + +#include + +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/tf_status.h" +#include "tensorflow/core/lib/io/path.h" +#include "tensorflow/core/platform/stringpiece.h" +#include "tensorflow/core/platform/test.h" + +namespace { + +constexpr char kTestData[] = "cc/saved_model/testdata"; +const char* kServeTag[] = {"serve"}; + +std::string SavedModelPath(tensorflow::StringPiece saved_model_dir) { + return tensorflow::io::JoinPath(tensorflow::testing::TensorFlowSrcRoot(), + kTestData, saved_model_dir); +} + +// This value parameterized test allows us to test both TFRT +// and non TFRT runtimes. +// https://github.com/google/googletest/blob/dcc92d0ab6c4ce022162a23566d44f673251eee4/googletest/docs/advanced.md#value-parameterized-tests +class CSavedModelAPITest : public ::testing::TestWithParam {}; + +TEST_P(CSavedModelAPITest, LoadsSavedModelWithTags) { + TF_Status* status = TF_NewStatus(); + TFE_ContextOptions* opts = TFE_NewContextOptions(); + bool use_tfrt = GetParam(); + if (use_tfrt) { + TFE_DeleteContextOptions(opts); + TF_DeleteStatus(status); + GTEST_SKIP(); // TODO(chky) : Enable this once TFRT is open sourced. + } + + TFE_ContextOptionsSetTfrt(opts, use_tfrt); + + TFE_Context* ctx = TFE_NewContext(opts, status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteContextOptions(opts); + + std::string model_dir = SavedModelPath("VarsAndArithmeticObjectGraph"); + + TF_SavedModel* saved_model = + TF_LoadSavedModelWithTags(model_dir.c_str(), ctx, kServeTag, 1, status); + + // TODO(bmzhao): Change this to expect TF_OK when loading is implemented. 
+ // That unblocks writing other tests that require a TF_SavedModel*, + // like loading a ConcreteFunction. This test at least checks that the + // C API builds and can be minimally run. + EXPECT_EQ(TF_GetCode(status), TF_UNIMPLEMENTED); + + TF_DeleteSavedModel(saved_model); + TF_DeleteStatus(status); + TFE_DeleteContext(ctx); +} + +TEST_P(CSavedModelAPITest, LoadsSavedModel) { + TF_Status* status = TF_NewStatus(); + TFE_ContextOptions* opts = TFE_NewContextOptions(); + bool use_tfrt = GetParam(); + if (use_tfrt) { + TFE_DeleteContextOptions(opts); + TF_DeleteStatus(status); + GTEST_SKIP(); // TODO(chky) : Enable this once TFRT is open sourced. + } + + TFE_ContextOptionsSetTfrt(opts, use_tfrt); + + TFE_Context* ctx = TFE_NewContext(opts, status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteContextOptions(opts); + + std::string model_dir = SavedModelPath("VarsAndArithmeticObjectGraph"); + + TF_SavedModel* saved_model = + TF_LoadSavedModel(model_dir.c_str(), ctx, status); + + // TODO(bmzhao): Change this to expect TF_OK when loading is implemented. + // That unblocks writing other tests that require a TF_SavedModel*, + // like loading a ConcreteFunction. This test at least checks that the + // C API builds and can be minimally run. + EXPECT_EQ(TF_GetCode(status), TF_UNIMPLEMENTED); + + TF_DeleteSavedModel(saved_model); + TF_DeleteStatus(status); + TFE_DeleteContext(ctx); +} + +INSTANTIATE_TEST_SUITE_P(RuntimeAgnosticSavedModelTests, CSavedModelAPITest, + ::testing::Bool()); + +} // namespace diff --git a/tensorflow/c/experimental/saved_model/internal/saved_model_api_type.h b/tensorflow/c/experimental/saved_model/internal/saved_model_api_type.h new file mode 100644 index 00000000000..9e2d1117463 --- /dev/null +++ b/tensorflow/c/experimental/saved_model/internal/saved_model_api_type.h @@ -0,0 +1,30 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_SAVED_MODEL_API_TYPE_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_SAVED_MODEL_API_TYPE_H_ + +#include + +#include "tensorflow/c/experimental/saved_model/core/saved_model_api.h" + +// Internal structures used by the SavedModel C API. These are likely to change +// and should not be depended on. + +struct TF_SavedModel { + std::unique_ptr saved_model; +}; + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_SAVED_MODEL_API_TYPE_H_ diff --git a/tensorflow/c/experimental/saved_model/internal/tensorhandle_list.cc b/tensorflow/c/experimental/saved_model/internal/tensorhandle_list.cc new file mode 100644 index 00000000000..7d018658101 --- /dev/null +++ b/tensorflow/c/experimental/saved_model/internal/tensorhandle_list.cc @@ -0,0 +1,36 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/c/experimental/saved_model/public/tensorhandle_list.h" + +#include + +#include "tensorflow/c/eager/tensor_handle_interface.h" +#include "tensorflow/c/eager/tfe_tensorhandle_internal.h" +#include "tensorflow/c/experimental/saved_model/internal/tensorhandle_list_type.h" + +extern "C" { + +size_t TF_TensorHandleListSize(const TF_TensorHandleList* list) { + return tensorflow::unwrap(list)->size(); +} + +TFE_TensorHandle* TF_TensorHandleListGet(const TF_TensorHandleList* list, + int i) { + return tensorflow::wrap((*tensorflow::unwrap(list))[i]); +} + + +} // end extern "C" diff --git a/tensorflow/c/experimental/saved_model/internal/tensorhandle_list_type.h b/tensorflow/c/experimental/saved_model/internal/tensorhandle_list_type.h new file mode 100644 index 00000000000..8cbec2806a8 --- /dev/null +++ b/tensorflow/c/experimental/saved_model/internal/tensorhandle_list_type.h @@ -0,0 +1,37 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_CONCRETE_FUNCTION_LIST_TYPE_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_CONCRETE_FUNCTION_LIST_TYPE_H_ + +#include + +#include "tensorflow/c/conversion_macros.h" +#include "tensorflow/c/eager/tensor_handle_interface.h" + +// Internal structures used by the SavedModel C API. These are likely to +// change and should not be depended on. + +typedef struct TF_TensorHandleList TF_TensorHandleList; + +namespace tensorflow { + +DEFINE_CONVERSION_FUNCTIONS( + std::vector, + TF_TensorHandleList) + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_CONCRETE_FUNCTION_LIST_TYPE_H_ diff --git a/tensorflow/c/experimental/saved_model/public/BUILD b/tensorflow/c/experimental/saved_model/public/BUILD new file mode 100644 index 00000000000..0cfa0a2c005 --- /dev/null +++ b/tensorflow/c/experimental/saved_model/public/BUILD @@ -0,0 +1,70 @@ +# Experimental SavedModel C APIs for TensorFlow. +# See RFC https://github.com/tensorflow/community/pull/207 +# All headers are on the public surface of Tensorflow's C API. +# Once moved out of experimental, these will be stable. 
+# The idea behind a separate public/ directory is to make apparent +# which headers are part of TF's public interface (and which headers) +# are implementation details. This structure allows us to also perform future +# programmatic checks that all "public" headers only include other "public" +# headers. + +package( + # This is intentionally public + default_visibility = [ + "//visibility:public", + ], + licenses = ["notice"], # Apache 2.0 +) + +# TODO(bmzhao): Remove these exports_files and rules, swap with cc_public_library instead. +# cc_public_library would allows us to separate the header dep graph from header+srcs dep graph. +exports_files( + [ + "concrete_function.h", + "concrete_function_list.h", + "function_metadata.h", + "saved_model_api.h", + "tensorhandle_list.h", + ], + visibility = ["//tensorflow/c/experimental/saved_model/internal:__pkg__"], +) + +# The purpose of this header is to provide insulation against +# future changes where we rename/move a public header, without +# forcing all clients to change their "#includes". +cc_library( + name = "c_saved_model_api", + hdrs = ["c_saved_model_api.h"], + deps = [ + ":concrete_function", + ":concrete_function_list", + ":function_metadata", + ":saved_model_api", + ":tensorhandle_list", + ], +) + +alias( + name = "concrete_function", + actual = "//tensorflow/c/experimental/saved_model/internal:concrete_function", +) + +alias( + name = "concrete_function_list", + actual = "//tensorflow/c/experimental/saved_model/internal:concrete_function_list", +) + +alias( + name = "function_metadata", + actual = "//tensorflow/c/experimental/saved_model/internal:function_metadata", +) + +alias( + name = "saved_model_api", + actual = "//tensorflow/c/experimental/saved_model/internal:saved_model_api", +) + +alias( + name = "tensorhandle_list", + actual = "//tensorflow/c/experimental/saved_model/internal:tensorhandle_list", +) diff --git a/tensorflow/c/experimental/saved_model/public/c_saved_model_api.h b/tensorflow/c/experimental/saved_model/public/c_saved_model_api.h new file mode 100644 index 00000000000..aae95a5477c --- /dev/null +++ b/tensorflow/c/experimental/saved_model/public/c_saved_model_api.h @@ -0,0 +1,27 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_C_SAVED_MODEL_API_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_C_SAVED_MODEL_API_H_ + +// IWYU pragma: begin_exports +#include "tensorflow/c/experimental/saved_model/public/concrete_function.h" +#include "tensorflow/c/experimental/saved_model/public/concrete_function_list.h" +#include "tensorflow/c/experimental/saved_model/public/function_metadata.h" +#include "tensorflow/c/experimental/saved_model/public/saved_model_api.h" +#include "tensorflow/c/experimental/saved_model/public/tensorhandle_list.h" +// IWYU pragma: end_exports + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_C_SAVED_MODEL_API_H_ diff --git a/tensorflow/c/experimental/saved_model/public/concrete_function.h b/tensorflow/c/experimental/saved_model/public/concrete_function.h new file mode 100644 index 00000000000..2a87214270c --- /dev/null +++ b/tensorflow/c/experimental/saved_model/public/concrete_function.h @@ -0,0 +1,50 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_CONCRETE_FUNCTION_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_CONCRETE_FUNCTION_H_ + +#include "tensorflow/c/c_api_macros.h" +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/experimental/saved_model/public/function_metadata.h" +#include "tensorflow/c/experimental/saved_model/public/tensorhandle_list.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +// An opaque type that corresponds to a Function loaded from a SavedModel. +// TODO(bmzhao): Work together w/srbs@ to make sure this composes w/the +// C++ Unified Eager/Graph API's AbstractFunction +typedef struct TF_ConcreteFunction TF_ConcreteFunction; + +// Returns FunctionMetadata associated with `func`. Metadata's lifetime is +// bound to `func`, which is bound to the TF_SavedModel it was loaded from. +TF_CAPI_EXPORT extern TF_FunctionMetadata* TF_ConcreteFunctionGetMetadata( + TF_ConcreteFunction* func); + +// Returns a list of TensorHandles implicitly captured by this function. +TF_CAPI_EXPORT extern const TF_TensorHandleList* TF_ConcreteFunctionGetCaptures( + TF_ConcreteFunction* func); + +// Returns a TFE_Op suitable for executing this function. +TF_CAPI_EXPORT extern TFE_Op* TF_ConcreteFunctionGetCallOp( + TF_ConcreteFunction* func); + +#ifdef __cplusplus +} // end extern "C" +#endif // __cplusplus + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_CONCRETE_FUNCTION_H_ diff --git a/tensorflow/c/experimental/saved_model/public/concrete_function_list.h b/tensorflow/c/experimental/saved_model/public/concrete_function_list.h new file mode 100644 index 00000000000..e35546751f1 --- /dev/null +++ b/tensorflow/c/experimental/saved_model/public/concrete_function_list.h @@ -0,0 +1,47 @@ +/* Copyright 2020 The TensorFlow Authors. 
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_CONCRETE_FUNCTION_LIST_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_CONCRETE_FUNCTION_LIST_H_ + +#include <stddef.h> + +#include "tensorflow/c/c_api_macros.h" +#include "tensorflow/c/experimental/saved_model/public/concrete_function.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +// An opaque type that acts like a list of TF_ConcreteFunction pointers. +typedef struct TF_ConcreteFunctionList TF_ConcreteFunctionList; + +// Returns the size of `list`. +TF_CAPI_EXPORT extern size_t TF_ConcreteFunctionListSize( + TF_ConcreteFunctionList* list); + +// Returns the `i`th TF_ConcreteFunction in the list. +TF_CAPI_EXPORT extern TF_ConcreteFunction* TF_ConcreteFunctionListGet( + TF_ConcreteFunctionList* list, int i); + +// Deletes `list`. +TF_CAPI_EXPORT extern void TF_DeleteConcreteFunctionList( + TF_ConcreteFunctionList* list); + +#ifdef __cplusplus +} // end extern "C" +#endif // __cplusplus + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_CONCRETE_FUNCTION_LIST_H_ diff --git a/tensorflow/c/experimental/saved_model/public/function_metadata.h b/tensorflow/c/experimental/saved_model/public/function_metadata.h new file mode 100644 index 00000000000..83ca3c73523 --- /dev/null +++ b/tensorflow/c/experimental/saved_model/public/function_metadata.h @@ -0,0 +1,35 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_FUNCTION_METADATA_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_FUNCTION_METADATA_H_ + +#include "tensorflow/c/c_api_macros.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +// An opaque type used to store any metadata associated with a function. +typedef struct TF_FunctionMetadata TF_FunctionMetadata; + +// TODO(bmzhao): Add getters for fields as we determine what metadata +// we want to expose.
+ +#ifdef __cplusplus +} // end extern "C" +#endif // __cplusplus + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_FUNCTION_METADATA_H_ diff --git a/tensorflow/c/experimental/saved_model/public/saved_model_api.h b/tensorflow/c/experimental/saved_model/public/saved_model_api.h new file mode 100644 index 00000000000..875167bec63 --- /dev/null +++ b/tensorflow/c/experimental/saved_model/public/saved_model_api.h @@ -0,0 +1,108 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_SAVED_MODEL_API_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_SAVED_MODEL_API_H_ + +#include "tensorflow/c/c_api_macros.h" +#include "tensorflow/c/experimental/saved_model/public/concrete_function.h" +#include "tensorflow/c/experimental/saved_model/public/concrete_function_list.h" +#include "tensorflow/c/tf_status.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +// An opaque type representing a Tensorflow "SavedModel" +// (https://www.tensorflow.org/guide/saved_model) that we always pass by pointer +// to achieve ABI stability. +typedef struct TF_SavedModel TF_SavedModel; + +// Load a SavedModel from `dirname`. We expect the SavedModel to contain a +// single Metagraph (as for those exported from TF2's `tf.saved_model.save`). +// +// Params: +// dirname - A directory filepath that the SavedModel is at. +// ctx - A TFE_Context containing optional load/TF runtime options. +// `ctx` must outlive the returned TF_SavedModel pointer. +// status - Set to OK on success and an appropriate error on failure. +// Returns: +// If status is not OK, returns nullptr. Otherwise, returns a newly created +// TF_SavedModel instance. It must be deleted by calling TF_DeleteSavedModel. +TF_CAPI_EXPORT extern TF_SavedModel* TF_LoadSavedModel(const char* dirname, + TFE_Context* ctx, + TF_Status* status); + +// Load a SavedModel from `dirname`. +// +// Params: +// dirname - A directory filepath that the SavedModel is at. +// ctx - A TFE_Context containing optional load/TF runtime options. +// `ctx` must outlive the returned TF_SavedModel pointer. +// tags - char* array of SavedModel tags. We will load the metagraph matching +// the tags. +// tags_len - number of elements in the `tags` array. +// status - Set to OK on success and an appropriate error on failure. +// Returns: +// If status is not OK, returns nullptr. Otherwise, returns a newly created +// TF_SavedModel instance. It must be deleted by calling TF_DeleteSavedModel. +TF_CAPI_EXPORT extern TF_SavedModel* TF_LoadSavedModelWithTags( + const char* dirname, TFE_Context* ctx, const char* const* tags, + int tags_len, TF_Status* status); + +// Deletes a TF_SavedModel, and frees any resources owned by it. +TF_CAPI_EXPORT extern void TF_DeleteSavedModel(TF_SavedModel* model); + +// Retrieve a function from the TF2 SavedModel via function path. 
+// +// Params: +// model - The TF2 SavedModel to load a function from. +// function_path - A string containing the path from the root saved python +// object to a tf.function method. +// TODO(bmzhao): Add a detailed example of this with a +// python tf.module before moving this out of experimental. +// status - Set to OK on success and an appropriate error on failure. +// Returns: +// If status is not OK, returns nullptr. Otherwise, returns a +// TF_ConcreteFunction instance. The lifetime of this instance is +// "conceptually" bound to `model`. Once `model` is deleted, all +// `TF_ConcreteFunctions` retrieved from it are invalid, and have been deleted. +TF_CAPI_EXPORT extern TF_ConcreteFunction* TF_GetSavedModelConcreteFunction( + TF_SavedModel* model, const char* function_path, TF_Status* status); + +// Retrieve a function from the TF SavedModel via a SignatureDef key. +// +// Params: +// model - The SavedModel to load a function from. +// signature_def_key - The string key of the SignatureDef map of a SavedModel: +// https://github.com/tensorflow/tensorflow/blob/69b08900b1e991d84bce31f3b404f5ed768f339f/tensorflow/core/protobuf/meta_graph.proto#L89 +// status - Set to OK on success and an appropriate error on failure. +// Returns: +// If status is not OK, returns nullptr. Otherwise, returns a +// TF_ConcreteFunction instance. Once `model` is deleted, all +// `TF_ConcreteFunctions` retrieved from it are invalid, and have been deleted. +TF_CAPI_EXPORT extern TF_ConcreteFunction* TF_GetSavedModelSignatureDefFunction( + TF_SavedModel* model, const char* signature_def_key, TF_Status* status); + +// Returns a list of all ConcreteFunctions stored in this SavedModel. +// The lifetime of the returned list is bound to `model`. +TF_CAPI_EXPORT extern TF_ConcreteFunctionList* TF_ListSavedModelFunctions( + TF_SavedModel* model); + +#ifdef __cplusplus +} // end extern "C" +#endif // __cplusplus + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_SAVED_MODEL_API_H_ diff --git a/tensorflow/c/experimental/saved_model/public/tensorhandle_list.h b/tensorflow/c/experimental/saved_model/public/tensorhandle_list.h new file mode 100644 index 00000000000..a1e88db3474 --- /dev/null +++ b/tensorflow/c/experimental/saved_model/public/tensorhandle_list.h @@ -0,0 +1,43 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_TENSORHANDLE_LIST_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_TENSORHANDLE_LIST_H_ + +#include <stddef.h> + +#include "tensorflow/c/c_api_macros.h" +#include "tensorflow/c/eager/c_api.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +// An opaque type that acts like a list of TFE_TensorHandle pointers. +typedef struct TF_TensorHandleList TF_TensorHandleList; + +// Returns the size of `list`.
+TF_CAPI_EXPORT extern size_t TF_TensorHandleListSize( + const TF_TensorHandleList* list); + +// Returns the `i`th TFE_TensorHandle in the list. +TF_CAPI_EXPORT extern TFE_TensorHandle* TF_TensorHandleListGet( + const TF_TensorHandleList* list, int i); + +#ifdef __cplusplus +} // end extern "C" +#endif // __cplusplus + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_TENSORHANDLE_LIST_H_ diff --git a/tensorflow/cc/BUILD b/tensorflow/cc/BUILD index 022989bfbf2..e1fad8e697a 100644 --- a/tensorflow/cc/BUILD +++ b/tensorflow/cc/BUILD @@ -156,6 +156,7 @@ cc_library( ":array_grad", ":data_flow_grad", ":image_grad", + ":manip_grad", ":math_grad", ":nn_grad", ], @@ -177,10 +178,11 @@ cc_library_with_android_deps( name = "ops", srcs = ["framework/ops.cc"], hdrs = ["framework/ops.h"], - android_deps = ["//tensorflow/core:android_tensorflow_lib"], + android_deps = ["//tensorflow/core:portable_tensorflow_lib"], deps = [ "//tensorflow/core:core_cpu", "//tensorflow/core:framework", + "//tensorflow/core:graph", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:ops", @@ -195,7 +197,7 @@ cc_library_with_android_deps( "framework/scope_internal.h", ], hdrs = ["framework/scope.h"], - android_deps = ["//tensorflow/core:android_tensorflow_lib"], + android_deps = ["//tensorflow/core:portable_tensorflow_lib"], common_deps = [ ":ops", ], @@ -235,7 +237,7 @@ cc_library_with_android_deps( name = "client_session", srcs = ["client/client_session.cc"], hdrs = ["client/client_session.h"], - android_deps = ["//tensorflow/core:android_tensorflow_lib"], + android_deps = ["//tensorflow/core:portable_tensorflow_lib"], common_deps = [ ":ops", ":scope", @@ -273,7 +275,7 @@ cc_library_with_android_deps( srcs = ["ops/const_op.cc"], hdrs = ["ops/const_op.h"], android_deps = [ - "//tensorflow/core:android_tensorflow_lib", + "//tensorflow/core:portable_tensorflow_lib", ], common_deps = [ ":ops", @@ -302,7 +304,7 @@ cc_library_with_android_deps( srcs = ["ops/while_loop.cc"], hdrs = ["ops/while_loop.h"], android_deps = [ - "//tensorflow/core:android_tensorflow_lib", + "//tensorflow/core:portable_tensorflow_lib", ], common_deps = [ ":cc_ops", @@ -494,6 +496,32 @@ tf_cc_test( ], ) +cc_library( + name = "manip_grad", + srcs = ["gradients/manip_grad.cc"], + deps = [ + ":cc_ops", + ":grad_op_registry", + ":gradients", + ], + alwayslink = 1, +) + +tf_cc_test( + name = "gradients_manip_grad_test", + srcs = ["gradients/manip_grad_test.cc"], + deps = [ + ":array_ops", + ":cc_ops", + ":gradient_checker", + ":manip_grad", + ":testutil", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core:testlib", + ], +) + # Generates separate libraries for array_ops and math_ops to reduce the dependency count of targets that depend on only these tf_gen_op_wrappers_cc( name = "math_ops", diff --git a/tensorflow/cc/experimental/base/public/BUILD b/tensorflow/cc/experimental/base/public/BUILD new file mode 100644 index 00000000000..045d4e6cd97 --- /dev/null +++ b/tensorflow/cc/experimental/base/public/BUILD @@ -0,0 +1,78 @@ +# Experimental C++ APIs for TensorFlow. +# New TF C++ APIs under the tensorflow::cc namespace aim to guarantee ABI stability. +# Users are expected to compile against public c++ headers, and link against +# libtensorflow (https://www.tensorflow.org/install/lang_c). +# We aim to achieve ABI stability in new C++ APIs by only using types +# on the API surface that: +# 1. Have a header-only implementation +# 2. Are std:: types +# 3. 
Wrap an opaque C type + +package( + # This is intentionally public + default_visibility = [ + "//visibility:public", + ], + licenses = ["notice"], # Apache 2.0 +) + +cc_library( + name = "runtime", + hdrs = [ + "runtime.h", + ], + deps = [ + ":status", + "//tensorflow/c/eager:c_api", + "//tensorflow/c/eager:c_api_experimental", + ], +) + +cc_library( + name = "runtime_builder", + hdrs = [ + "runtime_builder.h", + ], + deps = [ + ":runtime", + ":status", + "//tensorflow/c/eager:c_api", + "//tensorflow/c/eager:c_api_experimental", + ], +) + +cc_library( + name = "status", + hdrs = [ + "status.h", + ], + deps = [ + "//tensorflow/c:tf_status", + ], +) + +cc_library( + name = "tensor", + hdrs = [ + "tensor.h", + ], + deps = [ + ":status", + "//tensorflow/c:tf_datatype", + "//tensorflow/c:tf_tensor", + ], +) + +cc_library( + name = "tensorhandle", + hdrs = [ + "tensorhandle.h", + ], + deps = [ + ":runtime", + ":status", + ":tensor", + "//tensorflow/c/eager:c_api", + "//tensorflow/c/eager:c_api_experimental", + ], +) diff --git a/tensorflow/cc/experimental/base/public/runtime.h b/tensorflow/cc/experimental/base/public/runtime.h new file mode 100644 index 00000000000..711a38c233a --- /dev/null +++ b/tensorflow/cc/experimental/base/public/runtime.h @@ -0,0 +1,71 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CC_EXPERIMENTAL_BASE_PUBLIC_RUNTIME_H_ +#define TENSORFLOW_CC_EXPERIMENTAL_BASE_PUBLIC_RUNTIME_H_ + +#include + +#include "tensorflow/c/eager/c_api_experimental.h" + +namespace tensorflow { +namespace experimental { +namespace cc { + +// Runtime represents an opaque instance of a Tensorflow runtime, with its own +// resources, threadpools, etc. Clients are expected to construct a Runtime +// object through tensorflow::cc::RuntimeBuilder::Build, after setting any +// relevant configuration options. Many Tensorflow functions take a reference to +// the runtime as an argument (eg: tensorflow::cc::SavedModelAPI::Load), and +// may have different implementations depending on the runtime. For many of +// these Runtime-attached objects (such as tensorflow::cc::TensorHandle), the +// Runtime must outlive these objects. +class Runtime { + public: + // Runtime is movable, but not copyable. + Runtime(Runtime&&) = default; + Runtime& operator=(Runtime&&) = default; + + private: + friend class RuntimeBuilder; + friend class SavedModelAPI; + friend class TensorHandle; + + // Wraps a TFE_Context. Takes ownership of ctx. + explicit Runtime(TFE_Context* ctx) : ctx_(ctx) {} + + // Deletes the currently wrapped TFE_Context, swaps it with ctx, + // and takes ownership of ctx. + void Reset(TFE_Context* ctx) { ctx_.reset(ctx); } + + // Returns the TFE_Context that this object wraps. This object + // retains ownership of the pointer. 
+ TFE_Context* GetTFEContext() const { return ctx_.get(); } + + // Runtime is not copyable + Runtime(const Runtime&) = delete; + Runtime& operator=(const Runtime&) = delete; + + struct TFEContextDeleter { + void operator()(TFE_Context* p) const { TFE_DeleteContext(p); } + }; + std::unique_ptr ctx_; +}; + +} // namespace cc +} // namespace experimental +} // namespace tensorflow + +#endif // TENSORFLOW_CC_EXPERIMENTAL_BASE_PUBLIC_RUNTIME_H_ diff --git a/tensorflow/cc/experimental/base/public/runtime_builder.h b/tensorflow/cc/experimental/base/public/runtime_builder.h new file mode 100644 index 00000000000..737e06cb2c6 --- /dev/null +++ b/tensorflow/cc/experimental/base/public/runtime_builder.h @@ -0,0 +1,86 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CC_EXPERIMENTAL_BASE_PUBLIC_RUNTIME_BUILDER_H_ +#define TENSORFLOW_CC_EXPERIMENTAL_BASE_PUBLIC_RUNTIME_BUILDER_H_ + +#include + +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/cc/experimental/base/public/runtime.h" +#include "tensorflow/cc/experimental/base/public/status.h" + +namespace tensorflow { +namespace experimental { +namespace cc { + +// RuntimeBuilder is a builder used to construct a tensorflow::cc::Runtime. +// Use this to set configuration options, like threadpool size, etc. +class RuntimeBuilder { + public: + RuntimeBuilder() : options_(TFE_NewContextOptions()) {} + + // If `use_tfrt` is true, we will use the new Tensorflow Runtime + // (https://blog.tensorflow.org/2020/04/tfrt-new-tensorflow-runtime.html) as + // our runtime implementation. + RuntimeBuilder& SetUseTFRT(bool use_tfrt); + + // Build a Tensorflow Runtime. + // + // Params: + // status - Set to OK on success and an appropriate error on failure. + // Returns: + // If status is not OK, returns nullptr. Otherwise, returns a + // unique_ptr. + std::unique_ptr Build(Status* status); + + // RuntimeBuilder is movable, but not copyable. 
+ RuntimeBuilder(RuntimeBuilder&&) = default; + RuntimeBuilder& operator=(RuntimeBuilder&&) = default; + + private: + // RuntimeBuilder is not copyable + RuntimeBuilder(const RuntimeBuilder&) = delete; + RuntimeBuilder& operator=(const RuntimeBuilder&) = delete; + + struct TFEContextOptionsDeleter { + void operator()(TFE_ContextOptions* p) const { + TFE_DeleteContextOptions(p); + } + }; + std::unique_ptr options_; +}; + +inline RuntimeBuilder& RuntimeBuilder::SetUseTFRT(bool use_tfrt) { + TFE_ContextOptionsSetTfrt(options_.get(), use_tfrt); + return *this; +} + +inline std::unique_ptr RuntimeBuilder::Build(Status* status) { + TFE_Context* result = TFE_NewContext(options_.get(), status->GetTFStatus()); + if (!status->ok()) { + return nullptr; + } + // We can't use std::make_unique here because of its interaction with a + // private constructor: https://abseil.io/tips/134 + return std::unique_ptr(new Runtime(result)); +} + +} // namespace cc +} // namespace experimental +} // namespace tensorflow + +#endif // TENSORFLOW_CC_EXPERIMENTAL_BASE_PUBLIC_RUNTIME_BUILDER_H_ diff --git a/tensorflow/cc/experimental/base/public/status.h b/tensorflow/cc/experimental/base/public/status.h new file mode 100644 index 00000000000..98c8cf6ced2 --- /dev/null +++ b/tensorflow/cc/experimental/base/public/status.h @@ -0,0 +1,96 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CC_EXPERIMENTAL_BASE_PUBLIC_STATUS_H_ +#define TENSORFLOW_CC_EXPERIMENTAL_BASE_PUBLIC_STATUS_H_ + +#include +#include + +#include "tensorflow/c/tf_status.h" + +namespace tensorflow { +namespace experimental { +namespace cc { + +// Status is a wrapper around an error code and an optional error message. +// The set of error codes are defined here: +// https://github.com/tensorflow/tensorflow/blob/08931c1e3e9eb2e26230502d678408e66730826c/tensorflow/c/tf_status.h#L39-L60 +// Many Tensorflow APIs return a Status, or take a Status as an out parameter. +// Clients should check for status.ok() after calling these APIs, and either +// handle or propagate the error appropriately. +// TODO(bmzhao): Add a detailed code example before moving out of experimental. +class Status { + public: + // Create a success status + Status() : status_(TF_NewStatus()) {} + + // Return the status code + TF_Code code() const; + + // Returns the error message in Status. + std::string message() const; + + // Returns the error message in Status. + bool ok() const; + + // Record in Status. Any previous information is lost. + // A common use is to clear a status: SetStatus(TF_OK, ""); + void SetStatus(TF_Code code, const std::string& msg); + + // Status is movable, but not copyable. + Status(Status&&) = default; + Status& operator=(Status&&) = default; + + private: + friend class RuntimeBuilder; + friend class Runtime; + friend class SavedModelAPI; + friend class TensorHandle; + + // Wraps a TF_Status*, and takes ownership of it. 
+ explicit Status(TF_Status* status) : status_(status) {} + + // Status is not copyable + Status(const Status&) = delete; + Status& operator=(const Status&) = delete; + + // Returns the TF_Status that this object wraps. This object + // retains ownership of the pointer. + TF_Status* GetTFStatus() const { return status_.get(); } + + struct TFStatusDeleter { + void operator()(TF_Status* p) const { TF_DeleteStatus(p); } + }; + std::unique_ptr status_; +}; + +inline TF_Code Status::code() const { return TF_GetCode(status_.get()); } + +inline std::string Status::message() const { + return std::string(TF_Message(status_.get())); +} + +inline bool Status::ok() const { return code() == TF_OK; } + +inline void Status::SetStatus(TF_Code code, const std::string& msg) { + TF_SetStatus(status_.get(), code, msg.c_str()); +} + +} // namespace cc +} // namespace experimental +} // namespace tensorflow + +#endif // TENSORFLOW_CC_EXPERIMENTAL_BASE_PUBLIC_STATUS_H_ diff --git a/tensorflow/cc/experimental/base/public/tensor.h b/tensorflow/cc/experimental/base/public/tensor.h new file mode 100644 index 00000000000..fc447262ce1 --- /dev/null +++ b/tensorflow/cc/experimental/base/public/tensor.h @@ -0,0 +1,175 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CC_EXPERIMENTAL_BASE_PUBLIC_TENSOR_H_ +#define TENSORFLOW_CC_EXPERIMENTAL_BASE_PUBLIC_TENSOR_H_ + +#include +#include + +#include +#include +#include + +#include "tensorflow/c/tf_datatype.h" +#include "tensorflow/c/tf_tensor.h" +#include "tensorflow/cc/experimental/base/public/status.h" + +namespace tensorflow { +namespace experimental { +namespace cc { + +// Tensor represents an n-dimensional array of values. +class Tensor { + public: + using DeleterCallback = std::function; + + // Constructs a Tensor from user provided buffer. + // + // Params: + // dtype - The dtype of the tensor's data. + // shape - A shape vector, where each element corresponds to the size of + // the tensor's corresponding dimension. + // data - Pointer to a buffer of memory to construct a Tensor out of. + // len - The length (in bytes) of `data` + // deleter - A std::function to be called when the Tensor no longer needs the + // memory in `data`. This can be used to free `data`, or + // perhaps decrement a refcount associated with `data`, etc. + // status - Set to OK on success and an error on failure. + // Returns: + // If an error occurred, status->ok() will be false, and the returned + // Tensor must not be used. + // TODO(bmzhao): Add Runtime as an argument to this function so we can swap to + // a TFRT backed tensor. + // TODO(bmzhao): Add benchmarks on overhead for this function; we can + // consider using int64_t* + length rather than vector. 
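+  //
+  // A minimal usage sketch (assumes a caller-owned float buffer; the deleter
+  // is a no-op because the buffer outlives the Tensor):
+  //
+  //   float data[4] = {1.f, 2.f, 3.f, 4.f};
+  //   Status status;
+  //   Tensor t = Tensor::FromBuffer(TF_FLOAT, /*shape=*/{2, 2}, data,
+  //                                 sizeof(data), [](void*, size_t) {},
+  //                                 &status);
+  //   if (!status.ok()) { /* handle error */ }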
+  static Tensor FromBuffer(TF_DataType dtype, const std::vector<int64_t>& shape, + void* data, size_t len, DeleterCallback deleter, + Status* status); + + // TODO(bmzhao): In the case we construct a tensor from non-owned memory, + // we should offer a way to deep copy the tensor into a new tensor, which + // owns the underlying memory. This could be a .deepcopy()/clone() method. + + // TODO(bmzhao): In the future, we want to relax the non-copyability + // constraint. To do so, we can add a C API function that acts like + // CopyFrom: + // https://github.com/tensorflow/tensorflow/blob/08931c1e3e9eb2e26230502d678408e66730826c/tensorflow/core/framework/tensor.h#L301-L311 + + // Tensor is movable, but not copyable + Tensor(Tensor&&) = default; + Tensor& operator=(Tensor&&) = default; + + // Returns the number of dimensions in the tensor. Can be -1, which represents + // unknown rank. + int dims() const; + + // Returns the number of elements in dimension `d`. + // REQUIRES: `0 <= d < dims()` + int64_t dim_size(int d) const; + + // Returns a pointer to the underlying data buffer. + void* data() const; + + // Returns the data type of the tensor. + TF_DataType dtype() const; + + // Returns the number of elements in the tensor. For a tensor with a partially + // defined shape, -1 means not fully defined. + int64_t num_elements() const; + + // Returns the size of the underlying data in bytes. + size_t num_bytes() const; + + private: + friend class TensorHandle; + friend class Runtime; + + // Wraps a TF_Tensor. Takes ownership of handle. + explicit Tensor(TF_Tensor* tensor) : tensor_(tensor) {} + + // Tensor is not copyable + Tensor(const Tensor&) = delete; + Tensor& operator=(const Tensor&) = delete; + + // Returns the underlying TF_Tensor that this object wraps. + // This object retains ownership of the pointer. + TF_Tensor* GetTFTensor() const { return tensor_.get(); } + + struct DeleterStruct { + std::function<void(void*, size_t)> deleter; + }; + + static void DeleterFunction(void* memory, size_t len, void* deleter_struct) { + DeleterStruct* deleter = reinterpret_cast<DeleterStruct*>(deleter_struct); + deleter->deleter(memory, len); + delete deleter; + } + + struct TFTensorDeleter { + void operator()(TF_Tensor* p) const { TF_DeleteTensor(p); } + }; + std::unique_ptr<TF_Tensor, TFTensorDeleter> tensor_; +}; + +inline void* Tensor::data() const { return TF_TensorData(tensor_.get()); } + +inline int Tensor::dims() const { return TF_NumDims(tensor_.get()); } + +inline int64_t Tensor::dim_size(int d) const { + return TF_Dim(tensor_.get(), d); +} + +inline TF_DataType Tensor::dtype() const { + return TF_TensorType(tensor_.get()); +} + +inline int64_t Tensor::num_elements() const { + return TF_TensorElementCount(tensor_.get()); +} + +inline size_t Tensor::num_bytes() const { + return TF_TensorByteSize(tensor_.get()); +} + +inline Tensor Tensor::FromBuffer(TF_DataType dtype, + const std::vector<int64_t>& shape, void* data, + size_t len, DeleterCallback deleter, + Status* status) { + // Credit to apassos@ for this technique: + // Despite the fact that our API takes a std::function deleter, we are able + // to maintain ABI stability because: + // 1. Only a function pointer is sent across the C API (&DeleterFunction) + // 2. DeleterFunction is defined in the same build artifact that constructed + // the std::function (so there isn't confusion about std::function ABI). + // Note that 2. is satisfied by the fact that this is a header-only API, where + // the function implementations are inline.
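+  //
+  // The DeleterStruct below is heap-allocated and handed to TF_NewTensor as the
+  // opaque deallocator argument; DeleterFunction (defined above) invokes the
+  // user's callback and then deletes the struct, so no further cleanup is
+  // needed here.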
+ + DeleterStruct* deleter_struct = new DeleterStruct{deleter}; + TF_Tensor* tensor = TF_NewTensor(dtype, shape.data(), shape.size(), data, len, + &DeleterFunction, deleter_struct); + if (tensor == nullptr) { + status->SetStatus(TF_INVALID_ARGUMENT, + "Failed to create tensor for input buffer"); + return Tensor(nullptr); + } + return Tensor(tensor); +} + +} // namespace cc +} // namespace experimental +} // namespace tensorflow + +#endif // TENSORFLOW_CC_EXPERIMENTAL_BASE_PUBLIC_TENSOR_H_ diff --git a/tensorflow/cc/experimental/base/public/tensorhandle.h b/tensorflow/cc/experimental/base/public/tensorhandle.h new file mode 100644 index 00000000000..99453ee7ea8 --- /dev/null +++ b/tensorflow/cc/experimental/base/public/tensorhandle.h @@ -0,0 +1,98 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CC_EXPERIMENTAL_BASE_PUBLIC_TENSORHANDLE_H_ +#define TENSORFLOW_CC_EXPERIMENTAL_BASE_PUBLIC_TENSORHANDLE_H_ + +#include <memory> +#include <vector> + +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/cc/experimental/base/public/runtime.h" +#include "tensorflow/cc/experimental/base/public/status.h" +#include "tensorflow/cc/experimental/base/public/tensor.h" + +namespace tensorflow { +namespace experimental { +namespace cc { + +// An opaque representation of a tensor computed/managed by the Tensorflow +// runtime (tensorflow::cc::Runtime). Unlike a tensor, a TensorHandle may refer +// to tensors placed in memory of different devices or remote address spaces. +// Note that tensorflow::cc::Runtime MUST outlive all TensorHandles created +// from it. +class TensorHandle { + public: + // Unwraps a Tensor from the given TensorHandle. If an error occurred, + // status->ok() will be false, and the returned Tensor must not be used. + Tensor Resolve(Status* status); + + // Constructs a TensorHandle from a Tensor. If an error occurred, + // status->ok() will be false, and the returned TensorHandle must not be used. + static TensorHandle FromTensor(const Tensor& tensor, const Runtime& runtime, + Status* status); + + // TensorHandle is movable, but not copyable + TensorHandle(TensorHandle&&) = default; + TensorHandle& operator=(TensorHandle&&) = default; + + private: + // Wraps a TFE_TensorHandle. Takes ownership of handle. + explicit TensorHandle(TFE_TensorHandle* handle) : handle_(handle) {} + + // TensorHandle is not copyable + TensorHandle(const TensorHandle&) = delete; + TensorHandle& operator=(const TensorHandle&) = delete; + + // Returns the underlying TFE_TensorHandle that this object wraps. + // This object retains ownership of the pointer. + TFE_TensorHandle* GetTFETensorHandle() const { return handle_.get(); } + + // Deletes the currently wrapped TFE_TensorHandle, swaps it with handle, + // and takes ownership of handle.
+ void Reset(TFE_TensorHandle* handle) { handle_.reset(handle); } + + struct TFETensorHandleDeleter { + void operator()(TFE_TensorHandle* p) const { TFE_DeleteTensorHandle(p); } + }; + std::unique_ptr handle_; +}; + +inline Tensor TensorHandle::Resolve(Status* status) { + TF_Tensor* tensor = + TFE_TensorHandleResolve(handle_.get(), status->GetTFStatus()); + if (!status->ok()) { + return Tensor(nullptr); + } + return Tensor(tensor); +} + +inline TensorHandle TensorHandle::FromTensor(const Tensor& tensor, + const Runtime& runtime, + Status* status) { + TFE_TensorHandle* tensor_handle = TFE_NewTensorHandleFromTensor( + runtime.GetTFEContext(), tensor.GetTFTensor(), status->GetTFStatus()); + if (!status->ok()) { + return TensorHandle(nullptr); + } + return TensorHandle(tensor_handle); +} + +} // namespace cc +} // namespace experimental +} // namespace tensorflow + +#endif // TENSORFLOW_CC_EXPERIMENTAL_BASE_PUBLIC_TENSORHANDLE_H_ diff --git a/tensorflow/cc/experimental/base/tests/BUILD b/tensorflow/cc/experimental/base/tests/BUILD new file mode 100644 index 00000000000..f449d618f72 --- /dev/null +++ b/tensorflow/cc/experimental/base/tests/BUILD @@ -0,0 +1,50 @@ +# Tests for the C++ header-only base types. +load("//tensorflow:tensorflow.bzl", "tf_cc_test") + +package( + licenses = ["notice"], # Apache 2.0 +) + +cc_library( + name = "tensor_types_test_util", + testonly = True, + hdrs = ["tensor_types_test_util.h"], + deps = [ + "//tensorflow/c:tf_datatype", + ], +) + +tf_cc_test( + name = "tensor_test", + srcs = [ + "tensor_test.cc", + ], + deps = [ + ":tensor_types_test_util", + "//tensorflow/c:tf_datatype", + "//tensorflow/cc/experimental/base/public:status", + "//tensorflow/cc/experimental/base/public:tensor", + "//tensorflow/core:lib", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + ], +) + +tf_cc_test( + name = "tensorhandle_test", + srcs = [ + "tensorhandle_test.cc", + ], + deps = [ + ":tensor_types_test_util", + "//tensorflow/c:tf_datatype", + "//tensorflow/cc/experimental/base/public:runtime", + "//tensorflow/cc/experimental/base/public:runtime_builder", + "//tensorflow/cc/experimental/base/public:status", + "//tensorflow/cc/experimental/base/public:tensor", + "//tensorflow/cc/experimental/base/public:tensorhandle", + "//tensorflow/core:lib", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + ], +) diff --git a/tensorflow/cc/experimental/base/tests/tensor_test.cc b/tensorflow/cc/experimental/base/tests/tensor_test.cc new file mode 100644 index 00000000000..33f9ab637e8 --- /dev/null +++ b/tensorflow/cc/experimental/base/tests/tensor_test.cc @@ -0,0 +1,163 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/cc/experimental/base/public/tensor.h" + +#include +#include + +#include "tensorflow/c/tf_datatype.h" +#include "tensorflow/cc/experimental/base/tests/tensor_types_test_util.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/platform/test.h" + +namespace { + +using tensorflow::experimental::cc::Status; +using tensorflow::experimental::cc::Tensor; + +using SimpleTypes = ::testing::Types< + tensorflow::FloatType, tensorflow::DoubleType, tensorflow::Int32Type, + tensorflow::UINT8Type, tensorflow::INT8Type, tensorflow::INT64Type, + tensorflow::UINT16Type, tensorflow::UINT32Type, tensorflow::UINT64Type>; + +template +class ConstructScalarTensorTest : public ::testing::Test {}; +TYPED_TEST_SUITE(ConstructScalarTensorTest, SimpleTypes); + +// This test constructs a scalar tensor for each of the types in "SimpleTypes", +// and verifies the expected dimensions, dtype, value, number of bytes, and +// number of elements. +TYPED_TEST(ConstructScalarTensorTest, ValidTensorAttributesAfterConstruction) { + Status status; + TF_DataType dtype = TypeParam::kDType; + typename TypeParam::type value = 42; + Tensor tensor = Tensor::FromBuffer(/*dtype=*/dtype, /*shape=*/{}, + /*data=*/&value, + /*len=*/sizeof(value), + /*deleter=*/[](void*, size_t) {}, &status); + ASSERT_TRUE(status.ok()) << status.message(); + + EXPECT_EQ(tensor.dims(), 0); + EXPECT_EQ(tensor.dtype(), dtype); + EXPECT_EQ(*reinterpret_cast(tensor.data()), 42); + EXPECT_EQ(tensor.num_bytes(), sizeof(typename TypeParam::type)); + EXPECT_EQ(tensor.num_elements(), 1); +} + +template +class Construct1DTensorTest : public ::testing::Test {}; +TYPED_TEST_SUITE(Construct1DTensorTest, SimpleTypes); + +// This test constructs a 1D tensor for each of the types in "SimpleTypes", +// and verifies the expected dimensions, dtype, value, number of bytes, and +// number of elements. +TYPED_TEST(Construct1DTensorTest, ValidTensorAttributesAfterConstruction) { + Status status; + TF_DataType dtype = TypeParam::kDType; + // This is our 1D tensor of varying dtype. + std::vector value = {42, 100, 0, 1, 4, 29}; + // Shape is Rank 1 vector. + std::vector shape; + shape.push_back(value.size()); + + Tensor tensor = Tensor::FromBuffer( + /*dtype=*/dtype, /*shape=*/shape, + /*data=*/value.data(), + /*len=*/value.size() * sizeof(typename TypeParam::type), + /*deleter=*/[](void*, size_t) {}, &status); + ASSERT_TRUE(status.ok()) << status.message(); + + EXPECT_EQ(tensor.dims(), 1); + EXPECT_EQ(tensor.dtype(), dtype); + tensorflow::gtl::ArraySlice tensor_view( + reinterpret_cast(tensor.data()), value.size()); + EXPECT_EQ(tensor_view[0], 42); + EXPECT_EQ(tensor_view[1], 100); + EXPECT_EQ(tensor_view[2], 0); + EXPECT_EQ(tensor_view[3], 1); + EXPECT_EQ(tensor_view[4], 4); + EXPECT_EQ(tensor_view[5], 29); + + EXPECT_EQ(tensor.num_bytes(), + value.size() * sizeof(typename TypeParam::type)); + EXPECT_EQ(tensor.num_elements(), value.size()); +} + +template +class Construct2DTensorTest : public ::testing::Test {}; +TYPED_TEST_SUITE(Construct2DTensorTest, SimpleTypes); + +// This test constructs a 2D tensor for each of the types in "SimpleTypes", +// and verifies the expected dimensions, dtype, value, number of bytes, and +// number of elements. +TYPED_TEST(Construct2DTensorTest, ValidTensorAttributesAfterConstruction) { + Status status; + TF_DataType dtype = TypeParam::kDType; + // This is our 1D tensor of varying dtype. 
+ std::vector value = {42, 100, 0, 1, 4, 29}; + // Shape is Rank 2 vector with shape 2 x 3. + std::vector shape({2, 3}); + + Tensor tensor = Tensor::FromBuffer( + /*dtype=*/dtype, /*shape=*/shape, + /*data=*/value.data(), + /*len=*/value.size() * sizeof(typename TypeParam::type), + /*deleter=*/[](void*, size_t) {}, &status); + + ASSERT_TRUE(status.ok()) << status.message(); + + EXPECT_EQ(tensor.dims(), 2); + EXPECT_EQ(tensor.dtype(), dtype); + tensorflow::gtl::ArraySlice tensor_view( + reinterpret_cast(tensor.data()), value.size()); + EXPECT_EQ(tensor_view[0], 42); + EXPECT_EQ(tensor_view[1], 100); + EXPECT_EQ(tensor_view[2], 0); + EXPECT_EQ(tensor_view[3], 1); + EXPECT_EQ(tensor_view[4], 4); + EXPECT_EQ(tensor_view[5], 29); + + EXPECT_EQ(tensor.num_bytes(), + value.size() * sizeof(typename TypeParam::type)); + EXPECT_EQ(tensor.num_elements(), value.size()); +} + +TEST(CPPTensorAPI, ConstructTensorFromBuffer) { + bool done = false; + Status status; + std::vector data_vector({12, 14, 20, 18, 39, 42, 100}); + { + // data_vector is a rank 1 tensor. + std::vector shape; + shape.push_back(data_vector.size()); + + Tensor::DeleterCallback callback = [&done](void* data, size_t len) { + done = true; + }; + + Tensor tensor = + Tensor::FromBuffer(/*dtype=*/TF_INT32, /*shape=*/shape, + /*data=*/data_vector.data(), + /*len=*/data_vector.size() * sizeof(int32_t), + /*deleter=*/callback, &status); + ASSERT_TRUE(status.ok()) << status.message(); + } + // At this point, tensor has been destroyed, and the deleter callback should + // have run. + EXPECT_TRUE(done); +} + +} // namespace diff --git a/tensorflow/cc/experimental/base/tests/tensor_types_test_util.h b/tensorflow/cc/experimental/base/tests/tensor_types_test_util.h new file mode 100644 index 00000000000..af9cad7529b --- /dev/null +++ b/tensorflow/cc/experimental/base/tests/tensor_types_test_util.h @@ -0,0 +1,76 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CC_EXPERIMENTAL_BASE_TEST_TENSOR_TYPES_TEST_UTIL_H_ +#define TENSORFLOW_CC_EXPERIMENTAL_BASE_TEST_TENSOR_TYPES_TEST_UTIL_H_ + +#include + +#include "tensorflow/c/tf_datatype.h" + +namespace tensorflow { + +// Each of the following struct types have two members: a kDType that +// corresponds to a TF_Datatype enum value, and a typedef "type" +// of its corresponding C++ type. 
These types allow us to write Dtype-agnostic +// tests via GoogleTest's TypedTests: +// https://github.com/google/googletest/blob/e589a337170554c48bc658cc857cf15080c9eacc/googletest/docs/advanced.md#typed-tests +struct FloatType { + using type = float; + static constexpr TF_DataType kDType = TF_FLOAT; +}; + +struct DoubleType { + using type = double; + static constexpr TF_DataType kDType = TF_DOUBLE; +}; + +struct Int32Type { + using type = int32_t; + static constexpr TF_DataType kDType = TF_INT32; +}; + +struct UINT8Type { + using type = uint8_t; + static constexpr TF_DataType kDType = TF_UINT8; +}; + +struct INT8Type { + using type = int8_t; + static constexpr TF_DataType kDType = TF_INT8; +}; + +struct INT64Type { + using type = int64_t; + static constexpr TF_DataType kDType = TF_INT64; +}; + +struct UINT16Type { + using type = uint16_t; + static constexpr TF_DataType kDType = TF_UINT16; +}; + +struct UINT32Type { + using type = uint32_t; + static constexpr TF_DataType kDType = TF_UINT32; +}; + +struct UINT64Type { + using type = uint64_t; + static constexpr TF_DataType kDType = TF_UINT64; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CC_EXPERIMENTAL_BASE_TEST_TENSOR_TYPES_TEST_UTIL_H_ diff --git a/tensorflow/cc/experimental/base/tests/tensorhandle_test.cc b/tensorflow/cc/experimental/base/tests/tensorhandle_test.cc new file mode 100644 index 00000000000..cfeaba4e392 --- /dev/null +++ b/tensorflow/cc/experimental/base/tests/tensorhandle_test.cc @@ -0,0 +1,184 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/cc/experimental/base/public/tensorhandle.h" + +#include +#include + +#include + +#include "tensorflow/c/tf_datatype.h" +#include "tensorflow/cc/experimental/base/public/runtime.h" +#include "tensorflow/cc/experimental/base/public/runtime_builder.h" +#include "tensorflow/cc/experimental/base/public/tensor.h" +#include "tensorflow/cc/experimental/base/tests/tensor_types_test_util.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { +namespace { + +using tensorflow::experimental::cc::Runtime; +using tensorflow::experimental::cc::RuntimeBuilder; +using tensorflow::experimental::cc::Status; +using tensorflow::experimental::cc::Tensor; +using tensorflow::experimental::cc::TensorHandle; + +using SimpleTypes = ::testing::Types< + tensorflow::FloatType, tensorflow::DoubleType, tensorflow::Int32Type, + tensorflow::UINT8Type, tensorflow::INT8Type, tensorflow::INT64Type, + tensorflow::UINT16Type, tensorflow::UINT32Type, tensorflow::UINT64Type>; + +template +class ConstructScalarTensorHandleTest : public ::testing::Test {}; +TYPED_TEST_SUITE(ConstructScalarTensorHandleTest, SimpleTypes); + +// This test constructs a scalar tensor for each of the types in "SimpleTypes", +// then wraps it in a TensorHandle. 
We then unwrap it back into a Tensor, and +// verify the expected dims, dtype, value, num bytes, and num elements. +TYPED_TEST(ConstructScalarTensorHandleTest, + ValidTensorAttributesAfterConstruction) { + Status status; + RuntimeBuilder runtime_builder; + std::unique_ptr runtime = runtime_builder.Build(&status); + ASSERT_TRUE(status.ok()) << status.message(); + + TF_DataType dtype = TypeParam::kDType; + typename TypeParam::type value = 42; + Tensor original_tensor = + Tensor::FromBuffer(/*dtype=*/dtype, /*shape=*/{}, + /*data=*/&value, + /*len=*/sizeof(value), + /*deleter=*/[](void*, size_t) {}, &status); + ASSERT_TRUE(status.ok()) << status.message(); + + TensorHandle handle = + TensorHandle::FromTensor(original_tensor, *runtime, &status); + ASSERT_TRUE(status.ok()) << status.message(); + + Tensor tensor = handle.Resolve(&status); + ASSERT_TRUE(status.ok()) << status.message(); + + EXPECT_EQ(tensor.dims(), 0); + EXPECT_EQ(tensor.dtype(), dtype); + EXPECT_EQ(*reinterpret_cast(tensor.data()), 42); + EXPECT_EQ(tensor.num_bytes(), sizeof(typename TypeParam::type)); + EXPECT_EQ(tensor.num_elements(), 1); +} + +template +class Construct1DTensorHandleTest : public ::testing::Test {}; +TYPED_TEST_SUITE(Construct1DTensorHandleTest, SimpleTypes); + +// This test constructs a 1D tensor for each of the types in "SimpleTypes", +// and verifies the expected dimensions, dtype, value, number of bytes, and +// number of elements. +TYPED_TEST(Construct1DTensorHandleTest, + ValidTensorAttributesAfterConstruction) { + Status status; + RuntimeBuilder runtime_builder; + std::unique_ptr runtime = runtime_builder.Build(&status); + ASSERT_TRUE(status.ok()) << status.message(); + + TF_DataType dtype = TypeParam::kDType; + // This is our 1D tensor of varying dtype. + std::vector value = {42, 100, 0, 1, 4, 29}; + // Shape is Rank 1 vector. + std::vector shape; + shape.push_back(value.size()); + + Tensor original_tensor = Tensor::FromBuffer( + /*dtype=*/dtype, /*shape=*/shape, + /*data=*/value.data(), + /*len=*/value.size() * sizeof(typename TypeParam::type), + /*deleter=*/[](void*, size_t) {}, &status); + ASSERT_TRUE(status.ok()) << status.message(); + + TensorHandle handle = + TensorHandle::FromTensor(original_tensor, *runtime, &status); + ASSERT_TRUE(status.ok()) << status.message(); + + Tensor tensor = handle.Resolve(&status); + ASSERT_TRUE(status.ok()) << status.message(); + + EXPECT_EQ(tensor.dims(), 1); + EXPECT_EQ(tensor.dtype(), dtype); + tensorflow::gtl::ArraySlice tensor_view( + reinterpret_cast(tensor.data()), value.size()); + EXPECT_EQ(tensor_view[0], 42); + EXPECT_EQ(tensor_view[1], 100); + EXPECT_EQ(tensor_view[2], 0); + EXPECT_EQ(tensor_view[3], 1); + EXPECT_EQ(tensor_view[4], 4); + EXPECT_EQ(tensor_view[5], 29); + + EXPECT_EQ(tensor.num_bytes(), + value.size() * sizeof(typename TypeParam::type)); + EXPECT_EQ(tensor.num_elements(), value.size()); +} + +template +class Construct2DTensorHandleTest : public ::testing::Test {}; +TYPED_TEST_SUITE(Construct2DTensorHandleTest, SimpleTypes); + +// This test constructs a 2D tensor for each of the types in "SimpleTypes", +// and verifies the expected dimensions, dtype, value, number of bytes, and +// number of elements. +TYPED_TEST(Construct2DTensorHandleTest, + ValidTensorAttributesAfterConstruction) { + Status status; + RuntimeBuilder runtime_builder; + std::unique_ptr runtime = runtime_builder.Build(&status); + ASSERT_TRUE(status.ok()) << status.message(); + + TF_DataType dtype = TypeParam::kDType; + // This is our 1D tensor of varying dtype. 
+ std::vector value = {42, 100, 0, 1, 4, 29}; + // Shape is Rank 2 vector with shape 2 x 3. + std::vector shape({2, 3}); + + Tensor original_tensor = Tensor::FromBuffer( + /*dtype=*/dtype, /*shape=*/shape, + /*data=*/value.data(), + /*len=*/value.size() * sizeof(typename TypeParam::type), + /*deleter=*/[](void*, size_t) {}, &status); + ASSERT_TRUE(status.ok()) << status.message(); + + TensorHandle handle = + TensorHandle::FromTensor(original_tensor, *runtime, &status); + ASSERT_TRUE(status.ok()) << status.message(); + + Tensor tensor = handle.Resolve(&status); + ASSERT_TRUE(status.ok()) << status.message(); + + EXPECT_EQ(tensor.dims(), 2); + EXPECT_EQ(tensor.dtype(), dtype); + tensorflow::gtl::ArraySlice tensor_view( + reinterpret_cast(tensor.data()), value.size()); + EXPECT_EQ(tensor_view[0], 42); + EXPECT_EQ(tensor_view[1], 100); + EXPECT_EQ(tensor_view[2], 0); + EXPECT_EQ(tensor_view[3], 1); + EXPECT_EQ(tensor_view[4], 4); + EXPECT_EQ(tensor_view[5], 29); + + EXPECT_EQ(tensor.num_bytes(), + value.size() * sizeof(typename TypeParam::type)); + EXPECT_EQ(tensor.num_elements(), value.size()); +} + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/cc/framework/gradients.cc b/tensorflow/cc/framework/gradients.cc index 8dfdd01318d..88cd3fe79d6 100644 --- a/tensorflow/cc/framework/gradients.cc +++ b/tensorflow/cc/framework/gradients.cc @@ -13,19 +13,20 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include "tensorflow/cc/framework/gradients.h" + #include #include #include "tensorflow/cc/framework/grad_op_registry.h" -#include "tensorflow/cc/framework/gradients.h" #include "tensorflow/cc/framework/while_gradients.h" #include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/core/common_runtime/graph_constructor.h" #include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/node_def_util.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/graph/algorithm.h" -#include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/graph/while_context.h" #include "tensorflow/core/lib/gtl/map_util.h" #include "tensorflow/core/platform/macros.h" diff --git a/tensorflow/cc/framework/scope.h b/tensorflow/cc/framework/scope.h index 63a555b7217..368c5026db4 100644 --- a/tensorflow/cc/framework/scope.h +++ b/tensorflow/cc/framework/scope.h @@ -24,7 +24,7 @@ limitations under the License. #include "absl/strings/str_cat.h" #include "tensorflow/cc/framework/ops.h" -#include "tensorflow/core/graph/graph_constructor.h" +#include "tensorflow/core/common_runtime/graph_constructor.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/gtl/array_slice.h" diff --git a/tensorflow/cc/gradients/manip_grad.cc b/tensorflow/cc/gradients/manip_grad.cc new file mode 100644 index 00000000000..2a47c608441 --- /dev/null +++ b/tensorflow/cc/gradients/manip_grad.cc @@ -0,0 +1,40 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/cc/framework/grad_op_registry.h" +#include "tensorflow/cc/framework/gradients.h" +#include "tensorflow/cc/ops/manip_ops.h" +#include "tensorflow/cc/ops/standard_ops.h" + +namespace tensorflow { +namespace ops { +namespace { + +Status RollGrad(const Scope& scope, const Operation& op, + const std::vector& grad_inputs, + std::vector* grad_outputs) { + auto shift = op.input(1); + auto axis = op.input(2); + auto grad_op = Roll(scope, grad_inputs[0], Neg(scope, shift), axis); + grad_outputs->push_back(grad_op); + grad_outputs->push_back(NoGradient()); + grad_outputs->push_back(NoGradient()); + return scope.status(); +} +REGISTER_GRADIENT_OP("Roll", RollGrad); + +} // namespace +} // namespace ops +} // namespace tensorflow diff --git a/tensorflow/cc/gradients/manip_grad_test.cc b/tensorflow/cc/gradients/manip_grad_test.cc new file mode 100644 index 00000000000..4d0f1634da8 --- /dev/null +++ b/tensorflow/cc/gradients/manip_grad_test.cc @@ -0,0 +1,51 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
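With REGISTER_GRADIENT_OP("Roll", RollGrad) in place, the symbolic gradient machinery can differentiate through Roll. A minimal sketch (not part of this diff) using the existing tensorflow/cc client API; the shapes mirror the test below, and RollGradExample is an illustrative name.

#include <vector>

#include "tensorflow/cc/framework/gradients.h"
#include "tensorflow/cc/framework/scope.h"
#include "tensorflow/cc/ops/array_ops.h"
#include "tensorflow/cc/ops/manip_ops.h"
#include "tensorflow/core/framework/tensor_shape.h"

tensorflow::Status RollGradExample() {
  using tensorflow::Output;
  using tensorflow::Scope;
  using tensorflow::TensorShape;
  namespace ops = tensorflow::ops;

  Scope scope = Scope::NewRootScope();
  auto x = ops::Placeholder(scope, tensorflow::DT_FLOAT,
                            ops::Placeholder::Shape(TensorShape({5, 4, 3})));
  auto y = ops::Roll(scope, x, /*shift=*/{2, 1}, /*axis=*/{0, 1});

  // The gradient registered above rolls the incoming gradient back by -shift.
  Output x_out = x;
  Output y_out = y;
  std::vector<Output> grads;
  tensorflow::Status s =
      tensorflow::AddSymbolicGradients(scope, {y_out}, {x_out}, &grads);
  return s.ok() ? scope.status() : s;
}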
+==============================================================================*/ + +#include "tensorflow/cc/framework/gradient_checker.h" +#include "tensorflow/cc/ops/array_ops.h" +#include "tensorflow/cc/ops/manip_ops.h" +#include "tensorflow/core/lib/core/status_test_util.h" + +namespace tensorflow { +namespace { + +using ops::Placeholder; +using ops::Roll; + +class ManipGradTest : public ::testing::Test { + protected: + ManipGradTest() : scope_(Scope::NewRootScope()) {} + + void RunTest(const Output& x, const TensorShape& x_shape, const Output& y, + const TensorShape& y_shape) { + TF_ASSERT_OK(scope_.status()); + float max_error; + TF_ASSERT_OK((ComputeGradientError( + scope_, {x}, {x_shape}, {y}, {y_shape}, &max_error))); + EXPECT_LT(max_error, 1e-4); + } + + Scope scope_; +}; + +TEST_F(ManipGradTest, RollGrad) { + TensorShape shape({5, 4, 3}); + auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape)); + auto y = Roll(scope_, x, {2, 1}, {0, 1}); + RunTest(x, shape, y, shape); +} + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/cc/saved_model/BUILD b/tensorflow/cc/saved_model/BUILD index 882b4032f76..b13d8db48a9 100644 --- a/tensorflow/cc/saved_model/BUILD +++ b/tensorflow/cc/saved_model/BUILD @@ -4,7 +4,6 @@ load( "//tensorflow:tensorflow.bzl", "if_android", - "if_ios", "if_mobile", "if_not_mobile", "tf_cc_test", @@ -85,7 +84,7 @@ cc_library( "//tensorflow/core:ops", "//tensorflow/core:protos_all_cc", ]) + if_android([ - "//tensorflow/core:android_tensorflow_lib", + "//tensorflow/core:portable_tensorflow_lib", ]), ) diff --git a/tensorflow/cc/saved_model/experimental/public/BUILD b/tensorflow/cc/saved_model/experimental/public/BUILD new file mode 100644 index 00000000000..3e9a671a61f --- /dev/null +++ b/tensorflow/cc/saved_model/experimental/public/BUILD @@ -0,0 +1,58 @@ +# Experimental C++ SavedModel Header Only APIs. See RFC +# https://github.com/tensorflow/community/pull/207 + +package( + # This is intentionally public + default_visibility = [ + "//visibility:public", + ], + licenses = ["notice"], # Apache 2.0 +) + +cc_library( + name = "concrete_function", + hdrs = [ + "concrete_function.h", + ], + deps = [ + ":function_metadata", + "//tensorflow/c/eager:c_api", + "//tensorflow/c/experimental/saved_model/public:concrete_function", + "//tensorflow/cc/experimental/base/public:status", + ], +) + +cc_library( + name = "concrete_function_list", + hdrs = [ + "concrete_function_list.h", + ], + deps = [ + ":concrete_function", + "//tensorflow/c/experimental/saved_model/public:concrete_function_list", + ], +) + +cc_library( + name = "function_metadata", + hdrs = [ + "function_metadata.h", + ], + deps = [ + "//tensorflow/c/experimental/saved_model/public:function_metadata", + ], +) + +cc_library( + name = "saved_model_api", + hdrs = [ + "saved_model_api.h", + ], + deps = [ + ":concrete_function", + ":concrete_function_list", + "//tensorflow/c/experimental/saved_model/public:saved_model_api", + "//tensorflow/cc/experimental/base/public:runtime", + "//tensorflow/cc/experimental/base/public:status", + ], +) diff --git a/tensorflow/cc/saved_model/experimental/public/concrete_function.h b/tensorflow/cc/saved_model/experimental/public/concrete_function.h new file mode 100644 index 00000000000..1adaf70b01a --- /dev/null +++ b/tensorflow/cc/saved_model/experimental/public/concrete_function.h @@ -0,0 +1,61 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CC_SAVED_MODEL_EXPERIMENTAL_PUBLIC_CONCRETE_FUNCTION_H_ +#define TENSORFLOW_CC_SAVED_MODEL_EXPERIMENTAL_PUBLIC_CONCRETE_FUNCTION_H_ + +#include + +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/experimental/saved_model/public/concrete_function.h" +#include "tensorflow/cc/experimental/base/public/status.h" +#include "tensorflow/cc/saved_model/experimental/public/function_metadata.h" + +namespace tensorflow { +namespace experimental { +namespace cc { + +// ConcreteFunction is an executable "function" loaded from a SavedModelAPI. +class ConcreteFunction final { + public: + // TODO(bmzhao): Adding ConcreteFunction::Run in subsequent CL, since + // it depends on tensorflow::cc::Tensor and tensorflow::cc::TensorHandle + + // Returns FunctionMetadata associated with this ConcreteFunction. + const FunctionMetadata* GetFunctionMetadata(); + + private: + friend class SavedModelAPI; + friend class ConcreteFunctionList; + + // TODO(bmzhao): Consider adding a macro for wrapping/unwrapping + // when moving out of experimental. + static ConcreteFunction* wrap(TF_ConcreteFunction* p) { + return reinterpret_cast(p); + } + static TF_ConcreteFunction* unwrap(ConcreteFunction* p) { + return reinterpret_cast(p); + } +}; + +inline const FunctionMetadata* ConcreteFunction::GetFunctionMetadata() { + return FunctionMetadata::wrap(TF_ConcreteFunctionGetMetadata(unwrap(this))); +} + +} // namespace cc +} // namespace experimental +} // namespace tensorflow + +#endif // TENSORFLOW_CC_SAVED_MODEL_EXPERIMENTAL_PUBLIC_CONCRETE_FUNCTION_H_ diff --git a/tensorflow/cc/saved_model/experimental/public/concrete_function_list.h b/tensorflow/cc/saved_model/experimental/public/concrete_function_list.h new file mode 100644 index 00000000000..88cb779ef15 --- /dev/null +++ b/tensorflow/cc/saved_model/experimental/public/concrete_function_list.h @@ -0,0 +1,63 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/
+
+#ifndef TENSORFLOW_CC_SAVED_MODEL_EXPERIMENTAL_PUBLIC_CONCRETE_FUNCTION_LIST_H_
+#define TENSORFLOW_CC_SAVED_MODEL_EXPERIMENTAL_PUBLIC_CONCRETE_FUNCTION_LIST_H_
+
+#include <vector>
+
+#include "tensorflow/c/experimental/saved_model/public/concrete_function_list.h"
+#include "tensorflow/cc/saved_model/experimental/public/concrete_function.h"
+
+namespace tensorflow {
+namespace experimental {
+namespace cc {
+
+// ConcreteFunctionList helps convert an opaque pointer to an array of
+// ConcreteFunction pointers to a std::vector.
+class ConcreteFunctionList {
+ public:
+  // Converts this object to a std::vector
+  std::vector<ConcreteFunction*> ToVector();
+
+ private:
+  friend class SavedModelAPI;
+  // Wraps a TF_ConcreteFunctionList. Takes ownership of list.
+  explicit ConcreteFunctionList(TF_ConcreteFunctionList* list) : list_(list) {}
+
+  struct TFConcreteFunctionListDeleter {
+    void operator()(TF_ConcreteFunctionList* p) const {
+      TF_DeleteConcreteFunctionList(p);
+    }
+  };
+  std::unique_ptr<TF_ConcreteFunctionList, TFConcreteFunctionListDeleter> list_;
+};
+
+inline std::vector<ConcreteFunction*> ConcreteFunctionList::ToVector() {
+  int size = TF_ConcreteFunctionListSize(list_.get());
+  std::vector<ConcreteFunction*> result;
+  result.reserve(size);
+  for (int i = 0; i < size; ++i) {
+    result.push_back(
+        ConcreteFunction::wrap(TF_ConcreteFunctionListGet(list_.get(), i)));
+  }
+  return result;
+}
+
+}  // namespace cc
+}  // namespace experimental
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CC_SAVED_MODEL_EXPERIMENTAL_PUBLIC_CONCRETE_FUNCTION_LIST_H_
diff --git a/tensorflow/cc/saved_model/experimental/public/function_metadata.h b/tensorflow/cc/saved_model/experimental/public/function_metadata.h
new file mode 100644
index 00000000000..11e1a860d84
--- /dev/null
+++ b/tensorflow/cc/saved_model/experimental/public/function_metadata.h
@@ -0,0 +1,47 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CC_SAVED_MODEL_EXPERIMENTAL_PUBLIC_FUNCTION_METADATA_H_
+#define TENSORFLOW_CC_SAVED_MODEL_EXPERIMENTAL_PUBLIC_FUNCTION_METADATA_H_
+
+#include
+
+#include "tensorflow/c/experimental/saved_model/public/function_metadata.h"
+
+namespace tensorflow {
+namespace experimental {
+namespace cc {
+
+// FunctionMetadata stores additional function information, including
+// optional signaturedef feeds/fetches (for TF1-based ConcreteFunctions),
+// a valid function path (for TF2-based ConcreteFunctions), and
+// the types + number of inputs and outputs.
+class FunctionMetadata final {
+  // TODO(bmzhao): Add getters here as necessary.
+ private: + friend class ConcreteFunction; + static FunctionMetadata* wrap(TF_FunctionMetadata* p) { + return reinterpret_cast(p); + } + static TF_FunctionMetadata* unwrap(FunctionMetadata* p) { + return reinterpret_cast(p); + } +}; + +} // namespace cc +} // namespace experimental +} // namespace tensorflow + +#endif // TENSORFLOW_CC_SAVED_MODEL_EXPERIMENTAL_PUBLIC_FUNCTION_METADATA_H_ diff --git a/tensorflow/cc/saved_model/experimental/public/saved_model_api.h b/tensorflow/cc/saved_model/experimental/public/saved_model_api.h new file mode 100644 index 00000000000..04018bf2aab --- /dev/null +++ b/tensorflow/cc/saved_model/experimental/public/saved_model_api.h @@ -0,0 +1,162 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CC_SAVED_MODEL_EXPERIMENTAL_PUBLIC_SAVED_MODEL_API_H_ +#define TENSORFLOW_CC_SAVED_MODEL_EXPERIMENTAL_PUBLIC_SAVED_MODEL_API_H_ + +#include +#include +#include +#include + +#include "tensorflow/c/experimental/saved_model/public/saved_model_api.h" +#include "tensorflow/cc/experimental/base/public/runtime.h" +#include "tensorflow/cc/experimental/base/public/status.h" +#include "tensorflow/cc/saved_model/experimental/public/concrete_function.h" +#include "tensorflow/cc/saved_model/experimental/public/concrete_function_list.h" + +namespace tensorflow { +namespace experimental { +namespace cc { + +// SavedModelAPI offers a way to load Tensorflow Saved Models +// (https://www.tensorflow.org/guide/saved_model) and execute saved +// tf.functions or legacy SignatureDefs in a TF2-idiomatic fashion. +// See RFC 207 +// (https://github.com/tensorflow/community/blob/master/rfcs/20200218-tf-c-saved-model.md) +// TODO(bmzhao): Add an e2e example here, once ConcreteFunction::Run is added. +class SavedModelAPI { + public: + // Load a SavedModel from `dirname`. + // + // Params: + // saved_model_path - A directory filepath that the SavedModel is at. + // runtime - A runtime used to load SavedModelAPI. `runtime` must outlive the + // returned TF_SavedModel pointer. + // tags - Optional set of tags. If tags = nullptr, we expect the SavedModel + // to contain a single Metagraph (as for those exported from TF2's + // `tf.saved_model.save`). If tags != nullptr, we load the metagraph + // matching the tags: + // https://github.com/tensorflow/tensorflow/blob/428cdeda09aef81e958eeb274b83d27ad635b57b/tensorflow/core/protobuf/meta_graph.proto#L50-L56 + // status - Set to OK on success and an appropriate error on failure. + // Returns: + // If status is not OK, returns nullptr. + static std::unique_ptr Load( + const std::string& saved_model_path, const Runtime& runtime, + Status* status, const std::unordered_set* tags = nullptr); + + // Retrieve a function from the TF2 SavedModel via function path. + // + // Params: + // function_path - A string containing the path from the root saved python + // object to a tf.function method. 
+ // status - Set to OK on success and an appropriate error on failure. + // Returns: + // If status is not OK, returns nullptr. Otherwise, returns a + // tensorflow::cc::ConcreteFunction pointer. The lifetime of this pointer + // is bound to SavedModelAPI it was loaded from. + ConcreteFunction* GetConcreteFunction(const std::string& function_path, + Status* status); + + // Retrieve a function from the TF SavedModel via a SignatureDef key. + // + // Params: + // signature_def_key - String key of SignatureDef map of a SavedModel: + // https://github.com/tensorflow/tensorflow/blob/69b08900b1e991d84bce31f3b404f5ed768f339f/tensorflow/core/protobuf/meta_graph.proto#L89 + // status - Set to OK on success and an appropriate error on failure. + // Returns: + // If status is not OK, returns nullptr. Otherwise, returns a + // tensorflow::cc::ConcreteFunction pointer. The lifetime of this pointer + // is bound to SavedModelAPI it was loaded from. + ConcreteFunction* GetSignatureDefFunction(const std::string& function_path, + Status* status); + + // Lists all Conrete Functions available from the SavedModel. + std::vector ListFunctions(); + + // SavedModelAPI is movable, but not copyable. + SavedModelAPI(SavedModelAPI&&) = default; + SavedModelAPI& operator=(SavedModelAPI&&) = default; + + private: + SavedModelAPI(const SavedModelAPI&) = delete; + SavedModelAPI& operator=(const SavedModelAPI&) = delete; + + explicit SavedModelAPI(TF_SavedModel* model) : saved_model_(model) {} + struct TFSavedModelDeleter { + void operator()(TF_SavedModel* p) const { TF_DeleteSavedModel(p); } + }; + std::unique_ptr saved_model_; +}; + +inline std::unique_ptr SavedModelAPI::Load( + const std::string& saved_model_path, const Runtime& runtime, Status* status, + const std::unordered_set* tags) { + TF_SavedModel* saved_model = nullptr; + + if (tags == nullptr) { + saved_model = + TF_LoadSavedModel(saved_model_path.c_str(), runtime.GetTFEContext(), + status->GetTFStatus()); + } else { + std::vector tags_vector; + tags_vector.reserve(tags->size()); + for (const std::string& tag : *tags) { + tags_vector.push_back(tag.c_str()); + } + saved_model = TF_LoadSavedModelWithTags( + saved_model_path.c_str(), runtime.GetTFEContext(), tags_vector.data(), + tags_vector.size(), status->GetTFStatus()); + } + + if (!status->ok()) { + return nullptr; + } + + // We can't use std::make_unique here because of its interaction with a + // private constructor: https://abseil.io/tips/134 + return std::unique_ptr(new SavedModelAPI(saved_model)); +} + +inline ConcreteFunction* SavedModelAPI::GetConcreteFunction( + const std::string& function_path, Status* status) { + TF_ConcreteFunction* function = TF_GetSavedModelConcreteFunction( + saved_model_.get(), function_path.c_str(), status->GetTFStatus()); + if (!status->ok()) { + return nullptr; + } + return ConcreteFunction::wrap(function); +} + +inline ConcreteFunction* SavedModelAPI::GetSignatureDefFunction( + const std::string& function_path, Status* status) { + TF_ConcreteFunction* function = TF_GetSavedModelSignatureDefFunction( + saved_model_.get(), function_path.c_str(), status->GetTFStatus()); + if (!status->ok()) { + return nullptr; + } + return ConcreteFunction::wrap(function); +} + +inline std::vector SavedModelAPI::ListFunctions() { + ConcreteFunctionList list(TF_ListSavedModelFunctions(saved_model_.get())); + return list.ToVector(); +} + +} // namespace cc +} // namespace experimental +} // namespace tensorflow + +#endif // TENSORFLOW_CC_SAVED_MODEL_EXPERIMENTAL_PUBLIC_SAVED_MODEL_API_H_ diff 
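Putting the pieces together, a hedged end-to-end sketch (not part of this diff) of the SavedModelAPI header above. Loading currently returns TF_UNIMPLEMENTED per the tests that follow; "serving_default" is an illustrative function path, LoadAndInspect is an illustrative name, and the element type of ListFunctions() is inferred from ConcreteFunctionList::ToVector().

#include <memory>
#include <string>
#include <vector>

#include "tensorflow/cc/experimental/base/public/runtime.h"
#include "tensorflow/cc/experimental/base/public/runtime_builder.h"
#include "tensorflow/cc/experimental/base/public/status.h"
#include "tensorflow/cc/saved_model/experimental/public/concrete_function.h"
#include "tensorflow/cc/saved_model/experimental/public/saved_model_api.h"

namespace cc = tensorflow::experimental::cc;

bool LoadAndInspect(const std::string& saved_model_dir) {
  cc::Status status;
  cc::RuntimeBuilder builder;
  std::unique_ptr<cc::Runtime> runtime = builder.Build(&status);
  if (!status.ok()) return false;

  // TF2 SavedModels contain a single MetaGraph, so tags can be omitted.
  std::unique_ptr<cc::SavedModelAPI> model =
      cc::SavedModelAPI::Load(saved_model_dir, *runtime, &status);
  if (!status.ok()) return false;

  // "serving_default" is only an example path; nothing in this change
  // guarantees it exists in a given SavedModel.
  cc::ConcreteFunction* fn =
      model->GetConcreteFunction("serving_default", &status);
  if (!status.ok()) return false;
  const cc::FunctionMetadata* metadata = fn->GetFunctionMetadata();
  (void)metadata;  // No metadata getters yet; see function_metadata.h above.

  std::vector<cc::ConcreteFunction*> all_functions = model->ListFunctions();
  return !all_functions.empty();
}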
--git a/tensorflow/cc/saved_model/experimental/tests/BUILD b/tensorflow/cc/saved_model/experimental/tests/BUILD new file mode 100644 index 00000000000..f24bcfdee2a --- /dev/null +++ b/tensorflow/cc/saved_model/experimental/tests/BUILD @@ -0,0 +1,22 @@ +# Tests for the C++ header-only SavedModelAPI. +load("//tensorflow:tensorflow.bzl", "tf_cc_test") + +package( + licenses = ["notice"], # Apache 2.0 +) + +tf_cc_test( + name = "saved_model_api_test", + srcs = [ + "saved_model_api_test.cc", + ], + deps = [ + "//tensorflow/cc/experimental/base/public:runtime", + "//tensorflow/cc/experimental/base/public:runtime_builder", + "//tensorflow/cc/experimental/base/public:status", + "//tensorflow/cc/saved_model/experimental/public:saved_model_api", + "//tensorflow/core:lib", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + ], +) diff --git a/tensorflow/cc/saved_model/experimental/tests/saved_model_api_test.cc b/tensorflow/cc/saved_model/experimental/tests/saved_model_api_test.cc new file mode 100644 index 00000000000..7f7f6b09a6d --- /dev/null +++ b/tensorflow/cc/saved_model/experimental/tests/saved_model_api_test.cc @@ -0,0 +1,100 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/cc/saved_model/experimental/public/saved_model_api.h" + +#include +#include +#include + +#include "tensorflow/cc/experimental/base/public/runtime.h" +#include "tensorflow/cc/experimental/base/public/runtime_builder.h" +#include "tensorflow/cc/experimental/base/public/status.h" +#include "tensorflow/core/lib/io/path.h" +#include "tensorflow/core/platform/stringpiece.h" +#include "tensorflow/core/platform/test.h" + + +namespace { + +using tensorflow::experimental::cc::Runtime; +using tensorflow::experimental::cc::RuntimeBuilder; +using tensorflow::experimental::cc::SavedModelAPI; +using tensorflow::experimental::cc::Status; + +constexpr char kTestData[] = "cc/saved_model/testdata"; + +std::string SavedModelPath(tensorflow::StringPiece saved_model_dir) { + return tensorflow::io::JoinPath(tensorflow::testing::TensorFlowSrcRoot(), + kTestData, saved_model_dir); +} + +// This value parameterized test allows us to test both TFRT +// and non TFRT runtimes. +// https://github.com/google/googletest/blob/dcc92d0ab6c4ce022162a23566d44f673251eee4/googletest/docs/advanced.md#value-parameterized-tests +class CPPSavedModelAPITest : public ::testing::TestWithParam {}; + +TEST_P(CPPSavedModelAPITest, LoadsSavedModelWithTags) { + Status status; + RuntimeBuilder builder; + bool use_tfrt = GetParam(); + if (use_tfrt) { + GTEST_SKIP(); // TODO(chky) : Enable this once TFRT is open sourced. 
+ } + + builder.SetUseTFRT(use_tfrt); + std::unique_ptr runtime = builder.Build(&status); + ASSERT_TRUE(status.ok()) << status.message(); + + std::string model_dir = SavedModelPath("VarsAndArithmeticObjectGraph"); + std::unordered_set tags = {"serve"}; + std::unique_ptr model = + SavedModelAPI::Load(model_dir, *runtime, &status, &tags); + + // TODO(bmzhao): Change this to expect TF_OK when loading is implemented. + // That unblocks writing other tests that require a TF_SavedModel*, + // like loading a ConcreteFunction. This test at least checks that the + // C API builds and can be minimally run. + EXPECT_EQ(status.code(), TF_UNIMPLEMENTED); +} + +TEST_P(CPPSavedModelAPITest, LoadsSavedModel) { + Status status; + RuntimeBuilder builder; + bool use_tfrt = GetParam(); + if (use_tfrt) { + GTEST_SKIP(); // TODO(chky) : Enable this once TFRT is open sourced. + } + + builder.SetUseTFRT(use_tfrt); + std::unique_ptr runtime = builder.Build(&status); + ASSERT_TRUE(status.ok()) << status.message(); + + std::string model_dir = SavedModelPath("VarsAndArithmeticObjectGraph"); + std::unique_ptr model = + SavedModelAPI::Load(model_dir, *runtime, &status); + + // TODO(bmzhao): Change this to expect TF_OK when loading is implemented. + // That unblocks writing other tests that require a TF_SavedModel*, + // like loading a ConcreteFunction. This test at least checks that the + // C API builds and can be minimally run. + EXPECT_EQ(status.code(), TF_UNIMPLEMENTED); +} + +INSTANTIATE_TEST_SUITE_P(RuntimeAgnosticCPPSavedModelTests, + CPPSavedModelAPITest, ::testing::Bool()); + +} // namespace + diff --git a/tensorflow/cc/saved_model/loader.cc b/tensorflow/cc/saved_model/loader.cc index 3bb4660e449..6c967dcf464 100644 --- a/tensorflow/cc/saved_model/loader.cc +++ b/tensorflow/cc/saved_model/loader.cc @@ -19,12 +19,16 @@ limitations under the License. #include "tensorflow/cc/saved_model/constants.h" #include "tensorflow/cc/saved_model/reader.h" +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/tensor.pb.h" #include "tensorflow/core/lib/io/path.h" #include "tensorflow/core/lib/monitoring/counter.h" #include "tensorflow/core/lib/monitoring/sampler.h" #include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/protobuf_internal.h" #include "tensorflow/core/protobuf/graph_debug_info.pb.h" #include "tensorflow/core/protobuf/saver.pb.h" @@ -65,12 +69,39 @@ uint64 GetLatencyMicroseconds(const uint64 start_microseconds) { return end_microseconds - start_microseconds; } +// Ensure that constant tensors loaded from the saved model have valid shape. +// Also ensure that constant nodes have a value assigned to them. 
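For concreteness, a sketch (not part of this diff) of the kind of GraphDef the new check rejects; a Const value tensor whose shape contains a negative dimension is rejected the same way. The helper name and node name are illustrative only.

#include "tensorflow/core/framework/graph.pb.h"

tensorflow::GraphDef MakeGraphDefThatFailsValidation() {
  tensorflow::GraphDef graph_def;
  tensorflow::NodeDef* node = graph_def.add_node();
  node->set_name("bad_const");
  node->set_op("Const");  // A Const node with no "value" attr fails validation.
  return graph_def;
}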
+// TODO(b/154763635): this is temporary and will be replaced with a better audit +static Status ValidateSavedTensors(const GraphDef& graph_def) { + for (const auto& node : graph_def.node()) { + const auto node_iterator = node.attr().find("value"); + if (node_iterator != node.attr().end()) { + AttrValue node_value = node_iterator->second; + if (node_value.has_tensor()) { + const PartialTensorShape node_shape(node_value.tensor().tensor_shape()); + if (node_shape.num_elements() < 0) { + return errors::FailedPrecondition( + "Saved model contains node \"", node.name(), "\" (op \"", + node.op(), "\") which initializes from a tensor with ", + node_shape.num_elements(), " elements"); + } + } + } else if (node.op() == "Const") { + return errors::FailedPrecondition( + "Saved model contains node \"", node.name(), + "\" which is a constant tensor but no value has been provided"); + } + } + return Status::OK(); +} + Status LoadMetaGraphIntoSession(const MetaGraphDef& meta_graph_def, const SessionOptions& session_options, std::unique_ptr* session) { Session* session_p = nullptr; TF_RETURN_IF_ERROR(NewSession(session_options, &session_p)); session->reset(session_p); + TF_RETURN_IF_ERROR(ValidateSavedTensors(meta_graph_def.graph_def())); return (*session)->Create(meta_graph_def.graph_def()); } diff --git a/tensorflow/cc/saved_model/saved_model_bundle_test.cc b/tensorflow/cc/saved_model/saved_model_bundle_test.cc index 9fc71552d6f..d6c375c7448 100644 --- a/tensorflow/cc/saved_model/saved_model_bundle_test.cc +++ b/tensorflow/cc/saved_model/saved_model_bundle_test.cc @@ -40,6 +40,10 @@ constexpr char kTestDataInitOpV2[] = "cc/saved_model/testdata/half_plus_two_v2/00000123"; constexpr char kTestDataV2DebugInfo[] = "cc/saved_model/testdata/x_plus_y_v2_debuginfo"; +constexpr char kTestFuzzGeneratedNegativeShape[] = + "cc/saved_model/testdata/fuzz_generated/negative_shape"; +constexpr char kTestFuzzGeneratedConstWithNoValue[] = + "cc/saved_model/testdata/fuzz_generated/const_with_no_value"; class LoaderTest : public ::testing::Test { protected: @@ -256,5 +260,29 @@ TEST_F(LoaderTest, SavedModelV2DebugInfo) { EXPECT_NE(bundle.debug_info.get(), nullptr); } +TEST_F(LoaderTest, NegativeShapeDimension) { + SavedModelBundle bundle; + RunOptions run_options; + SessionOptions session_options; + + const string export_dir = io::JoinPath(testing::TensorFlowSrcRoot(), + kTestFuzzGeneratedNegativeShape); + Status st = LoadSavedModel(session_options, run_options, export_dir, + {kSavedModelTagServe}, &bundle); + EXPECT_FALSE(st.ok()); +} + +TEST_F(LoaderTest, ConstNoValue) { + SavedModelBundle bundle; + RunOptions run_options; + SessionOptions session_options; + + const string export_dir = io::JoinPath(testing::TensorFlowSrcRoot(), + kTestFuzzGeneratedConstWithNoValue); + Status st = LoadSavedModel(session_options, run_options, export_dir, + {kSavedModelTagServe}, &bundle); + EXPECT_FALSE(st.ok()); +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/cc/saved_model/testdata/fuzz_generated/const_with_no_value b/tensorflow/cc/saved_model/testdata/fuzz_generated/const_with_no_value new file mode 100644 index 00000000000..438d52e8050 Binary files /dev/null and b/tensorflow/cc/saved_model/testdata/fuzz_generated/const_with_no_value differ diff --git a/tensorflow/cc/saved_model/testdata/fuzz_generated/negative_shape b/tensorflow/cc/saved_model/testdata/fuzz_generated/negative_shape new file mode 100644 index 00000000000..5ee5c360ce0 Binary files /dev/null and 
b/tensorflow/cc/saved_model/testdata/fuzz_generated/negative_shape differ diff --git a/tensorflow/compiler/aot/benchmark.h b/tensorflow/compiler/aot/benchmark.h index 266b7fefc7e..95bb7663b35 100644 --- a/tensorflow/compiler/aot/benchmark.h +++ b/tensorflow/compiler/aot/benchmark.h @@ -38,7 +38,7 @@ namespace benchmark { struct Options { // kDefaultMicros specifies the default time to run the benchmark, and is used // if neither max_iters nor max_micros is set. - static const int64 kDefaultMicros = 3000000; + static constexpr int64 kDefaultMicros = 3000000; int64 max_iters = 0; // Maximum iterations to run, ignored if <= 0. int64 max_micros = 0; // Maximum microseconds to run, ignored if <= 0. diff --git a/tensorflow/compiler/aot/codegen.cc b/tensorflow/compiler/aot/codegen.cc index c9a36b88795..e4df3090046 100644 --- a/tensorflow/compiler/aot/codegen.cc +++ b/tensorflow/compiler/aot/codegen.cc @@ -131,6 +131,7 @@ Status AddRewritesForShape(int i, const xla::Shape& shape, TF_RETURN_IF_ERROR(XLATypeToCpp(shape.element_type(), &type)); std::vector dim_vars; string dim_sizes, indices; + int count = 1; if (shape.rank() == 0 || (shape.dimensions_size() == 1 && shape.dimensions(0) == 1)) { dim_sizes = "[1]"; @@ -140,6 +141,7 @@ Status AddRewritesForShape(int i, const xla::Shape& shape, dim_vars.push_back(absl::StrCat("size_t dim", dim)); dim_sizes += absl::StrCat("[", shape.dimensions(dim), "]"); indices += absl::StrCat("[dim", dim, "]"); + count *= shape.dimensions(dim); } } rewrites->push_back({"{{I}}", absl::StrCat(i)}); @@ -147,6 +149,7 @@ Status AddRewritesForShape(int i, const xla::Shape& shape, rewrites->push_back({"{{DIM_VARS}}", absl::StrJoin(dim_vars, ", ")}); rewrites->push_back({"{{DIM_SIZES}}", dim_sizes}); rewrites->push_back({"{{INDICES}}", indices}); + rewrites->push_back({"{{COUNT}}", absl::StrCat(count)}); return Status::OK(); } @@ -199,6 +202,12 @@ Status GenArgMethods(const tf2xla::Config& config, return (*static_cast( arg_data({{I}}))){{INDICES}}; } + int arg{{NAME}}_size() const { + return {{COUNT}} * sizeof({{TYPE}}); + } + int arg{{NAME}}_count() const { + return {{COUNT}}; + } )"; *methods += RewriteWithName(absl::StrCat(i), code, rewrites); if (!config.feed(i).name().empty()) { @@ -246,6 +255,12 @@ Status GenResultMethods(const tf2xla::Config& config, return (*static_cast( result_data({{I}}))){{INDICES}}; } + int result{{NAME}}_size() const { + return {{COUNT}} * sizeof({{TYPE}}); + } + int result{{NAME}}_count() const { + return {{COUNT}}; + } )"; *methods += RewriteWithName(absl::StrCat(i), code, rewrites); if (!config.fetch(i).name().empty()) { @@ -281,6 +296,12 @@ Status GenVariableMethods(const tf2xla::Config& config, return (*static_cast( arg_data({{I}}))){{INDICES}}; } + int var_{{NAME}}_size() const { + return {{COUNT}} * sizeof({{TYPE}}); + } + int var_{{NAME}}_count() const { + return {{COUNT}}; + } )"; const tf2xla::Variable& var = config.variable(i - config.feed_size()); rewrites.emplace_back("{{MAYBE_CONST}}", var.readonly() ? 
"const " : ""); diff --git a/tensorflow/compiler/aot/codegen_test_h.golden b/tensorflow/compiler/aot/codegen_test_h.golden index af58ca233f0..d011279dbb7 100644 --- a/tensorflow/compiler/aot/codegen_test_h.golden +++ b/tensorflow/compiler/aot/codegen_test_h.golden @@ -138,6 +138,12 @@ class MyClass final : public tensorflow::XlaCompiledCpuFunction { return (*static_cast( arg_data(0)))[dim0][dim1]; } + int arg0_size() const { + return 2 * sizeof(float); + } + int arg0_count() const { + return 2; + } void set_arg_myfeed_data(const void* data) { set_arg_data(0, data); @@ -156,6 +162,12 @@ class MyClass final : public tensorflow::XlaCompiledCpuFunction { return (*static_cast( arg_data(0)))[dim0][dim1]; } + int arg_myfeed_size() const { + return 2 * sizeof(float); + } + int arg_myfeed_count() const { + return 2; + } void set_arg1_data(const void* data) { set_arg_data(1, data); @@ -174,6 +186,12 @@ class MyClass final : public tensorflow::XlaCompiledCpuFunction { return (*static_cast( arg_data(1)))[dim0][dim1]; } + int arg1_size() const { + return 12 * sizeof(tensorflow::int64); + } + int arg1_count() const { + return 12; + } // Result methods for managing output buffers. Buffers are in row-major order. // Must only be called after a successful Run call. There is a set of methods @@ -204,6 +222,12 @@ class MyClass final : public tensorflow::XlaCompiledCpuFunction { return (*static_cast( result_data(0)))[dim0][dim1]; } + int result0_size() const { + return 30 * sizeof(tensorflow::uint32); + } + int result0_count() const { + return 30; + } tensorflow::uint32* result_myfetch_data() { return static_cast(result_data(0)); @@ -219,6 +243,12 @@ class MyClass final : public tensorflow::XlaCompiledCpuFunction { return (*static_cast( result_data(0)))[dim0][dim1]; } + int result_myfetch_size() const { + return 30 * sizeof(tensorflow::uint32); + } + int result_myfetch_count() const { + return 30; + } // Methods for managing variable buffers. Buffers are in row-major order. // @@ -261,6 +291,12 @@ class MyClass final : public tensorflow::XlaCompiledCpuFunction { return (*static_cast( arg_data(2)))[0]; } + int var_myvar_readonly_size() const { + return 1 * sizeof(float); + } + int var_myvar_readonly_count() const { + return 1; + } void set_var_myvar_data(float* data) { set_arg_data(3, data); @@ -279,6 +315,12 @@ class MyClass final : public tensorflow::XlaCompiledCpuFunction { return (*static_cast( arg_data(3)))[0]; } + int var_myvar_size() const { + return 1 * sizeof(float); + } + int var_myvar_count() const { + return 1; + } void set_var_myvar2_data(tensorflow::int32* data) { set_arg_data(4, data); @@ -297,6 +339,12 @@ class MyClass final : public tensorflow::XlaCompiledCpuFunction { return (*static_cast( arg_data(4)))[dim0]; } + int var_myvar2_size() const { + return 5 * sizeof(tensorflow::int32); + } + int var_myvar2_count() const { + return 5; + } private: // Number of buffers for the compiled computation. 
diff --git a/tensorflow/compiler/aot/tfcompile.bzl b/tensorflow/compiler/aot/tfcompile.bzl index 35a054a1aab..f2b28e70ff1 100644 --- a/tensorflow/compiler/aot/tfcompile.bzl +++ b/tensorflow/compiler/aot/tfcompile.bzl @@ -20,7 +20,7 @@ load( "tf_cc_test", "tf_copts", ) -load("//tensorflow:tensorflow.bzl", "tfcompile_extra_flags") +load("//tensorflow:tensorflow.bzl", "tfcompile_target_cpu") def tf_library( name, @@ -38,10 +38,12 @@ def tf_library( tfcompile_tool = "//tensorflow/compiler/aot:tfcompile", include_standard_runtime_deps = True, enable_xla_hlo_profiling = False, + enable_tracemes = False, mlir_components = "None", deps = None, tags = []): - """Runs tfcompile to compile a TensorFlow graph into executable code. + """Runs tfcompile to compile a TensorFlow graph into executable code with fast + math enabled on cpu. Given an invocation of tf_library(name="foo", ...), generates the following build targets: @@ -89,6 +91,9 @@ def tf_library( enable_xla_hlo_profiling: Enable XLA HLO profiling in the generated program, and emit metadata that lets us pretty-print the gathered profile counters. + enable_tracemes: Tell tfcompile to generate calls to + TraceMe::Activity{Start|End} around HLO instructions that can be used by + Xprof to construct profiler timelines. mlir_components: When the value is "None", no components use MLIR. When the value is "Bridge", use MLIR to translate GraphDef to HLO. deps: a list of deps to include on the build rules for the generated @@ -183,13 +188,20 @@ def tf_library( # `find` on such an object. need_xla_data_proto = flags and flags.find("--gen_program_shape") != -1 - flags = tfcompile_extra_flags() + flags + target_cpu = tfcompile_target_cpu() + extra_flags = "--target_cpu=" + target_cpu + " " if target_cpu else " " + flags = extra_flags + flags if enable_xla_hlo_profiling: profiling_flag = "--xla_hlo_profile" else: profiling_flag = "" + if enable_tracemes: + traceme_flag = "--xla_cpu_enable_xprof_traceme=true" + else: + traceme_flag = "--xla_cpu_enable_xprof_traceme=false" + mlir_flag = "--mlir_components=" + mlir_components srcs = [tfcompile_graph, config] @@ -198,6 +210,15 @@ def tf_library( srcs.append(debug_info) debug_info_flag = " --debug_info=$(location " + debug_info + ")" + default_fast_math_xla_flags = ("XLA_FLAGS='" + + "--xla_cpu_enable_fast_math=true " + + "--xla_cpu_fast_math_honor_nans=false " + + "--xla_cpu_fast_math_honor_infs=false " + + "--xla_cpu_fast_math_honor_functions=false " + + "--xla_cpu_fast_math_honor_division=false " + + "--xla_cpu_enable_fast_min_max=true " + + "$${XLA_FLAGS:-}' ") + native.genrule( name = ("gen_" + name), srcs = srcs, @@ -207,6 +228,7 @@ def tf_library( function_object_file, ], cmd = ( + default_fast_math_xla_flags + "CUDA_VISIBLE_DEVICES='' " + "$(location " + tfcompile_tool + ")" + " --graph=$(location " + tfcompile_graph + ")" + @@ -218,7 +240,7 @@ def tf_library( " --out_header=$(@D)/" + header_file + " --out_metadata_object=$(@D)/" + metadata_object_file + " --out_function_object=$(@D)/" + function_object_file + - " " + flags + " " + profiling_flag + " " + mlir_flag + " " + flags + " " + profiling_flag + " " + mlir_flag + " " + traceme_flag ), tools = [tfcompile_tool], visibility = visibility, @@ -247,6 +269,7 @@ def tf_library( session_module_pb, ], cmd = ( + default_fast_math_xla_flags + "CUDA_VISIBLE_DEVICES='' " + "$(location " + tfcompile_tool + ")" + " --graph=$(location " + tfcompile_graph + ")" + diff --git a/tensorflow/compiler/aot/tfcompile_main.cc b/tensorflow/compiler/aot/tfcompile_main.cc index 
f0cf8f2ded9..846947454bb 100644 --- a/tensorflow/compiler/aot/tfcompile_main.cc +++ b/tensorflow/compiler/aot/tfcompile_main.cc @@ -67,6 +67,8 @@ int main(int argc, char** argv) { flags.entry_point = "entry"; flags.debug_info_path_begin_marker = ""; + // Note that tfcompile.bzl's tf_library macro sets fast math flags as that is + // generally the preferred case. std::vector flag_list; AppendMainFlags(&flag_list, &flags); xla::AppendDebugOptionsFlags(&flag_list); diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD index 28d922f9e3c..bc8fac0e88f 100644 --- a/tensorflow/compiler/jit/BUILD +++ b/tensorflow/compiler/jit/BUILD @@ -251,7 +251,7 @@ cc_library( visibility = [":friends"], deps = select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib", + "//tensorflow/core:portable_tensorflow_lib", ], "//conditions:default": [ "//tensorflow/core:graph", diff --git a/tensorflow/compiler/jit/build_xla_ops_pass.cc b/tensorflow/compiler/jit/build_xla_ops_pass.cc index 91e3483a8f0..5a57008cf61 100644 --- a/tensorflow/compiler/jit/build_xla_ops_pass.cc +++ b/tensorflow/compiler/jit/build_xla_ops_pass.cc @@ -34,6 +34,7 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/core/common_runtime/function.h" +#include "tensorflow/core/common_runtime/graph_constructor.h" #include "tensorflow/core/common_runtime/optimization_registry.h" #include "tensorflow/core/framework/graph_def_util.h" #include "tensorflow/core/framework/memory_types.h" @@ -41,7 +42,6 @@ limitations under the License. #include "tensorflow/core/framework/node_def_util.h" #include "tensorflow/core/graph/algorithm.h" #include "tensorflow/core/graph/graph.h" -#include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/hash/hash.h" #include "tensorflow/core/public/version.h" diff --git a/tensorflow/compiler/jit/cluster_scoping_pass_test.cc b/tensorflow/compiler/jit/cluster_scoping_pass_test.cc index 5798d519bd7..436d2f867c9 100644 --- a/tensorflow/compiler/jit/cluster_scoping_pass_test.cc +++ b/tensorflow/compiler/jit/cluster_scoping_pass_test.cc @@ -18,12 +18,12 @@ limitations under the License. #include "absl/container/flat_hash_map.h" #include "tensorflow/compiler/jit/defs.h" #include "tensorflow/compiler/jit/test_util.h" +#include "tensorflow/core/common_runtime/graph_constructor.h" +#include "tensorflow/core/common_runtime/graph_def_builder_util.h" #include "tensorflow/core/framework/node_def_util.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/graph/algorithm.h" -#include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/graph/graph_def_builder.h" -#include "tensorflow/core/graph/graph_def_builder_util.h" #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/public/session_options.h" diff --git a/tensorflow/compiler/jit/compilability_check_util.cc b/tensorflow/compiler/jit/compilability_check_util.cc index 363d9424e6f..6d4bc51f1b2 100644 --- a/tensorflow/compiler/jit/compilability_check_util.cc +++ b/tensorflow/compiler/jit/compilability_check_util.cc @@ -46,6 +46,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/common_runtime/function.h" +#include "tensorflow/core/common_runtime/graph_constructor.h" #include "tensorflow/core/framework/attr_value.pb.h" #include "tensorflow/core/framework/bounds_check.h" #include "tensorflow/core/framework/graph_def_util.h" @@ -55,7 +56,6 @@ limitations under the License. #include "tensorflow/core/framework/types.h" #include "tensorflow/core/graph/algorithm.h" #include "tensorflow/core/graph/control_flow.h" -#include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/lib/gtl/cleanup.h" #include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/public/version.h" diff --git a/tensorflow/compiler/jit/compilability_check_util.h b/tensorflow/compiler/jit/compilability_check_util.h index 9c06f023643..a21cb6b98dd 100644 --- a/tensorflow/compiler/jit/compilability_check_util.h +++ b/tensorflow/compiler/jit/compilability_check_util.h @@ -33,6 +33,7 @@ limitations under the License. #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/common_runtime/function.h" +#include "tensorflow/core/common_runtime/graph_constructor.h" #include "tensorflow/core/framework/attr_value.pb.h" #include "tensorflow/core/framework/bounds_check.h" #include "tensorflow/core/framework/function.h" @@ -45,7 +46,6 @@ limitations under the License. #include "tensorflow/core/graph/algorithm.h" #include "tensorflow/core/graph/control_flow.h" #include "tensorflow/core/graph/graph.h" -#include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/lib/gtl/cleanup.h" #include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/public/version.h" diff --git a/tensorflow/compiler/jit/compilability_check_util_test.cc b/tensorflow/compiler/jit/compilability_check_util_test.cc index e6e49ae7957..3ea38e69ad9 100644 --- a/tensorflow/compiler/jit/compilability_check_util_test.cc +++ b/tensorflow/compiler/jit/compilability_check_util_test.cc @@ -21,12 +21,12 @@ limitations under the License. #include "tensorflow/cc/ops/standard_ops.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/core/common_runtime/graph_def_builder_util.h" #include "tensorflow/core/framework/attr_value.pb.h" #include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/graph_to_functiondef.h" #include "tensorflow/core/framework/node_def_util.h" #include "tensorflow/core/graph/graph_def_builder.h" -#include "tensorflow/core/graph/graph_def_builder_util.h" #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/platform/test.h" diff --git a/tensorflow/compiler/jit/deadness_analysis_test.cc b/tensorflow/compiler/jit/deadness_analysis_test.cc index 17438935af5..a2d966efea8 100644 --- a/tensorflow/compiler/jit/deadness_analysis_test.cc +++ b/tensorflow/compiler/jit/deadness_analysis_test.cc @@ -25,12 +25,11 @@ limitations under the License. 
#include "tensorflow/compiler/jit/defs.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/core/common_runtime/graph_constructor.h" #include "tensorflow/core/framework/node_def_util.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/graph/algorithm.h" -#include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/graph/graph_def_builder.h" -#include "tensorflow/core/graph/graph_def_builder_util.h" #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/platform/test.h" diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc index 770526f61a3..6640a5d5dba 100644 --- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc +++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc @@ -28,9 +28,9 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/side_effect_util.h" #include "tensorflow/core/common_runtime/device_factory.h" #include "tensorflow/core/common_runtime/function.h" +#include "tensorflow/core/common_runtime/graph_constructor.h" #include "tensorflow/core/framework/function_testlib.h" #include "tensorflow/core/framework/graph_to_functiondef.h" -#include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/graph/graph_def_builder.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status_test_util.h" diff --git a/tensorflow/compiler/jit/encapsulate_xla_computations_pass_test.cc b/tensorflow/compiler/jit/encapsulate_xla_computations_pass_test.cc index 192e1c7b324..cc177036591 100644 --- a/tensorflow/compiler/jit/encapsulate_xla_computations_pass_test.cc +++ b/tensorflow/compiler/jit/encapsulate_xla_computations_pass_test.cc @@ -21,8 +21,8 @@ limitations under the License. #include "tensorflow/compiler/jit/encapsulate_subgraphs_pass.h" #include "tensorflow/compiler/tf2xla/cc/ops/xla_jit_ops.h" #include "tensorflow/compiler/tf2xla/test_util.h" +#include "tensorflow/core/common_runtime/graph_constructor.h" #include "tensorflow/core/framework/graph_to_functiondef.h" -#include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/lib/hash/hash.h" #include "tensorflow/core/lib/strings/proto_serialization.h" diff --git a/tensorflow/compiler/jit/force_xla_constants_on_host_pass_test.cc b/tensorflow/compiler/jit/force_xla_constants_on_host_pass_test.cc index 477539865f8..93776be446c 100644 --- a/tensorflow/compiler/jit/force_xla_constants_on_host_pass_test.cc +++ b/tensorflow/compiler/jit/force_xla_constants_on_host_pass_test.cc @@ -24,9 +24,9 @@ limitations under the License. 
#include "tensorflow/compiler/jit/defs.h" #include "tensorflow/compiler/jit/test_util.h" #include "tensorflow/core/common_runtime/function.h" +#include "tensorflow/core/common_runtime/graph_constructor.h" #include "tensorflow/core/framework/function_testlib.h" #include "tensorflow/core/framework/node_def_util.h" -#include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/graph/graph_def_builder.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status_test_util.h" diff --git a/tensorflow/compiler/jit/kernels/xla_ops.cc b/tensorflow/compiler/jit/kernels/xla_ops.cc index c64f4d32535..0fc1a349adc 100644 --- a/tensorflow/compiler/jit/kernels/xla_ops.cc +++ b/tensorflow/compiler/jit/kernels/xla_ops.cc @@ -358,13 +358,6 @@ void XlaLocalLaunchBase::Compute(OpKernelContext* ctx) { ctx, function_, /*has_ref_vars=*/has_ref_vars_, platform_info_, resources_, constants_, /*lazy=*/false, &client, &variables, &kernel, &executable); - if (!s.ok() && (platform_info_.device_type().type_string() == DEVICE_CPU || - platform_info_.device_type().type_string() == DEVICE_GPU)) { - // Suggest auto jit if the failure was with GPU or CPU. - errors::AppendToMessage(&s, - xla::status_macros::kPossibleAutoJitAlternative); - } - OP_REQUIRES_OK(ctx, s); } diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc index 77496fe7960..174250f18bd 100644 --- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc +++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc @@ -40,6 +40,7 @@ limitations under the License. #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/common_runtime/function.h" +#include "tensorflow/core/common_runtime/graph_constructor.h" #include "tensorflow/core/framework/bounds_check.h" #include "tensorflow/core/framework/graph_def_util.h" #include "tensorflow/core/framework/memory_types.h" @@ -49,7 +50,6 @@ limitations under the License. #include "tensorflow/core/framework/types.h" #include "tensorflow/core/graph/algorithm.h" #include "tensorflow/core/graph/control_flow.h" -#include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/lib/gtl/cleanup.h" #include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/public/version.h" @@ -1891,6 +1891,7 @@ absl::flat_hash_set GetKnownXLAWhitelistOp() { "DynamicStitch", "Einsum", "EmptyTensorList", + "EnsureShape", "ExtractImagePatches", "Igamma", "IgammaGradA", @@ -2077,6 +2078,8 @@ absl::flat_hash_set GetKnownXLAWhitelistOp() { "XlaSend", "XlaSharding", "XlaSort", + "XlaSpmdFullToShardShape", + "XlaSpmdShardToFullShape", "XlaSvd", "XlaWhile", "_Arg", diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc index c670f2e54f1..0e1cc2d19fe 100644 --- a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc +++ b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc @@ -13,8 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.h" - #include "absl/container/flat_hash_map.h" #include "absl/memory/memory.h" #include "absl/strings/match.h" @@ -28,15 +26,16 @@ limitations under the License. 
#include "tensorflow/cc/ops/sendrecv_ops.h" #include "tensorflow/cc/ops/standard_ops.h" #include "tensorflow/compiler/jit/defs.h" +#include "tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.h" #include "tensorflow/compiler/jit/node_matchers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/core/common_runtime/graph_constructor.h" +#include "tensorflow/core/common_runtime/graph_def_builder_util.h" #include "tensorflow/core/framework/node_def_util.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/graph/algorithm.h" -#include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/graph/graph_def_builder.h" -#include "tensorflow/core/graph/graph_def_builder_util.h" #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/platform/test.h" diff --git a/tensorflow/compiler/jit/partially_decluster_pass_test.cc b/tensorflow/compiler/jit/partially_decluster_pass_test.cc index d352ec8977b..7378d17f88d 100644 --- a/tensorflow/compiler/jit/partially_decluster_pass_test.cc +++ b/tensorflow/compiler/jit/partially_decluster_pass_test.cc @@ -28,14 +28,14 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/cc/ops/xla_ops.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/core/common_runtime/graph_constructor.h" +#include "tensorflow/core/common_runtime/graph_def_builder_util.h" #include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/function.pb.h" #include "tensorflow/core/framework/node_def_util.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/graph/algorithm.h" -#include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/graph/graph_def_builder.h" -#include "tensorflow/core/graph/graph_def_builder_util.h" #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/platform/test.h" diff --git a/tensorflow/compiler/jit/resource_operation_safety_analysis_test.cc b/tensorflow/compiler/jit/resource_operation_safety_analysis_test.cc index 67304412fd3..5529a7cbc72 100644 --- a/tensorflow/compiler/jit/resource_operation_safety_analysis_test.cc +++ b/tensorflow/compiler/jit/resource_operation_safety_analysis_test.cc @@ -26,12 +26,11 @@ limitations under the License. 
#include "tensorflow/compiler/jit/defs.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/core/common_runtime/graph_constructor.h" #include "tensorflow/core/framework/node_def_util.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/graph/algorithm.h" -#include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/graph/graph_def_builder.h" -#include "tensorflow/core/graph/graph_def_builder_util.h" #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/platform/test.h" diff --git a/tensorflow/compiler/jit/tests/BUILD b/tensorflow/compiler/jit/tests/BUILD index 15fb2f3ffc3..412dfefb9b7 100644 --- a/tensorflow/compiler/jit/tests/BUILD +++ b/tensorflow/compiler/jit/tests/BUILD @@ -18,6 +18,7 @@ cc_library( "//tensorflow/compiler/jit:xla_gpu_jit", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", + "//tensorflow/core:core_cpu", "//tensorflow/core:framework", "//tensorflow/core:graph", "//tensorflow/core:lib", diff --git a/tensorflow/compiler/jit/tests/auto_clustering_test_helper.cc b/tensorflow/compiler/jit/tests/auto_clustering_test_helper.cc index 726f7f0b068..cf6d86cde7c 100644 --- a/tensorflow/compiler/jit/tests/auto_clustering_test_helper.cc +++ b/tensorflow/compiler/jit/tests/auto_clustering_test_helper.cc @@ -20,7 +20,7 @@ limitations under the License. #include "tensorflow/compiler/jit/xla_cluster_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/statusor.h" -#include "tensorflow/core/graph/graph_constructor.h" +#include "tensorflow/core/common_runtime/graph_constructor.h" #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/lib/io/random_inputstream.h" #include "tensorflow/core/lib/io/zlib_compression_options.h" diff --git a/tensorflow/compiler/jit/tests/auto_clustering_test_helper.h b/tensorflow/compiler/jit/tests/auto_clustering_test_helper.h index d59b220ca45..c30cf7b42a3 100644 --- a/tensorflow/compiler/jit/tests/auto_clustering_test_helper.h +++ b/tensorflow/compiler/jit/tests/auto_clustering_test_helper.h @@ -16,7 +16,7 @@ limitations under the License. #define TENSORFLOW_COMPILER_JIT_TESTS_AUTO_CLUSTERING_TEST_HELPER_H_ #include "tensorflow/compiler/xla/statusor.h" -#include "tensorflow/core/graph/graph_constructor.h" +#include "tensorflow/core/common_runtime/graph_constructor.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/test_benchmark.h" diff --git a/tensorflow/compiler/jit/xla_cluster_util_test.cc b/tensorflow/compiler/jit/xla_cluster_util_test.cc index 6333499b0c8..edb7f78cb1b 100644 --- a/tensorflow/compiler/jit/xla_cluster_util_test.cc +++ b/tensorflow/compiler/jit/xla_cluster_util_test.cc @@ -23,11 +23,11 @@ limitations under the License. 
#include "tensorflow/cc/ops/functional_ops.h" #include "tensorflow/cc/ops/standard_ops.h" #include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/core/common_runtime/graph_constructor.h" #include "tensorflow/core/common_runtime/process_function_library_runtime.h" #include "tensorflow/core/framework/function_testlib.h" #include "tensorflow/core/framework/graph_to_functiondef.h" #include "tensorflow/core/graph/algorithm.h" -#include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/graph/graph_def_builder.h" #include "tensorflow/core/graph/testlib.h" #include "tensorflow/core/lib/core/status_test_util.h" diff --git a/tensorflow/compiler/jit/xla_compilation_cache.cc b/tensorflow/compiler/jit/xla_compilation_cache.cc index b51749bc332..62b0c0ab4cf 100644 --- a/tensorflow/compiler/jit/xla_compilation_cache.cc +++ b/tensorflow/compiler/jit/xla_compilation_cache.cc @@ -31,16 +31,17 @@ limitations under the License. #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/common_runtime/function.h" +#include "tensorflow/core/common_runtime/graph_constructor.h" #include "tensorflow/core/common_runtime/graph_optimizer.h" #include "tensorflow/core/common_runtime/metrics.h" #include "tensorflow/core/framework/attr_value_util.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/graph/algorithm.h" -#include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/graph/node_builder.h" #include "tensorflow/core/lib/hash/hash.h" #include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/protobuf/graph_debug_info.pb.h" #include "tensorflow/core/public/version.h" @@ -277,29 +278,25 @@ Status XlaCompilationCache::CompileSingleOp( const NodeDef& node_def = ctx->op_kernel().def(); TF_ASSIGN_OR_RETURN(auto graph, CreateGraph(node_def, args, result_dtypes)); - bool are_params = absl::c_all_of(args, [](const XlaCompiler::Argument arg) { - return arg.kind == XlaCompiler::Argument::kParameter; - }); + bool are_args_supported = + absl::c_all_of(args, [](const XlaCompiler::Argument arg) { + return arg.kind == XlaCompiler::Argument::kConstant || + arg.kind == XlaCompiler::Argument::kParameter; + }); const ConfigProto* config = ctx->function_library()->config_proto(); bool use_mlir = config && config->experimental().enable_mlir_bridge(); - // Use MLIR bridge if all the arguments are parameters. - // TODO(hinsu): Support other argument types instead of silently falling - // back to the XLA compiler. - if (!are_params || !use_mlir) { + // TODO(b/155596779): Understand the source of other argument types and + // depending on the source either support those or avoid these codepath. 
+ if (!use_mlir || !are_args_supported) { return compiler->CompileGraph(compile_options, node_def.name(), std::move(graph), args, result); } - absl::InlinedVector arg_shapes; - arg_shapes.reserve(args.size()); - for (const XlaCompiler::Argument& arg : args) { - arg_shapes.push_back(absl::get(arg.shape)); - } GraphDebugInfo debug_info; return CompileGraphToXlaHlo( - *graph, {arg_shapes.data(), arg_shapes.size()}, - options.device_type.type_string(), compile_options.use_tuple_arg, - *options.flib_def, debug_info, options.shape_representation_fn, result); + *graph, {args.data(), args.size()}, options.device_type.type_string(), + compile_options.use_tuple_arg, *options.flib_def, debug_info, + options.shape_representation_fn, result); }; return CompileImpl(options, name, args, compile_op, /*compile_threshold=*/absl::nullopt, diff --git a/tensorflow/compiler/jit/xla_compile_on_demand_op.cc b/tensorflow/compiler/jit/xla_compile_on_demand_op.cc index 45ce68ba9c0..e1ad0e8c5af 100644 --- a/tensorflow/compiler/jit/xla_compile_on_demand_op.cc +++ b/tensorflow/compiler/jit/xla_compile_on_demand_op.cc @@ -145,16 +145,9 @@ Status XlaCompileOnDemandOp::Compile( attrs.set_on_host(true); TF_RETURN_IF_ERROR(ctx->allocate_temp( device_tensor.dtype(), device_tensor.shape(), &host_tensor, attrs)); - Notification n; - Status status; - ctx->op_device_context()->CopyDeviceTensorToCPU( + Status status = ctx->op_device_context()->CopyDeviceTensorToCPUSync( &device_tensor, "ConstantArgument", - reinterpret_cast(ctx->device()), &host_tensor, - [&](Status s) { - status = s; - n.Notify(); - }); - n.WaitForNotification(); + reinterpret_cast(ctx->device()), &host_tensor); if (!status.ok()) { LOG(ERROR) << "Copying tensor of shape " << device_tensor.shape().DebugString() << " from " diff --git a/tensorflow/compiler/jit/xla_device.cc b/tensorflow/compiler/jit/xla_device.cc index 0cc462678b1..abb42aa1815 100644 --- a/tensorflow/compiler/jit/xla_device.cc +++ b/tensorflow/compiler/jit/xla_device.cc @@ -35,6 +35,7 @@ limitations under the License. #include "tensorflow/core/common_runtime/device_factory.h" #include "tensorflow/core/common_runtime/dma_helper.h" #include "tensorflow/core/common_runtime/function.h" +#include "tensorflow/core/common_runtime/graph_constructor.h" #include "tensorflow/core/common_runtime/renamed_device.h" #include "tensorflow/core/framework/allocator.h" #include "tensorflow/core/framework/device_base.h" @@ -45,7 +46,6 @@ limitations under the License. 
#include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor.pb.h" #include "tensorflow/core/framework/types.h" -#include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/lib/core/notification.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/logging.h" @@ -488,15 +488,8 @@ Status XlaDevice::MakeTensorFromProto(XlaDeviceContext* device_context, mutex_lock lock(mu_); Allocator* allocator = GetAllocatorLocked(alloc_attrs); Tensor copy(allocator, parsed.dtype(), parsed.shape()); - Notification n; - device_context->CopyCPUTensorToDevice( - &parsed, this, ©, - [&n, &status](const Status& s) { - status = s; - n.Notify(); - }, - true /*sync_dst_compute*/); - n.WaitForNotification(); + TF_RETURN_IF_ERROR( + device_context->CopyCPUTensorToDeviceSync(&parsed, this, ©)); *tensor = copy; } VLOG(2) << "Allocated tensor at " << DMAHelper::base(tensor); diff --git a/tensorflow/compiler/jit/xla_device_context.cc b/tensorflow/compiler/jit/xla_device_context.cc index 4948fc9965f..e1cef25e33e 100644 --- a/tensorflow/compiler/jit/xla_device_context.cc +++ b/tensorflow/compiler/jit/xla_device_context.cc @@ -69,6 +69,7 @@ absl::optional XlaDeviceAllocator::GetStats() { tf_stats.bytes_reserved = se_stats->bytes_reserved; tf_stats.peak_bytes_reserved = se_stats->peak_bytes_reserved; tf_stats.bytes_reservable_limit = se_stats->bytes_reservable_limit; + tf_stats.largest_free_block_bytes = se_stats->largest_free_block_bytes; return tf_stats; } diff --git a/tensorflow/compiler/jit/xla_device_ops.h b/tensorflow/compiler/jit/xla_device_ops.h index 34ff0c55615..17e4226405a 100644 --- a/tensorflow/compiler/jit/xla_device_ops.h +++ b/tensorflow/compiler/jit/xla_device_ops.h @@ -180,12 +180,10 @@ class XlaAssignVariableOp : public OpKernel { data::MakeIteratorOp); \ REGISTER_KERNEL_BUILDER(Name("AnonymousIterator").Device(DEVICE), \ data::AnonymousIteratorHandleOp); \ - REGISTER_KERNEL_BUILDER( \ - Name("AnonymousIteratorV2").Device(DEVICE).HostMemory("deleter"), \ - data::AnonymousIteratorHandleOp); \ - REGISTER_KERNEL_BUILDER( \ - Name("DeleteIterator").Device(DEVICE).HostMemory("deleter"), \ - data::DeleteIteratorOp); \ + REGISTER_KERNEL_BUILDER(Name("AnonymousIteratorV2").Device(DEVICE), \ + data::AnonymousIteratorHandleOp); \ + REGISTER_KERNEL_BUILDER(Name("DeleteIterator").Device(DEVICE), \ + data::DeleteIteratorOp); \ REGISTER_KERNEL_BUILDER(Name("IteratorGetNext").Device(DEVICE), \ data::IteratorGetNextOp); \ REGISTER_KERNEL_BUILDER(Name("IteratorGetNextAsOptional").Device(DEVICE), \ diff --git a/tensorflow/compiler/jit/xla_launch_util.cc b/tensorflow/compiler/jit/xla_launch_util.cc index 402a5990a25..e0ec990462b 100644 --- a/tensorflow/compiler/jit/xla_launch_util.cc +++ b/tensorflow/compiler/jit/xla_launch_util.cc @@ -479,6 +479,12 @@ Status XlaComputationLaunchContext::PopulateOutputs( input_output_alias, output_num, ctx, i, shape, &output, definition_event, stream, use_multiple_streams_)); } else { + if (type == DT_VARIANT) { + return errors::Unimplemented( + "Support for TensorList crossing the XLA/TF boundary " + "is not implemented"); + } + se::DeviceMemoryBase buffer = output.buffer({output_num}); Tensor output_tensor = GetOrCreateTensorForOutput( output_num, ctx, missing_ctx_input_prefix, input_output_alias, diff --git a/tensorflow/compiler/mlir/BUILD b/tensorflow/compiler/mlir/BUILD index bc4094bbad1..c0066ecda03 100644 --- a/tensorflow/compiler/mlir/BUILD +++ b/tensorflow/compiler/mlir/BUILD @@ -48,7 +48,6 @@ cc_library( 
"@llvm-project//mlir:MlirOptLib", "@llvm-project//mlir:Pass", "@llvm-project//mlir:Support", - "@llvm-project//mlir/test:TestTransforms", ], ) @@ -77,6 +76,7 @@ cc_library( "//tensorflow/compiler/mlir/tensorflow:tensorflow_test_passes", "//tensorflow/compiler/mlir/tensorflow:tf_dialect_passes", "//tensorflow/compiler/mlir/tensorflow:tf_legalize_hlo", + "//tensorflow/compiler/mlir/tfjs:tensorflow_js_passes", ], ) diff --git a/tensorflow/compiler/mlir/glob_lit_test.bzl b/tensorflow/compiler/mlir/glob_lit_test.bzl index d69560220f2..9f6856f3636 100644 --- a/tensorflow/compiler/mlir/glob_lit_test.bzl +++ b/tensorflow/compiler/mlir/glob_lit_test.bzl @@ -26,7 +26,7 @@ _ALWAYS_EXCLUDE = [ "**/* */**", ] -def _run_lit_test(name, data, size, tags, driver, features): +def _run_lit_test(name, data, size, tags, driver, features, exec_properties): """Runs lit on all tests it can find in `data` under tensorflow/compiler/mlir. Note that, due to Bazel's hermetic builds, lit only sees the tests that @@ -64,6 +64,7 @@ def _run_lit_test(name, data, size, tags, driver, features): ], size = size, main = "lit.py", + exec_properties = exec_properties, ) def glob_lit_tests( @@ -76,7 +77,8 @@ def glob_lit_tests( default_tags = _default_tags, tags_override = {}, driver = _default_driver, - features = []): + features = [], + exec_properties = {}): """Creates all plausible Lit tests (and their inputs) under this directory. Args: @@ -92,6 +94,7 @@ def glob_lit_tests( Note: use of a custom driver is not currently supported and specifying a default driver will abort the tests. features: [str], list of extra features to enable. + exec_properties: a dictionary of properties to pass on. """ # Ignore some patterns by default for tests and input data. @@ -115,6 +118,7 @@ def glob_lit_tests( tags = default_tags + tags_override.pop(curr_test, []), driver = driver, features = features, + exec_properties = exec_properties, ) def lit_test( @@ -123,7 +127,8 @@ def lit_test( size = _default_size, tags = _default_tags, driver = _default_driver, - features = []): + features = [], + exec_properties = {}): """Runs test files under lit. Args: @@ -136,4 +141,4 @@ def lit_test( and specifying a default driver will abort the tests. features: [str], list of extra features to enable. 
""" - _run_lit_test(name + ".test", data + [name], size, tags, driver, features) + _run_lit_test(name + ".test", data + [name], size, tags, driver, features, exec_properties) diff --git a/tensorflow/compiler/mlir/lite/BUILD b/tensorflow/compiler/mlir/lite/BUILD index 6705db29105..9b5b0c209e5 100644 --- a/tensorflow/compiler/mlir/lite/BUILD +++ b/tensorflow/compiler/mlir/lite/BUILD @@ -31,7 +31,7 @@ filegroup( "//tensorflow/compiler/mlir/lite/quantization:quantization_td_files", "@llvm-project//mlir:OpBaseTdFiles", "@llvm-project//mlir:include/mlir/Interfaces/LoopLikeInterface.td", - "@llvm-project//mlir:include/mlir/Interfaces/SideEffects.td", + "@llvm-project//mlir:include/mlir/Interfaces/SideEffectInterfaces.td", ], ) @@ -296,11 +296,9 @@ cc_library( name = "tensorflow_lite_legalize_tf", srcs = [ "transforms/dilated_conv.cc", - "transforms/extract_ophint.cc", "transforms/generated_legalize_tf.inc", "transforms/generated_lower_static_tensor_list.inc", "transforms/generated_prepare_tf.inc", - "transforms/legalize_ophint_func_op.cc", "transforms/legalize_tf.cc", "transforms/legalize_tf_while.cc", "transforms/lower_static_tensor_list.cc", @@ -419,12 +417,14 @@ cc_library( ], deps = [ ":tensorflow_lite", + "//tensorflow/lite/tools/optimize/sparsity:format_converter", "@com_google_absl//absl/base", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", "@llvm-project//llvm:support", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", + "@llvm-project//mlir:StandardOps", ], alwayslink = 1, ) @@ -512,7 +512,7 @@ cc_library( ], deps = [ ":tensorflow_lite", - "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", "//tensorflow/compiler/xla:statusor", "//tensorflow/core/platform:errors", "//tensorflow/core/platform:status", @@ -523,7 +523,6 @@ cc_library( "@flatbuffers", "@llvm-project//llvm:analysis", "@llvm-project//llvm:support", - "@llvm-project//mlir:AllPassesAndDialects", "@llvm-project//mlir:IR", "@llvm-project//mlir:TransformUtils", ], @@ -562,19 +561,16 @@ cc_library( ) cc_library( - name = "flatbuffer_translate_lib", + name = "flatbuffer_export", srcs = [ "flatbuffer_export.cc", - "flatbuffer_import.cc", - "utils/convert_type.cc", ], hdrs = [ "flatbuffer_export.h", "flatbuffer_export_flags.h", - "flatbuffer_import.h", - "utils/convert_type.h", ], deps = [ + ":convert_type", ":flatbuffer_tflite_operator_lib", ":stateful_ops_utils", ":tensorflow_lite", @@ -592,14 +588,12 @@ cc_library( "//tensorflow/core/platform:errors", "//tensorflow/core/platform:logging", "//tensorflow/core/platform:status", - "//tensorflow/lite:framework", "//tensorflow/lite:schema_fbs_version", "//tensorflow/lite:string_util", "//tensorflow/lite/delegates/flex:whitelisted_flex_ops_lib", "//tensorflow/lite/kernels/internal:kernel_utils", "//tensorflow/lite/schema:schema_fbs", "//tensorflow/lite/tools/versioning", - "@com_google_absl//absl/base", "@com_google_absl//absl/base:core_headers", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:flat_hash_set", @@ -614,6 +608,78 @@ cc_library( ], ) +cc_library( + name = "flatbuffer_import", + srcs = [ + "flatbuffer_import.cc", + ], + hdrs = [ + "flatbuffer_import.h", + ], + deps = [ + ":convert_type", + ":flatbuffer_tflite_operator_lib", + ":tensorflow_lite", + ":tensorflow_lite_dialect_registration", + "//tensorflow/compiler/mlir/tensorflow:mangling_util", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", + "//tensorflow/compiler/xla:statusor", + 
"//tensorflow/core:protos_all_cc", + "//tensorflow/core/platform:errors", + "//tensorflow/core/platform:status", + "//tensorflow/lite:framework", + "//tensorflow/lite/schema:schema_fbs", + "@com_google_absl//absl/base", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/strings", + "@llvm-project//llvm:support", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:QuantOps", + "@llvm-project//mlir:StandardOps", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:Translation", + ], +) + +cc_library( + name = "convert_type", + srcs = [ + "utils/convert_type.cc", + ], + hdrs = [ + "utils/convert_type.h", + ], + deps = [ + "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/platform:errors", + "//tensorflow/lite/schema:schema_fbs", + "@llvm-project//mlir:IR", + ], +) + +cc_library( + name = "flatbuffer_translate_lib", + hdrs = [ + "flatbuffer_export.h", + "flatbuffer_export_flags.h", + "flatbuffer_import.h", + "utils/convert_type.h", + ], + deps = [ + ":flatbuffer_export", + ":flatbuffer_import", + "//tensorflow/compiler/mlir:op_or_arg_name_mapper", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/core:protos_all_cc", + "//tensorflow/lite/schema:schema_fbs", + "@com_google_absl//absl/strings", + "@llvm-project//mlir:IR", + ], +) + cc_library( name = "flatbuffer_translate_registeration", srcs = [ @@ -629,9 +695,9 @@ cc_library( "@com_google_absl//absl/strings", "@llvm-project//llvm:support", "@llvm-project//mlir:IR", - "@llvm-project//mlir:LoopOpsTransforms", "@llvm-project//mlir:MlirTranslateMain", "@llvm-project//mlir:QuantOps", + "@llvm-project//mlir:SCFTransforms", "@llvm-project//mlir:StandardOps", "@llvm-project//mlir:Support", "@llvm-project//mlir:Translation", @@ -643,6 +709,8 @@ tf_cc_binary( name = "flatbuffer_translate", deps = [ ":flatbuffer_translate_registeration", + # TODO(b/155809683): Link only necessary dialects. + "@llvm-project//mlir:AllPassesAndDialects", ], ) @@ -691,6 +759,13 @@ tf_cc_binary( ":tf_tfl_passes", ":tf_tfl_translate_cl_options", ":tf_to_tfl_flatbuffer", + "@com_google_absl//absl/strings", + "@llvm-project//llvm:support", + # TODO(b/155809683): Link only necessary dialects. + "@llvm-project//mlir:AllPassesAndDialects", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Support", "//tensorflow/compiler/mlir:init_mlir", "//tensorflow/compiler/mlir/tensorflow:translate_cl_options", "//tensorflow/core:protos_all_cc", @@ -698,11 +773,6 @@ tf_cc_binary( "//tensorflow/lite:framework", "//tensorflow/lite/schema:schema_fbs", "//tensorflow/stream_executor/lib", - "@com_google_absl//absl/strings", - "@llvm-project//llvm:support", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:Pass", - "@llvm-project//mlir:Support", ], ) @@ -714,17 +784,19 @@ tf_cc_binary( deps = [ ":flatbuffer_translate_lib", ":flatbuffer_translate_registeration", + "@com_google_absl//absl/strings", + "@llvm-project//llvm:support", + # TODO(b/155809683): Link only necessary dialects. 
+ "@llvm-project//mlir:AllPassesAndDialects", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Parser", + "@llvm-project//mlir:Support", "//tensorflow/compiler/mlir/tensorflow:mlir_roundtrip_flags", "//tensorflow/core:lib", "//tensorflow/core/platform:logging", "//tensorflow/lite:framework", "//tensorflow/lite/delegates/flex:delegate", "//tensorflow/lite/kernels:builtin_ops", - "@com_google_absl//absl/strings", - "@llvm-project//llvm:support", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:Parser", - "@llvm-project//mlir:Support", ], ) diff --git a/tensorflow/compiler/mlir/lite/experimental/estimators/BUILD b/tensorflow/compiler/mlir/lite/experimental/estimators/BUILD index 79ee35f83fc..04d5d3db918 100644 --- a/tensorflow/compiler/mlir/lite/experimental/estimators/BUILD +++ b/tensorflow/compiler/mlir/lite/experimental/estimators/BUILD @@ -9,7 +9,9 @@ cc_library( name = "cost_estimators", textual_hdrs = [ "estimator.h", + "cpu_estimators.h", "gpu_estimators.h", "hardware.h", + "arithmetic_count_util.h", ], ) diff --git a/tensorflow/compiler/mlir/lite/experimental/estimators/arithmetic_count_util.h b/tensorflow/compiler/mlir/lite/experimental/estimators/arithmetic_count_util.h new file mode 100644 index 00000000000..2ca49e4e1e5 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/experimental/estimators/arithmetic_count_util.h @@ -0,0 +1,76 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_ESTIMATORS_ARITHMETIC_COUNT_UTIL_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_ESTIMATORS_ARITHMETIC_COUNT_UTIL_H_ + +// For add/mul/div/sub and other broadcastable ops. +class ArithmeticCountUtilHelper { + public: + static bool GetArithmeticCountForBroadcastableOp(mlir::Operation* op, + int64_t* count) { + auto output = op->getResult(0); + auto output_type = output.getType().dyn_cast_or_null(); + if (!output_type || !output_type.hasStaticShape()) return false; + + *count = output_type.getNumElements(); + return true; + } + + static bool GetInputTensorTotalSize(mlir::Operation* op, int64_t* count) { + int64_t total_count = 0; + for (auto input : op->getOperands()) { + auto input_type = input.getType().dyn_cast_or_null(); + if (!input_type || !input_type.hasStaticShape()) { + return false; + } + total_count += input_type.getNumElements(); + } + *count = total_count; + return true; + } + + // For conv2d/depthwise_conv/fully_connected ops. 
+ // This algorithm actually comes from TOCO tooling_util.cc + static bool GetArithmeticCountForConvAndFullyconnectedOp(Operation* op, + int64_t* count) { + auto weight = op->getOperand(1); + auto weight_type = weight.getType().dyn_cast_or_null(); + if (weight_type == nullptr || !weight_type.hasStaticShape()) return false; + + auto output = op->getResult(0); + auto output_type = output.getType().dyn_cast_or_null(); + if (output_type == nullptr || !output_type.hasStaticShape()) return false; + + int64_t cols = 1; + for (int i = 0; i < output_type.getRank() - 1; ++i) { + cols *= output_type.getDimSize(i); + } + const int64_t cost_per_col = 2 * weight_type.getNumElements(); + + *count = 2 * cost_per_col * cols; + + auto bias = op->getOperand(2); + if (bias) { + auto bias_type = bias.getType().dyn_cast_or_null(); + if (bias_type && bias_type.hasStaticShape()) { + *count += bias_type.getNumElements(); + } + } + + return true; + } +}; + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_ESTIMATORS_ARITHMETIC_COUNT_UTIL_H_ diff --git a/tensorflow/compiler/mlir/lite/experimental/estimators/cpu_estimators.h b/tensorflow/compiler/mlir/lite/experimental/estimators/cpu_estimators.h new file mode 100644 index 00000000000..b47c08c7cb4 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/experimental/estimators/cpu_estimators.h @@ -0,0 +1,149 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_ESTIMATORS_CPU_ESTIMATORS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_ESTIMATORS_CPU_ESTIMATORS_H_ + +// CPU +constexpr float kCPUArithmeticUnitCost = 1.0; + +// This basically assumes pure load/store. This is just fake data. +constexpr float kCPUCopyUnitCost = 0.5; +constexpr float kCPUDefaultCost = 3.0f; + +// Default values. 
+constexpr float kCPUDefaultFixedValuedCost = 10000.0; + +// tfl.add +template <> +class TFLiteCostEstimator { + public: + static double GetCost(mlir::Operation* op) { + int64_t count; + if (ArithmeticCountUtilHelper::GetArithmeticCountForBroadcastableOp(op, + &count)) + return kCPUArithmeticUnitCost * count; + return kCPUDefaultFixedValuedCost; + } + + static bool IsSupported(mlir::Operation* op) { return true; } +}; + +// tfl.concatenation +template <> +class TFLiteCostEstimator { + public: + static double GetCost(mlir::Operation* op) { + int64_t count; + if (ArithmeticCountUtilHelper::GetInputTensorTotalSize(op, &count)) + return kCPUCopyUnitCost * count; + return kCPUDefaultFixedValuedCost; + } + + static bool IsSupported(mlir::Operation* op) { return true; } +}; + +// tfl.conv_2d +template <> +class TFLiteCostEstimator { + public: + static double GetCost(mlir::Operation* op) { + int64_t arithmetic_count; + if (ArithmeticCountUtilHelper::GetArithmeticCountForConvAndFullyconnectedOp( + op, &arithmetic_count)) { + return arithmetic_count * kCPUArithmeticUnitCost; + } + return kCPUDefaultFixedValuedCost; + } + + static bool IsSupported(mlir::Operation* op) { return true; } +}; + +// tfl.depthwise_conv_2d +template <> +class TFLiteCostEstimator { + public: + static double GetCost(mlir::Operation* op) { + int64_t arithmetic_count; + if (ArithmeticCountUtilHelper::GetArithmeticCountForConvAndFullyconnectedOp( + op, &arithmetic_count)) { + return arithmetic_count * kCPUArithmeticUnitCost; + } + return kCPUDefaultFixedValuedCost; + } + + static bool IsSupported(mlir::Operation* op) { return true; } +}; + +// tfl.fully_connected +template <> +class TFLiteCostEstimator { + public: + static double GetCost(mlir::Operation* op) { + int64_t arithmetic_count; + if (ArithmeticCountUtilHelper::GetArithmeticCountForConvAndFullyconnectedOp( + op, &arithmetic_count)) { + return arithmetic_count * kCPUArithmeticUnitCost; + } + return kCPUDefaultFixedValuedCost; + } + + static bool IsSupported(mlir::Operation* op) { return true; } +}; + +// tfl.mul +template <> +class TFLiteCostEstimator { + public: + static double GetCost(mlir::Operation* op) { + int64_t count; + if (ArithmeticCountUtilHelper::GetArithmeticCountForBroadcastableOp(op, + &count)) + return kCPUArithmeticUnitCost * count; + return kCPUDefaultFixedValuedCost; + } + + static bool IsSupported(mlir::Operation* op) { return true; } +}; + +// tfl.pack +template <> +class TFLiteCostEstimator { + public: + static double GetCost(mlir::Operation* op) { + int64_t count; + if (ArithmeticCountUtilHelper::GetInputTensorTotalSize(op, &count)) + return kCPUCopyUnitCost * count; + return kCPUDefaultFixedValuedCost; + } + + static bool IsSupported(mlir::Operation* op) { return true; } +}; + +// tfl.reshape +template <> +class TFLiteCostEstimator { + public: + static double GetCost(mlir::Operation* op) { + int64_t count; + if (ArithmeticCountUtilHelper::GetInputTensorTotalSize(op, &count)) + return kCPUCopyUnitCost * count; + return kCPUDefaultFixedValuedCost; + } + + static bool IsSupported(mlir::Operation* op) { return true; } +}; + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_ESTIMATORS_CPU_ESTIMATORS_H_ diff --git a/tensorflow/compiler/mlir/lite/experimental/estimators/gpu_estimators.h b/tensorflow/compiler/mlir/lite/experimental/estimators/gpu_estimators.h index 96b1aa3d1f3..45e8707ef44 100644 --- a/tensorflow/compiler/mlir/lite/experimental/estimators/gpu_estimators.h +++ b/tensorflow/compiler/mlir/lite/experimental/estimators/gpu_estimators.h @@ 
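The CPU estimators above scale the arithmetic counts from arithmetic_count_util.h by kCPUArithmeticUnitCost (or kCPUCopyUnitCost for copy-like ops) and fall back to kCPUDefaultFixedValuedCost when shapes are not static. A self-contained sketch that mirrors the conv/fully-connected count on made-up shapes; the ConvOrFcCount helper and the example shapes are illustrative only, not part of the patch.

// Mirrors GetArithmeticCountForConvAndFullyconnectedOp on concrete shapes.
#include <cstdint>
#include <iostream>
#include <vector>

constexpr double kCPUArithmeticUnitCost = 1.0;          // from cpu_estimators.h
constexpr double kCPUDefaultFixedValuedCost = 10000.0;  // fallback cost

int64_t ConvOrFcCount(const std::vector<int64_t>& output_shape,
                      int64_t weight_elements, int64_t bias_elements) {
  int64_t cols = 1;  // product of all output dims except the last
  for (size_t i = 0; i + 1 < output_shape.size(); ++i) cols *= output_shape[i];
  const int64_t cost_per_col = 2 * weight_elements;
  return 2 * cost_per_col * cols + bias_elements;
}

int main() {
  // Hypothetical fully-connected op: output [8, 16], 16x32 weights, 16 biases.
  const int64_t count = ConvOrFcCount({8, 16}, /*weight_elements=*/512,
                                      /*bias_elements=*/16);
  std::cout << "count=" << count                           // 2*1024*8 + 16 = 16400
            << " cost=" << count * kCPUArithmeticUnitCost  // 16400.0
            << " (dynamic shapes fall back to "
            << kCPUDefaultFixedValuedCost << ")\n";
}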
-16,6 +16,16 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_ESTIMATORS_GPU_ESTIMATORS_H_ #define TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_ESTIMATORS_GPU_ESTIMATORS_H_ +// GPU +constexpr float kGPUArithmeticUnitCost = 0.2; + +// The copy can be non-consectutive copy. This is just fake data. +constexpr float kGPUCopyUnitCost = 0.2; +constexpr float kGPUDefaultCost = 1.0f; + +// Default values. +constexpr float kGPUDefaultFixedValuedCost = 10000.0; + // tfl.abs template <> class TFLiteCostEstimator { @@ -34,9 +44,11 @@ template <> class TFLiteCostEstimator { public: static double GetCost(mlir::Operation* op) { - llvm::errs() << "No defined cost function for op: " - << op->getName().getStringRef().str(); - return 0.0; + int64_t count; + if (ArithmeticCountUtilHelper::GetArithmeticCountForBroadcastableOp(op, + &count)) + return kGPUArithmeticUnitCost * count; + return kGPUDefaultFixedValuedCost; } static bool IsSupported(mlir::Operation* op) { return true; } @@ -60,9 +72,10 @@ template <> class TFLiteCostEstimator { public: static double GetCost(mlir::Operation* op) { - llvm::errs() << "No defined cost function for op: " - << op->getName().getStringRef().str(); - return 0.0; + int64_t count; + if (ArithmeticCountUtilHelper::GetInputTensorTotalSize(op, &count)) + return kGPUCopyUnitCost * count; + return kGPUDefaultFixedValuedCost; } // TODO(renjieliu): We probably need to check for dynamic weights. @@ -74,9 +87,12 @@ template <> class TFLiteCostEstimator { public: static double GetCost(mlir::Operation* op) { - llvm::errs() << "No defined cost function for op: " - << op->getName().getStringRef().str(); - return 0.0; + int64_t arithmetic_count; + if (ArithmeticCountUtilHelper::GetArithmeticCountForConvAndFullyconnectedOp( + op, &arithmetic_count)) { + return arithmetic_count * kGPUArithmeticUnitCost; + } + return kGPUDefaultFixedValuedCost; } // TODO(renjieliu): We probably need to check for dynamic weights. @@ -101,9 +117,12 @@ template <> class TFLiteCostEstimator { public: static double GetCost(mlir::Operation* op) { - llvm::errs() << "No defined cost function for op: " - << op->getName().getStringRef().str(); - return 0.0; + int64_t arithmetic_count; + if (ArithmeticCountUtilHelper::GetArithmeticCountForConvAndFullyconnectedOp( + op, &arithmetic_count)) { + return arithmetic_count * kGPUArithmeticUnitCost; + } + return kGPUDefaultFixedValuedCost; } static bool IsSupported(mlir::Operation* op) { return true; } @@ -140,9 +159,12 @@ template <> class TFLiteCostEstimator { public: static double GetCost(mlir::Operation* op) { - llvm::errs() << "No defined cost function for op: " - << op->getName().getStringRef().str(); - return 0.0; + int64_t arithmetic_count; + if (ArithmeticCountUtilHelper::GetArithmeticCountForConvAndFullyconnectedOp( + op, &arithmetic_count)) { + return arithmetic_count * kGPUArithmeticUnitCost; + } + return kGPUDefaultFixedValuedCost; } // TODO(renjieliu): we need to check for dynamic weights. 
@@ -227,6 +249,33 @@ class TFLiteCostEstimator { static bool IsSupported(mlir::Operation* op) { return true; } }; +// tfl.custom +template <> +class TFLiteCostEstimator { + public: + static double GetCost(mlir::Operation* op) { + llvm::errs() << "No defined cost function for op: " + << op->getName().getStringRef().str(); + return 0.0; + } + + static bool IsSupported(mlir::Operation* op) { return true; } +}; + +// tfl.mean +template <> +class TFLiteCostEstimator { + public: + static double GetCost(mlir::Operation* op) { + llvm::errs() << "No defined cost function for op: " + << op->getName().getStringRef().str(); + return 0.0; + } + + // TODO(renjieiu): check for constraints. + static bool IsSupported(mlir::Operation* op) { return true; } +}; + // tfl.minimum template <> class TFLiteCostEstimator { @@ -245,9 +294,11 @@ template <> class TFLiteCostEstimator { public: static double GetCost(mlir::Operation* op) { - llvm::errs() << "No defined cost function for op: " - << op->getName().getStringRef().str(); - return 0.0; + int64_t count; + if (ArithmeticCountUtilHelper::GetArithmeticCountForBroadcastableOp(op, + &count)) + return kGPUArithmeticUnitCost * count; + return kGPUDefaultFixedValuedCost; } static bool IsSupported(mlir::Operation* op) { return true; } @@ -323,9 +374,10 @@ template <> class TFLiteCostEstimator { public: static double GetCost(mlir::Operation* op) { - llvm::errs() << "No defined cost function for op: " - << op->getName().getStringRef().str(); - return 0.0; + int64_t count; + if (ArithmeticCountUtilHelper::GetInputTensorTotalSize(op, &count)) + return kGPUCopyUnitCost * count; + return kGPUDefaultFixedValuedCost; } static bool IsSupported(mlir::Operation* op) { return true; } @@ -383,6 +435,19 @@ class TFLiteCostEstimator { static bool IsSupported(mlir::Operation* op) { return true; } }; +// tfl.space_to_depth +template <> +class TFLiteCostEstimator { + public: + static double GetCost(mlir::Operation* op) { + llvm::errs() << "No defined cost function for op: " + << op->getName().getStringRef().str(); + return 0.0; + } + + static bool IsSupported(mlir::Operation* op) { return true; } +}; + // tfl.sqrt template <> class TFLiteCostEstimator { @@ -435,6 +500,19 @@ class TFLiteCostEstimator { static bool IsSupported(mlir::Operation* op) { return true; } }; +// tfl.tanh +template <> +class TFLiteCostEstimator { + public: + static double GetCost(mlir::Operation* op) { + llvm::errs() << "No defined cost function for op: " + << op->getName().getStringRef().str(); + return 0.0; + } + + static bool IsSupported(mlir::Operation* op) { return true; } +}; + // tfl.transpose template <> class TFLiteCostEstimator { @@ -448,5 +526,18 @@ class TFLiteCostEstimator { static bool IsSupported(mlir::Operation* op) { return true; } }; +// tfl.transpose_conv +template <> +class TFLiteCostEstimator { + public: + static double GetCost(mlir::Operation* op) { + llvm::errs() << "No defined cost function for op: " + << op->getName().getStringRef().str(); + return 0.0; + } + + static bool IsSupported(mlir::Operation* op) { return true; } +}; + #endif // TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_ESTIMATORS_GPU_ESTIMATORS_H_ diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_export.cc b/tensorflow/compiler/mlir/lite/flatbuffer_export.cc index f9739bfa626..df84b028f63 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_export.cc +++ b/tensorflow/compiler/mlir/lite/flatbuffer_export.cc @@ -191,7 +191,8 @@ static StatusOr GetTFLiteType(Type type, static bool IsConst(Operation* op) { return isa(op) || 
isa(op) || - isa(op) || isa(op); + isa(op) || isa(op) || + isa(op) || isa(op); } template @@ -403,17 +404,8 @@ class Translator { BufferOffset BuildNumericVerifyOperator( mlir::TFL::NumericVerifyOp op, const std::vector& operands, const std::vector& results); - Optional> - BuildConvolution2DTransposeBiasOperator( - Operation* inst, mlir::TFL::Convolution2DTransposeBiasOp op, - const std::vector& operands, - const std::vector& results); - Optional> BuildMaxPoolingWithArgMax2DOperator( - Operation* inst, mlir::TFL::MaxPoolingWithArgMax2DOp op, - const std::vector& operands, - const std::vector& results); - Optional> BuildMaxUnpooling2DOperator( - Operation* inst, mlir::TFL::MaxUnpooling2DOp op, + BufferOffset BuildCustomOperator( + Operation* inst, mlir::TFL::CustomOp op, const std::vector& operands, const std::vector& results); @@ -435,7 +427,7 @@ class Translator { // Builds operator for the given operation with specified operand and result // tensor indices. Emits an error and returns llvm::None on failure. Optional> BuildOperator( - Operation* inst, const std::vector& operands, + Operation* inst, std::vector operands, const std::vector& results, const std::vector& intermediates); @@ -464,6 +456,9 @@ class Translator { // Returns a unique name for `val`. std::string UniqueName(mlir::Value val); + BufferOffset BuildSparsityParameters( + const mlir::TFL::SparsityParameterAttr& s_attr); + ModuleOp module_; tensorflow::OpOrArgNameMapper& name_mapper_; @@ -510,9 +505,9 @@ Optional> Translator::BuildBuffer( } else if (auto cst = dyn_cast(inst)) { attr = cst.value(); } else if (auto cst = dyn_cast(inst)) { - attr = cst.value(); + attr = cst.compressed_data(); } else if (auto cst = dyn_cast(inst)) { - attr = cst.value(); + attr = cst.compressed_data(); } else { return empty_buffer_; } @@ -599,23 +594,22 @@ Optional> Translator::BuildTensor( std::vector shape; std::vector shape_signature; + auto* inst = value.getDefiningOp(); if (type.hasStaticShape()) { llvm::ArrayRef shape_ref = type.getShape(); if (mlir::failed(check_shape(shape_ref))) return llvm::None; shape = std::vector(shape_ref.begin(), shape_ref.end()); - } else if (auto* inst = value.getDefiningOp()) { - if (IsConst(inst)) { - // Const op can have a result of dynamic shaped type (e.g. due to constant - // folding), but we can still derive the shape of a constant tensor for - // its attribute type. - mlir::Attribute tensor_attr = inst->getAttr("value"); - llvm::ArrayRef shape_ref = - tensor_attr.getType().cast().getShape(); - if (mlir::failed(check_shape(shape_ref))) return llvm::None; + } else if (inst && IsConst(inst)) { + // Const op can have a result of dynamic shaped type (e.g. due to constant + // folding), but we can still derive the shape of a constant tensor for + // its attribute type. 
+ mlir::Attribute tensor_attr = inst->getAttr("value"); + llvm::ArrayRef shape_ref = + tensor_attr.getType().cast().getShape(); + if (mlir::failed(check_shape(shape_ref))) return llvm::None; - shape = std::vector(shape_ref.begin(), shape_ref.end()); - } + shape = std::vector(shape_ref.begin(), shape_ref.end()); } else if (type.hasRank()) { llvm::ArrayRef shape_ref = type.getShape(); if (mlir::failed(check_shape(shape_ref))) return llvm::None; @@ -627,11 +621,12 @@ Optional> Translator::BuildTensor( shape_signature = std::vector(shape_ref.begin(), shape_ref.end()); } + BufferOffset s_params = 0; if (auto* inst = value.getDefiningOp()) { if (auto cst = dyn_cast(inst)) { - // CreateSparsityParameters(cst.s_param()); + s_params = BuildSparsityParameters(cst.s_param()); } else if (auto cst = dyn_cast(inst)) { - // CreateSparsityParameters(cst.s_param()); + s_params = BuildSparsityParameters(cst.s_param()); } } @@ -676,12 +671,12 @@ Optional> Translator::BuildTensor( return tflite::CreateTensor( builder_, builder_.CreateVector(shape), tflite_element_type, (is_variable ? 0 : buffer_idx), builder_.CreateString(name), q_params, - /*is_variable=*/is_variable); + /*is_variable=*/is_variable, s_params); } else { return tflite::CreateTensor( builder_, builder_.CreateVector(shape), tflite_element_type, (is_variable ? 0 : buffer_idx), builder_.CreateString(name), q_params, - /*is_variable=*/is_variable, /*sparsity=*/0, + /*is_variable=*/is_variable, s_params, /*shape_signature=*/builder_.CreateVector(shape_signature)); } } @@ -768,48 +763,21 @@ BufferOffset Translator::BuildNumericVerifyOperator( return BuildCustomOperator(tolerance, "NumericVerify", op, operands, results); } -Optional> -Translator::BuildConvolution2DTransposeBiasOperator( - Operation* inst, mlir::TFL::Convolution2DTransposeBiasOp op, +BufferOffset Translator::BuildCustomOperator( + Operation* inst, mlir::TFL::CustomOp op, const std::vector& operands, const std::vector& results) { - TfLiteTransposeConvParams conv_params; - conv_params.stride_height = op.stride_h().getSExtValue(); - conv_params.stride_width = op.stride_w().getSExtValue(); - const auto padding = GetTflitePadding(inst, op.padding()); - if (padding) { - conv_params.padding = *padding; - return BuildCustomOperator(conv_params, "Convolution2DTransposeBias", op, - operands, results); - } - - return llvm::None; -} - -Optional> -Translator::BuildMaxPoolingWithArgMax2DOperator( - Operation* inst, mlir::TFL::MaxPoolingWithArgMax2DOp op, - const std::vector& operands, const std::vector& results) { - const auto pool_params = GetTflitePoolParams(inst, op); - if (pool_params) { - return BuildCustomOperator(*pool_params, "MaxPoolingWithArgmax2D", op, - operands, results); - } - - return llvm::None; -} - -Optional> -Translator::BuildMaxUnpooling2DOperator(Operation* inst, - mlir::TFL::MaxUnpooling2DOp op, - const std::vector& operands, - const std::vector& results) { - const auto pool_params = GetTflitePoolParams(inst, op); - if (pool_params) { - return BuildCustomOperator(*pool_params, "MaxUnpooling2D", op, operands, - results); - } - - return llvm::None; + const std::string attrs = + op.custom_option().cast().getValue().str(); + std::vector custom_option_vector(attrs.size()); + memcpy(custom_option_vector.data(), attrs.data(), attrs.size()); + auto opcode_index = + GetOpcodeIndex(op.custom_code().str(), tflite::BuiltinOperator_CUSTOM); + return tflite::CreateOperator( + builder_, opcode_index, builder_.CreateVector(operands), + builder_.CreateVector(results), 
tflite::BuiltinOptions_NONE, + /*builtin_options=*/0, + builder_.CreateVector(custom_option_vector), + tflite::CustomOptionsFormat_FLEXBUFFERS); } Optional Translator::CreateFlexOpCustomOptions( @@ -831,11 +799,6 @@ Optional Translator::CreateFlexOpCustomOptions( Optional Translator::CreateCustomOpCustomOptions( const ::tensorflow::NodeDef& node_def, const mlir::Location& loc) { - std::string node_def_str; - if (!node_def.SerializeToString(&node_def_str)) { - return emitError(loc, "failed to serialize tensorflow node_def"), - llvm::None; - } auto flex_builder = CreateFlexBuilderWithNodeAttrs(node_def, loc); return builder_.CreateVector(flex_builder->GetBuffer()); } @@ -845,9 +808,13 @@ Translator::CreateFlexBuilderWithNodeAttrs( const ::tensorflow::NodeDef& node_def, const mlir::Location& loc) { auto flex_builder = absl::make_unique(); size_t map_start = flex_builder->StartMap(); - for (const auto& pair : node_def.attr()) { + using Item = std::pair; + std::vector attrs(node_def.attr().begin(), node_def.attr().end()); + std::sort(attrs.begin(), attrs.end(), + [](Item& p1, Item& p2) -> bool { return p1.first < p2.first; }); + for (const Item& pair : attrs) { const char* key = pair.first.c_str(); - const auto& attr = pair.second; + const ::tensorflow::AttrValue& attr = pair.second; switch (attr.value_case()) { case ::tensorflow::AttrValue::kS: flex_builder->String(key, attr.s()); @@ -928,7 +895,7 @@ uint32_t Translator::GetOpcodeIndex(const std::string& op_name, } Optional> Translator::BuildOperator( - Operation* inst, const std::vector& operands, + Operation* inst, std::vector operands, const std::vector& results, const std::vector& intermediates) { const auto* dialect = inst->getDialect(); @@ -952,19 +919,8 @@ Optional> Translator::BuildOperator( if (auto verify_op = dyn_cast(inst)) { return BuildNumericVerifyOperator(verify_op, operands, results); } - if (auto conv_transpose_bias_op = - dyn_cast(inst)) { - return BuildConvolution2DTransposeBiasOperator( - inst, conv_transpose_bias_op, operands, results); - } - if (auto max_pooling_with_arg_max_op = - dyn_cast(inst)) { - return BuildMaxPoolingWithArgMax2DOperator( - inst, max_pooling_with_arg_max_op, operands, results); - } - if (auto max_unpooling_op = dyn_cast(inst)) { - return BuildMaxUnpooling2DOperator(inst, max_unpooling_op, operands, - results); + if (auto custom_op = dyn_cast(inst)) { + return BuildCustomOperator(inst, custom_op, operands, results); } if (auto whileOp = dyn_cast(inst)) { if (inst->getNumOperands() != inst->getNumResults()) { @@ -982,6 +938,15 @@ Optional> Translator::BuildOperator( std::string op_name = inst->getName().getStringRef().str(); uint32_t opcode_index = GetOpcodeIndex(op_name, *builtin_code); + + // If this is TransposeConv we need to do a special case of ignoring the + // optional tensor, to allow newly created models to run on old runtimes. + if (*builtin_code == tflite::BuiltinOperator_TRANSPOSE_CONV) { + if (operands.size() == 4 && operands.at(3) == -1) { + operands.pop_back(); + } + } + auto offset = CreateFlatBufferOperator(inst, opcode_index, operands, results, intermediates, &builder_); if (!offset) { @@ -1051,10 +1016,10 @@ Optional> Translator::BuildOperator( inst->getName().print(os); // Print out attributes except for large elementsattributes (which should // rarely be the cause why the legalization didn't happen). 
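CreateFlexBuilderWithNodeAttrs above now copies the NodeDef attributes into a vector and sorts them by key before emitting the FlexBuffer map, so the serialized custom options no longer depend on protobuf map iteration order. The same copy-then-sort idiom in isolation, on toy data rather than a real NodeDef.

// Copy-then-sort for a deterministic serialization order.
#include <algorithm>
#include <iostream>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

int main() {
  std::unordered_map<std::string, int> attrs = {{"T", 1}, {"axis", 0}, {"N", 4}};
  std::vector<std::pair<std::string, int>> sorted(attrs.begin(), attrs.end());
  std::sort(sorted.begin(), sorted.end(),
            [](const auto& a, const auto& b) { return a.first < b.first; });
  // Always prints N, T, axis in the same order, regardless of hash order.
  for (const auto& kv : sorted) std::cout << kv.first << "=" << kv.second << "\n";
}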
- if (!inst->getAttrList().getAttrs().empty()) { + if (!inst->getMutableAttrDict().getAttrs().empty()) { os << " {"; bool first = true; - for (auto& named_attr : inst->getAttrList().getDictionary()) { + for (auto& named_attr : inst->getAttrDictionary()) { os << (!first ? ", " : ""); first = false; named_attr.first.print(os); @@ -1422,6 +1387,60 @@ Optional Translator::TranslateInternal() { builder_.GetSize()); } +BufferOffset Translator::BuildSparsityParameters( + const mlir::TFL::SparsityParameterAttr& s_attr) { + const int dim_size = s_attr.dim_metadata().size(); + std::vector> fb_dim_metadata( + dim_size); + for (int i = 0; i < dim_size; i++) { + const auto dim_metadata = + s_attr.dim_metadata()[i].dyn_cast(); + if (dim_metadata.format().getValue() == "DENSE") { + fb_dim_metadata[i] = + tflite::CreateDimensionMetadata(builder_, tflite::DimensionType_DENSE, + dim_metadata.dense_size().getInt()); + + } else { + auto segments = dim_metadata.segments(); + std::vector vector_segments(segments.size(), 0); + for (int j = 0; j < segments.size(); j++) { + vector_segments[j] = segments[j].dyn_cast().getInt(); + } + auto array_segments = + tflite::CreateInt32Vector(builder_, + builder_.CreateVector(vector_segments)) + .Union(); + auto indices = dim_metadata.indices(); + std::vector vector_indices(indices.size(), 0); + for (int j = 0; j < indices.size(); j++) { + vector_indices[j] = indices[j].dyn_cast().getInt(); + } + auto array_indices = tflite::CreateInt32Vector( + builder_, builder_.CreateVector(vector_indices)) + .Union(); + fb_dim_metadata[i] = tflite::CreateDimensionMetadata( + builder_, tflite::DimensionType_SPARSE_CSR, 0, + tflite::SparseIndexVector_Int32Vector, array_segments, + tflite::SparseIndexVector_Int32Vector, array_indices); + } + } + + std::vector traversal_order(dim_size); + for (int i = 0; i < dim_size; i++) { + traversal_order[i] = + s_attr.traversal_order()[i].dyn_cast().getInt(); + } + const int block_map_size = s_attr.block_map().size(); + std::vector block_map(block_map_size); + for (int i = 0; i < block_map_size; i++) { + block_map[i] = s_attr.block_map()[i].dyn_cast().getInt(); + } + + return tflite::CreateSparsityParameters( + builder_, builder_.CreateVector(traversal_order), + builder_.CreateVector(block_map), builder_.CreateVector(fb_dim_metadata)); +} + } // namespace // Translates the given MLIR module in the TFLite dialect to TFLite FlatBuffer diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_import.cc b/tensorflow/compiler/mlir/lite/flatbuffer_import.cc index f41baca36df..59b0b07a2ed 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_import.cc +++ b/tensorflow/compiler/mlir/lite/flatbuffer_import.cc @@ -59,13 +59,11 @@ limitations under the License. 
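To make the SPARSE_CSR branch of BuildSparsityParameters above concrete: for a sparse dimension the exporter stores two Int32Vectors, `array_segments` and `array_indices`, which play the role of the usual CSR row-pointer and column-index arrays. A standalone sketch that derives that pair for a dense row-major matrix (illustrative only; in the exporter the values come straight from the SparsityParameterAttr, and `BuildCsr` is a hypothetical helper):

#include <cstdint>
#include <vector>

struct Csr {
  std::vector<int32_t> segments;  // segments[r+1] - segments[r] = nonzeros in row r
  std::vector<int32_t> indices;   // column index of each nonzero, row by row
};

// Derive CSR-style metadata for a dense row-major matrix, the same
// segment/index layout the exporter stores for a SPARSE_CSR dimension.
Csr BuildCsr(const std::vector<float>& dense, int rows, int cols) {
  Csr csr;
  csr.segments.push_back(0);
  for (int r = 0; r < rows; ++r) {
    for (int c = 0; c < cols; ++c) {
      if (dense[r * cols + c] != 0.0f) csr.indices.push_back(c);
    }
    csr.segments.push_back(static_cast<int32_t>(csr.indices.size()));
  }
  return csr;
}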
#include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/IR/Types.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project -#include "mlir/Support/Functional.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Translation.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/flatbuffer_operator.h" #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" #include "tensorflow/compiler/mlir/lite/utils/convert_type.h" -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" #include "tensorflow/compiler/mlir/tensorflow/utils/mangling_util.h" #include "tensorflow/compiler/xla/statusor.h" @@ -185,6 +183,12 @@ StatusOr GetTensorType(const TensorT& tensor, Builder builder, return RankedTensorType::get({}, elem_type); } + if (!tensor.shape_signature.empty()) { + llvm::SmallVector shape(tensor.shape_signature.begin(), + tensor.shape_signature.end()); + return RankedTensorType::get(shape, elem_type); + } + if (!tensor.shape.empty()) { llvm::SmallVector shape(tensor.shape.begin(), tensor.shape.end()); @@ -242,23 +246,8 @@ mlir::Operation* ConvertMinMaxToStatsOp(const TensorT& tensor, OpBuilder b, } StatusOr OpNameForOpCode(const tflite::OperatorCodeT opcode) { - // TODO(b/143872630): Support custom ops if (opcode.builtin_code == tflite::BuiltinOperator_CUSTOM) { - // Adding some custom op supported on GPU. - const absl::string_view custom_name = opcode.custom_code; - if (custom_name == "MaxPoolingWithArgmax2D") { - return std::string("tfl.max_pooling_with_argmax_2d"); - } - if (custom_name == "Convolution2DTransposeBias") { - return std::string("tfl.convolution_2d_transpose_bias"); - } - if (custom_name == "MaxUnpooling2D") { - return std::string("tfl.max_unpooling_2d"); - } - // Use an unsupported op name instead of throwing an error here in case the - // op is pruned during the import. - return std::string( - llvm::Twine("tfl.UNSUPPORTED_custom_", opcode.custom_code).str()); + return std::string("tfl.custom"); } if (opcode.builtin_code == tflite::BuiltinOperator_IF) { return std::string("tf.If"); @@ -453,6 +442,15 @@ StatusOr BuildConstOp(const tflite::TensorT& tensor, elem_type.isa()) { TF_ASSIGN_OR_RETURN(value, ConvertIntBuffer(shaped_type, elem_type, buffer)); + } else if (elem_type.isa()) { + tensorflow::TensorProto repr = ConvertTfliteConstTensor(tensor, buffer); + std::vector refs; + refs.reserve(repr.string_val_size()); + + for (const auto& ref : repr.string_val()) + refs.push_back({ref.data(), ref.size()}); + + value = mlir::DenseStringElementsAttr::get(shaped_type, refs); } else if (elem_type.isa() || elem_type.isa()) { auto dialect = elem_type.getContext()->getRegisteredDialect("tf"); @@ -510,18 +508,13 @@ bool IsBasicLSTMOp(tflite::BuiltinOptionsUnion op_union) { } } -// Returns true if this is a custom op. 
-bool IsCustomOp(const std::string& op_name) { - return op_name == "tfl.max_pooling_with_argmax_2d" || - op_name == "tfl.max_unpooling_2d" || - op_name == "tfl.convolution_2d_transpose_bias"; -} - // TODO(krzysd) Handle function calls StatusOr ConvertOp( const tflite::OperatorT& op, const std::vector& vals_map, const std::vector& intermediate_types, - Value optional_arg_marker, const std::vector& op_names, + Value optional_arg_marker, + const std::vector>& op_codes, + const std::vector& op_names, const std::vector& func_names, const std::vector>& tensors, Location loc, OpBuilder builder) { @@ -534,6 +527,7 @@ StatusOr ConvertOp( } const bool is_basic_lstm = IsBasicLSTMOp(op.builtin_options); + const tflite::OperatorCodeT op_code = *op_codes.at(op.opcode_index); const std::string& op_name = is_basic_lstm ? "tfl.basic_lstm" : op_names.at(op.opcode_index); OperationState op_state(loc, op_name); @@ -625,9 +619,9 @@ StatusOr ConvertOp( } llvm::SmallVector attrs; - if (IsCustomOp(op_name)) { - auto status = mlir::CustomOptionsToAttributes(op_name, op.custom_options, - builder, loc, &attrs); + if (op_code.builtin_code == tflite::BuiltinOperator_CUSTOM) { + auto status = mlir::CustomOptionsToAttributes( + op_code.custom_code, op.custom_options, builder, loc, &attrs); if (!status.ok()) { return emitError(loc, status.ToString()), status; } @@ -676,8 +670,8 @@ template mlir::NamedAttribute BuildTFEntryFunctionAttribute( const tflite::SubGraphT& subgraph, Builder* builder, const std::string name, const ContainerType indices) { - llvm::SmallVector tensor_names = mlir::functional::map( - [&](int i) { return subgraph.tensors.at(i)->name; }, indices); + auto tensor_names = llvm::map_range( + indices, [&](int i) { return subgraph.tensors.at(i)->name; }); return builder->getNamedAttr( name, builder->getStringAttr(llvm::join(tensor_names, ","))); } @@ -739,6 +733,7 @@ StatusOr> PruneSubgraph( // return nodes in ordered_output_arrays in the same order. StatusOr ConvertSubgraph( const tflite::SubGraphT& subgraph, llvm::StringRef name, + const std::vector>& op_codes, const std::vector& op_names, const std::vector& func_names, const std::vector>& buffers, @@ -929,7 +924,8 @@ StatusOr ConvertSubgraph( TF_ASSIGN_OR_RETURN( auto* mlir_op, ConvertOp(*op, vals_map, intermediate_types, maybe_optional_arg_marker, - op_names, func_names, subgraph.tensors, op_loc, op_builder)); + op_codes, op_names, func_names, subgraph.tensors, op_loc, + op_builder)); // Add the results to the value maps. There are two cases: 1. the result // tensor does not have min/max values, the original op result is used @@ -1036,8 +1032,8 @@ OwningModuleRef tflite::FlatBufferToMlir( auto& subgraph = e.value(); std::string name = SubgraphName(e.index(), *subgraph); auto func_or_error = ConvertSubgraph( - *subgraph, name, operator_names, func_names, model->buffers, base_loc, - builder, + *subgraph, name, model->operator_codes, operator_names, func_names, + model->buffers, base_loc, builder, // TODO(b/131175224,b/132239787) Support multiple entry points /*is_entry_point=*/e.index() == 0, /*use_external_constant=*/use_external_constant, ordered_input_arrays, diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_operator.cc b/tensorflow/compiler/mlir/lite/flatbuffer_operator.cc index 9734608b19b..ceaa4e215cf 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_operator.cc +++ b/tensorflow/compiler/mlir/lite/flatbuffer_operator.cc @@ -25,7 +25,7 @@ limitations under the License. 
#include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/core/platform/errors.h" #include "tensorflow/lite/kernels/internal/kernel_utils.h" @@ -243,42 +243,22 @@ static mlir::Attribute BuildTFL_PaddingAttr(tflite::Padding value, } Status mlir::CustomOptionsToAttributes( - const std::string& op_name, const std::vector& custom_options, + const std::string& custom_code, const std::vector& custom_options, mlir::Builder builder, mlir::Location loc, llvm::SmallVectorImpl* attributes) { - if (op_name == "tfl.max_pooling_with_argmax_2d" || - op_name == "tfl.max_unpooling_2d") { - auto* pool_params = - reinterpret_cast(custom_options.data()); - TF_ASSIGN_OR_RETURN(auto padding_attribute, - GetPaddingAttr(pool_params->padding, builder, loc)); - attributes->emplace_back( - builder.getNamedAttr("padding", padding_attribute)); - attributes->emplace_back(builder.getNamedAttr( - "stride_h", builder.getI32IntegerAttr(pool_params->stride_height))); - attributes->emplace_back(builder.getNamedAttr( - "stride_w", builder.getI32IntegerAttr(pool_params->stride_width))); - attributes->emplace_back(builder.getNamedAttr( - "filter_h", builder.getI32IntegerAttr(pool_params->filter_height))); - attributes->emplace_back(builder.getNamedAttr( - "filter_w", builder.getI32IntegerAttr(pool_params->filter_width))); - return Status::OK(); + attributes->emplace_back( + builder.getNamedAttr("custom_code", builder.getStringAttr(custom_code))); + std::string content; + content.assign(reinterpret_cast(custom_options.data()), + custom_options.size()); + ShapedType type = RankedTensorType::get( + {static_cast(custom_options.size())}, builder.getIntegerType(8)); + attributes->emplace_back(builder.getNamedAttr( + "custom_option", + OpaqueElementsAttr::get(builder.getContext()->getRegisteredDialect("tfl"), + type, content))); - } else if (op_name == "tfl.convolution_2d_transpose_bias") { - auto* conv_params = reinterpret_cast( - custom_options.data()); - TF_ASSIGN_OR_RETURN(auto padding_attribute, - GetPaddingAttr(conv_params->padding, builder, loc)); - attributes->emplace_back( - builder.getNamedAttr("padding", padding_attribute)); - attributes->emplace_back(builder.getNamedAttr( - "stride_h", builder.getI32IntegerAttr(conv_params->stride_height))); - attributes->emplace_back(builder.getNamedAttr( - "stride_w", builder.getI32IntegerAttr(conv_params->stride_width))); - return Status::OK(); - } - - return InvalidArgument(absl::StrCat("invalid custom op type: ", op_name)); + return Status::OK(); } // Pull in FlatBuffer writers for TFLite generated using TableGen diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_operator.h b/tensorflow/compiler/mlir/lite/flatbuffer_operator.h index 2c3aa10408b..2057d52856b 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_operator.h +++ b/tensorflow/compiler/mlir/lite/flatbuffer_operator.h @@ -61,11 +61,12 @@ void BuiltinOptionsToAttributes( // operands from tflite op name. llvm::MinMax OperandNumbersMinMax(llvm::StringRef op_name); -// Populates the array of mlir::NamedAttributes corresponding to the given -// custom_options. -// We use an out parameter per LLVM convention +// Populates the `custom_code` and `custom_options` to attributes. +// `custom_code` is used to identify CustomOp. 
+// `custom_options` are opaque attribute used to store infomations for this +// custom op. tensorflow::Status CustomOptionsToAttributes( - const std::string &op_name, const std::vector &custom_options, + const std::string &custom_code, const std::vector &custom_options, mlir::Builder builder, // NOLINTNEXTLINE Location loc, llvm::SmallVectorImpl *attributes); diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_op_interfaces.td b/tensorflow/compiler/mlir/lite/ir/tfl_op_interfaces.td index ccad3cbb79e..23101113a6f 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_op_interfaces.td +++ b/tensorflow/compiler/mlir/lite/ir/tfl_op_interfaces.td @@ -69,6 +69,14 @@ def TFL_SparseOp : OpInterface<"SparseOpInterface"> { [{Returns the indices of sparse operands.}], "std::vector", "GetSparseOperands", (ins) >, + InterfaceMethod< + [{Returns the supported block size of float sparse operands.}], + "std::vector>", "GetFloatBlockSize", (ins) + >, + InterfaceMethod< + [{Returns the supported block size of quantized sparse operands.}], + "std::vector>", "GetQuantizedBlockSize", (ins) + >, ]; } diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc b/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc index 47a7b32d7e3..3dcfe71770b 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc @@ -657,7 +657,7 @@ LogicalResult Verify(FullyConnectedOp op) { // GatherOp //===----------------------------------------------------------------------===// -static void BuildGatherOp(Builder *builder, OperationState &result, +static void BuildGatherOp(OpBuilder *builder, OperationState &result, Value params, Value indices, IntegerAttr axis) { auto params_type = params.getType().cast(); auto indices_type = indices.getType().cast(); @@ -665,7 +665,7 @@ static void BuildGatherOp(Builder *builder, OperationState &result, // If params/indices is unranked, then output is unranked. if (!params_type.hasRank() || !indices_type.hasRank()) return TFL::GatherOp::build( - builder, result, UnrankedTensorType::get(params_type.getElementType()), + *builder, result, UnrankedTensorType::get(params_type.getElementType()), params, indices, axis); int64_t params_rank = params_type.getRank(); @@ -710,11 +710,103 @@ static void BuildGatherOp(Builder *builder, OperationState &result, } TFL::GatherOp::build( - builder, result, + *builder, result, RankedTensorType::get(shape, params_type.getElementType()), params, indices, axis); } +//===----------------------------------------------------------------------===// +// ScatterNdOp +//===----------------------------------------------------------------------===// + +static LogicalResult Verify(ScatterNdOp op) { + auto indices = op.indices(); + auto updates = op.updates(); + auto shape = op.shape(); + auto output = op.output(); + + auto updates_type = updates.getType().cast(); + auto indices_type = indices.getType().cast(); + + if (!indices_type.hasStaticShape() || !updates_type.hasStaticShape()) { + return success(); + } + + // Checks if the shape of `updates` is a tensor of shape + // `indices.shape[:-1] + shape[indices.shape[-1]:]`, as described in + // ScatterNd op description. + + auto outer_dims = indices_type.getRank() - 1; + auto outermost_dim = indices_type.getDimSize(outer_dims); + // Checks whether the first `outer_dims` dimensions of `indices` and + // `updates` are equal. 
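The verifier that continues below enforces the ScatterNd shape contract quoted in the comment above, namely updates.shape == indices.shape[:-1] + shape[indices.shape[-1]:]. As a compact standalone restatement of the whole rule (plain C++ over dimension vectors; `ScatterNdShapesOk` is an illustrative name, and dynamic dimensions are ignored for brevity):

#include <cstdint>
#include <vector>

// Returns true iff updates.shape == indices.shape[:-1] + shape[indices.shape[-1]:],
// the relationship the tfl.scatter_nd verifier checks piece by piece.
bool ScatterNdShapesOk(const std::vector<int64_t>& indices_shape,
                       const std::vector<int64_t>& updates_shape,
                       const std::vector<int64_t>& shape) {
  if (indices_shape.empty()) return false;
  const int64_t outer_dims = static_cast<int64_t>(indices_shape.size()) - 1;
  const int64_t index_depth = indices_shape.back();
  if (index_depth < 0 || index_depth > static_cast<int64_t>(shape.size()))
    return false;

  std::vector<int64_t> expected(indices_shape.begin(),
                                indices_shape.begin() + outer_dims);
  expected.insert(expected.end(), shape.begin() + index_depth, shape.end());
  return expected == updates_shape;
}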
+ for (auto i = 0; i < outer_dims; i++) { + if (indices_type.getDimSize(i) != updates_type.getDimSize(i)) { + return op.emitOpError() + << "indices.Dims(" << i << ") == " << indices_type.getDimSize(i) + << ", but updates.Dims(" << i + << ") == " << updates_type.getDimSize(i); + } + } + + auto output_type = output.getType().cast(); + auto shape_type = shape.getType().cast(); + if (shape_type.hasStaticShape()) { + // Check the rank of `shape`. + auto output_rank = outermost_dim + updates_type.getRank() - outer_dims; + if (shape_type.getDimSize(0) != output_rank) { + return op.emitOpError() + << "shape must be a vector of length " << output_rank; + } + if (output_type.hasRank()) { + if (output_type.getRank() != output_rank) { + return op.emitOpError() + << "output must have the same rank with the length of shape = " + << output_rank; + } + } + } + + DenseIntElementsAttr shape_value; + if (matchPattern(shape, m_Constant(&shape_value))) { + for (const auto shape_elem : shape_value) { + if (shape_elem.getSExtValue() <= 0) { + return op.emitOpError("all elements of shape must be > 0"); + } + } + + // Checks whether the last `(shape_type.getDimSize(0) - outermost_dim)` + // dimensions of `updates` and `shape` are equal. + for (auto shape_it : llvm::enumerate(shape_value)) { + auto i = shape_it.index(); + auto value = shape_it.value().getSExtValue(); + if (i >= outermost_dim) { + auto corresponding_dim = i - outermost_dim + outer_dims; + if (value != updates_type.getDimSize(corresponding_dim)) { + return op.emitOpError() + << "updates.Dims(" << i + << ") == " << updates_type.getDimSize(corresponding_dim) + << ", but shape[" << i << "] == " << value; + } + } + } + + // Checks if the output has the shape specified by `shape`. + if (output_type.hasStaticShape()) { + for (auto shape_it : llvm::enumerate(shape_value)) { + int i = shape_it.index(); + auto value = shape_it.value().getSExtValue(); + if (output_type.getDimSize(i) != value) { + return op.emitOpError() + << "output shape [" << output_type.getShape() + << "] must be equal to the value of shape " << shape_value; + } + } + } + } + return success(); +} + //===----------------------------------------------------------------------===// // MulOp //===----------------------------------------------------------------------===// @@ -1014,6 +1106,75 @@ static LogicalResult Verify(SliceOp op) { return success(); } +TFL::ConstOp NarrowDownInt64InputValuesForOp(Operation *input_op, + RankedTensorType value_type, + Location loc, OpBuilder *builder) { + if (input_op == nullptr) return nullptr; + + mlir::DenseIntElementsAttr attr; + if (!matchPattern(input_op, m_Constant(&attr))) { + return nullptr; + } + + auto value_shape_type = mlir::RankedTensorType::get( + value_type.getShape(), builder->getIntegerType(32)); + + SmallVector value_i32; + value_i32.reserve(value_type.getRank()); + for (const auto &size : attr) { + value_i32.push_back(static_cast(size.getSExtValue())); + } + auto new_value_i32_attr = + mlir::DenseIntElementsAttr::get(value_shape_type, value_i32); + + return builder->create(loc, new_value_i32_attr); +} + +// This will cast donw int64 values for TFL slice op. +// This will require the begin & size are constants. 
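The comment above introduces the SliceOp canonicalization defined next: when `begin` and `size` are i64 constants they are rebuilt as i32 constants. The narrowing itself is a per-element static_cast, as sketched below (standalone C++; like the pattern, this assumes the constant values fit in 32 bits):

#include <cstdint>
#include <vector>

// Narrow constant int64 begin/size values to int32, as the canonicalization
// pattern does before rebuilding the TFL const operand.
std::vector<int32_t> NarrowToInt32(const std::vector<int64_t>& values) {
  std::vector<int32_t> out;
  out.reserve(values.size());
  for (int64_t v : values) out.push_back(static_cast<int32_t>(v));
  return out;
}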
+struct CastDonwInt64BeginEndToInt32 : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(TFL::SliceOp slice_op, + PatternRewriter &rewriter) const override { + auto begin = slice_op.begin(); + auto size = slice_op.size(); + auto begin_type = begin.getType().dyn_cast_or_null(); + auto size_type = size.getType().dyn_cast_or_null(); + auto begin_op = begin.getDefiningOp(); + auto size_op = size.getDefiningOp(); + + if (begin_op == nullptr && size_op == nullptr) return failure(); + + if (begin_type == nullptr && size_type == nullptr) return failure(); + + // Handle begin. + if (begin_op && begin_type && begin_type.getElementType().isInteger(64)) { + auto new_begin = NarrowDownInt64InputValuesForOp( + begin_op, begin_type, slice_op.getLoc(), &rewriter); + if (new_begin != nullptr) { + slice_op.setOperand(1, new_begin); + } + } + + // Handle size. + if (size_op && size_type && size_type.getElementType().isInteger(64)) { + auto new_size = NarrowDownInt64InputValuesForOp( + size_op, size_type, slice_op.getLoc(), &rewriter); + if (new_size != nullptr) { + slice_op.setOperand(2, new_size); + } + } + + return success(); + } +}; + +void SliceOp::getCanonicalizationPatterns(OwningRewritePatternList &results, + MLIRContext *context) { + results.insert(context); +} + //===----------------------------------------------------------------------===// // SubOp //===----------------------------------------------------------------------===// @@ -1030,7 +1191,7 @@ OpFoldResult SubOp::fold(ArrayRef operands) { // TopKOp //===----------------------------------------------------------------------===// -static void BuildTopKOp(Builder *builder, OperationState &result, Value input, +static void BuildTopKOp(OpBuilder *builder, OperationState &result, Value input, Value k) { // Output size is only known if k is constant value. A negative dimension is // considered dynamic so use -1 here if k is not a constant value. @@ -1045,14 +1206,14 @@ static void BuildTopKOp(Builder *builder, OperationState &result, Value input, // If value is unranked, then so is results. if (!val_type.hasRank()) return TFL::TopKV2Op::build( - builder, result, UnrankedTensorType::get(val_type.getElementType()), + *builder, result, UnrankedTensorType::get(val_type.getElementType()), UnrankedTensorType::get(builder->getIntegerType(32)), input, k); // Resultant shape is value.shape[:-1] + [k] std::vector shape(val_type.getShape()); shape[shape.size() - 1] = const_k; TFL::TopKV2Op::build( - builder, result, RankedTensorType::get(shape, val_type.getElementType()), + *builder, result, RankedTensorType::get(shape, val_type.getElementType()), RankedTensorType::get(shape, builder->getIntegerType(32)), input, k); } @@ -1861,6 +2022,18 @@ LogicalResult Verify(WhileOp op) { return success(); } +static LogicalResult Verify(CustomOp op) { + OpaqueElementsAttr opaque_attr = + op.custom_option().cast(); + if (!opaque_attr.getType().hasStaticShape()) + return op.emitOpError("custom_option should have a static shape."); + if (opaque_attr.getValue().size() != + opaque_attr.getType().cast().getDimSize(0)) + return op.emitOpError( + "custom_option should have the same length of content with shape."); + return success(); +} + namespace { // Canonicalize While op so that results and operands match and external values // are via implicit capture rather than via block args. 
@@ -1928,8 +2101,7 @@ struct WhileResultOperandsMatchAndImplicitCapture Operation *op = while_op.getOperation(); Operation *new_op = rewriter.insert( Operation::create(op->getLoc(), op->getName(), types, new_operands, - op->getAttrs(), {}, /*numRegions=*/2, - /*resizableOperandList=*/true)); + op->getAttrs(), {}, /*numRegions=*/2)); for (int i = 0; i < 2; ++i) new_op->getRegion(i).takeBody(op->getRegion(i)); int new_index = 0; diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.h b/tensorflow/compiler/mlir/lite/ir/tfl_ops.h index 42ac0af48d0..c7a1504c3b7 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.h +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.h @@ -27,8 +27,7 @@ limitations under the License. #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/Interfaces/DerivedAttributeOpInterface.h" // from @llvm-project #include "mlir/Interfaces/LoopLikeInterface.h" // from @llvm-project -#include "mlir/Interfaces/SideEffects.h" // from @llvm-project -#include "mlir/Support/Functional.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/quantization/quantization_traits.h" #include "tensorflow/lite/schema/schema_generated.h" @@ -54,6 +53,8 @@ class TensorFlowLiteDialect : public Dialect { #define GET_OP_CLASSES #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h.inc" // Include all specializes estimators below this line +#include "tensorflow/compiler/mlir/lite/experimental/estimators/arithmetic_count_util.h" +#include "tensorflow/compiler/mlir/lite/experimental/estimators/cpu_estimators.h" #include "tensorflow/compiler/mlir/lite/experimental/estimators/gpu_estimators.h" } // end namespace TFL diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td index f7955d92074..a585b8e1520 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td @@ -20,7 +20,7 @@ limitations under the License. include "mlir/IR/OpBase.td" include "mlir/Interfaces/LoopLikeInterface.td" -include "mlir/Interfaces/SideEffects.td" +include "mlir/Interfaces/SideEffectInterfaces.td" include "tensorflow/compiler/mlir/lite/ir/tfl_op_interfaces.td" include "tensorflow/compiler/mlir/lite/quantization/quantization.td" @@ -99,12 +99,22 @@ def TFL_MirrorPaddingAttr : StrEnumAttr<"Padding", "Mirror pad enum", [ // A type attribute containing the TensorType. def TensorTypeAttr : TypeAttrBase<"TensorType", "Tensor type attribute">; +// A type attribute containing OpaqueElementsAttr and bytes. +def OpaqueBytesAttr : ElementsAttrBase< + And<[ + CPred<"$_self.isa() ">, + CPred<"$_self.cast().getType()" + ".getElementType().isInteger(8)">, + ]>, + "opaque bytes attribute" + >; + //===----------------------------------------------------------------------===// // Derived shape attribute class. //===----------------------------------------------------------------------===// class DerivedShapeAttr : DerivedAttr<"ArrayRef", body>; -class DerivedTFLiteTypeAttr : - DerivedAttr<"tflite::TensorType", body>; +class DerivedTFLiteTypeAttr : + DerivedAttr<"tflite::TensorType", body, convert>; // TFL Runtime op trait predicate. 
class TFL_RuntimePredOpTrait : @@ -237,12 +247,52 @@ class TFL_TFTypesWithSameBits : Or<[CPred<"getElementTypeOrSelf($_op.getOperand(" # j # ")).isa()">, CPred<"getElementTypeOrSelf($_op.getOperand(" # j # ")).isUnsignedInteger(" # num # ")">]>]>; -class TFL_OperandHasRankLessThan : - PredOpTrait<"operand " # n # " is maximum " # m # "-D", +class TFL_TFOperandTypesWithSameBits : + And<[ + Or<[CPred<"getElementTypeOrSelf($_op.getOperand(" # i # ")).isa()">, + CPred<"getElementTypeOrSelf($_op.getOperand(" # i # ")).isUnsignedInteger(" # num # ")">]>, + Or<[CPred<"getElementTypeOrSelf($_op.getOperand(" # j # ")).isa()">, + CPred<"getElementTypeOrSelf($_op.getOperand(" # j # ")).isUnsignedInteger(" # num # ")">]>]>; + +class TFL_OperandIsNoneOrHasRankAtMost : + PredOpTrait<"operand " # n # " is at most " # m # "-D", + Or<[ + CPred<"$_op.getOperand(" # n # ").getType().isa()">, + TFL_OperandIsUnrankedPred, + CPred<"$_op.getOperand(" # n # + ").getType().cast().getRank() <= " # m>]>>; + +class TFL_OperandHasRankAtMost : + PredOpTrait<"operand " # n # " is at most " # m # "-D", Or<[TFL_OperandIsUnrankedPred, CPred<"$_op.getOperand(" # n # ").getType().cast().getRank() <= " # m>]>>; +class TFL_OperandHasRankAtLeast : + PredOpTrait<"operand " # n # " is at least " # m # "-D", + Or<[TFL_OperandIsUnrankedPred, + CPred<"$_op.getOperand(" # n # + ").getType().cast().getRank() >= " # m>]>>; + +class TFL_OperandHasRankRange : + PredOpTrait<"operand " # n # " has rank range [" # x # ", " # y # "]", + Or<[TFL_OperandIsUnrankedPred, + CPred<"$_op.getOperand(" # n # ").getType().cast().getRank() " + ">= " # x # " && $_op.getOperand(" # n # ").getType().cast()." + "getRank() <= " # y>]>>; + +def TFL_FloatNonNegative : AttrConstraint< + CPred<"!$_self.cast().getValue().isNegative()">, + "whose value is non-negative">; + +def TFL_BoolTrue: AttrConstraint< + CPred<"$_self.cast().getValue()">, + "whose value is true">; + +def TFL_BoolFalse: AttrConstraint< + CPred<"!$_self.cast().getValue()">, + "whose value is false">; + // This is a quantization-aware version of TCresVTEtIsSameAsOp class TFL_TCresVTEtIsSameAsOp : And<[ TCOpResIsShapedTypePred, @@ -256,21 +306,46 @@ class TFL_TCresVTEtIsSameAsOp : And<[ "getElementTypeOrSelf($_op.getResult(" # i # "))) == " "quant::QuantizedType::castToStorageType(" "getElementTypeOrSelf($_op.getOperand(" # j # ")))">]>]>]>; + +// This is a quantization-aware version of TCresVTEtIsSameAsOp +class TFL_TCopVTEtAreSameAt : Or<[ + TCopVTEtAreSameAt<[i, j]>, + TFL_TFOperandTypesWithSameBits, + And<[ + SubstLeaves<"$_self", "getElementTypeOrSelf($_op.getOperand(" # j # "))", + quant_QuantizedType.predicate>, + CPred<"quant::QuantizedType::castToStorageType(" + "getElementTypeOrSelf($_op.getOperand(" # i # "))) == " + "quant::QuantizedType::castToStorageType(" + "getElementTypeOrSelf($_op.getOperand(" # j # ")))">]>]>; + //===----------------------------------------------------------------------===// // TFL op common constraints. //===----------------------------------------------------------------------===// // This is a constraint for most of the binary ops, e.g., add, mul, div, etc. -// Binary ops lhs & rhs should have the same value type. +// Binary ops lhs & rhs should have the same value type, and is capable to +// compare quantiziation types as well. 
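The relaxed BinaryOpSameElementTypeConstraint defined just below accepts either identical element types or two quantized operands whose underlying storage types agree. Expressed as ordinary C++ rather than a TableGen predicate, the check is roughly the following (a sketch only; `ElemType` and `SameElementTypeOrStorage` are illustrative stand-ins for the MLIR types and predicate):

#include <optional>
#include <string>

// A toy stand-in for an MLIR element type: either a plain type name, or a
// quantized type that carries the name of its storage type (e.g. "i8").
struct ElemType {
  std::string name;                    // e.g. "f32" or a quantized type string
  std::optional<std::string> storage;  // set only for quantized types
};

// Operands are compatible if the element types match exactly, or if both are
// quantized and their storage types match -- the two arms of the constraint.
bool SameElementTypeOrStorage(const ElemType& lhs, const ElemType& rhs) {
  if (lhs.name == rhs.name) return true;
  return lhs.storage && rhs.storage && *lhs.storage == *rhs.storage;
}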
def BinaryOpSameElementTypeConstraint : - PredOpTrait<"operands have same element type", TCopVTEtIsSameAs<0, 1>>; + PredOpTrait<"operands have same element type", + Or<[ + TCopVTEtIsSameAs<0, 1>, + // Two operands' values are both quantized and their type have the same + // underlying storage type. + And<[ + SubstLeaves<"$_self", "getElementTypeOrSelf($_op.getOperand(0))", + quant_QuantizedType.predicate>, + CPred<"quant::QuantizedType::castToStorageType(" + "getElementTypeOrSelf($_op.getOperand(0))) == " + "quant::QuantizedType::castToStorageType(" + "getElementTypeOrSelf($_op.getOperand(1)))">]>]>>; //===----------------------------------------------------------------------===// // TFL common builders. //===----------------------------------------------------------------------===// def TFL_BroadcastableBinaryBuilder : OpBuilder< - "Builder *builder, OperationState &result, Value lhs, Value rhs", + "OpBuilder &builder, OperationState &result, Value lhs, Value rhs", [{ auto resultType = OpTrait::util::getBroadcastedType(lhs.getType(), rhs.getType()); @@ -281,17 +356,17 @@ def TFL_BroadcastableBinaryBuilder : OpBuilder< }]>; def TFL_FusedBroadcastableBinaryBuilder : OpBuilder< - "Builder *builder, OperationState &result, Value lhs, Value rhs, " + "OpBuilder &builder, OperationState &result, Value lhs, Value rhs, " "StringAttr fusedActivationFunction", [{ buildFusedBroadcastableBinOp( - builder, result, lhs, rhs, fusedActivationFunction); + &builder, result, lhs, rhs, fusedActivationFunction); }]>; def TFL_ComparisonBinaryBuilder : OpBuilder< - "Builder *builder, OperationState &result, Value lhs, Value rhs", + "OpBuilder &builder, OperationState &result, Value lhs, Value rhs", [{ - buildComparisonBinOp(builder, result, lhs, rhs); + buildComparisonBinOp(&builder, result, lhs, rhs); }]>; //===----------------------------------------------------------------------===// @@ -339,9 +414,9 @@ class TFL_ConvOp : }]; let arguments = ( - ins TFL_TensorOf<[F32, QI8, QUI8]>:$input, + ins TFL_TensorOf<[F32, QI8, QUI8, QI16]>:$input, TFL_TensorOf<[F32, QI8, QUI8]>:$filter, - TFL_TensorOfOrNone<[F32, I32]>:$bias, + TFL_TensorOfOrNone<[F32, I32, I64]>:$bias, I32Attr:$dilation_h_factor, I32Attr:$dilation_w_factor, TFL_AFAttr:$fused_activation_function, @@ -350,7 +425,7 @@ class TFL_ConvOp : I32Attr:$stride_w ); - let results = (outs TFL_TensorOf<[F32, QI8, QUI8]>:$output); + let results = (outs TFL_TensorOf<[F32, QI8, QUI8, QI16]>:$output); let hasOptions = 0b1; } @@ -450,7 +525,7 @@ retained with length 1. } def TFL_TransposeConvOp: - TFL_Op<"transpose_conv", [NoSideEffect]> { + TFL_Op<"transpose_conv", [NoSideEffect, TFL_GpuTargetOp]> { let summary = "Transpose convolution operator"; let description = [{ @@ -461,6 +536,7 @@ def TFL_TransposeConvOp: TFL_1DTensorOf<[I32]>:$output_shape, TFL_TensorOf<[F32, TFL_Uint8, QI8, QUI8]>:$weights, TFL_TensorOf<[F32, TFL_Uint8, QI8, QUI8]>:$input, + TFL_TensorOfOrNone<[F32, QI32, QUI32]>:$bias, TFL_PaddingAttr:$padding, I32Attr:$stride_h, I32Attr:$stride_w @@ -473,33 +549,6 @@ def TFL_TransposeConvOp: let verifier = [{ return Verify(*this); }]; } -def TFL_Convolution2DTransposeBiasOp : - Op { - let summary = " Transpose convolution with bias operator"; - - let description = [{ -Performs transpose convolution operation on inputs, -with the option of adding a bias. -Note this is a custom op that is not supported in the standard runtime. 
- - Inputs: - `inputs[0]`: required: the input activation tensor - `inputs[1]`: required: the filter weight tensor - `inputs[2]`: optional: the bias tensor - }]; - - let arguments = ( - ins AnyTensor:$input, - AnyTensor:$filter, - TFL_TensorOfOrNone<[AnyType]>:$bias, - TFL_PaddingAttr:$padding, - I32Attr:$stride_h, - I32Attr:$stride_w - ); - - let results = (outs AnyTensor:$output); -} - def TFL_AveragePool2DOp: TFL_Op<"average_pool_2d", [NoSideEffect, @@ -549,6 +598,8 @@ def TFL_ArgMaxOp : TFL_Op<"arg_max", [NoSideEffect]> { return getResult().getType().cast().getElementType(). cast().getWidth() > 32 ? tflite::TensorType_INT64 : tflite::TensorType_INT32; + }], [{ + TypeAttr::get(getResult().getType().cast().getElementType()) }]>; } @@ -577,6 +628,8 @@ def TFL_ArgMinOp : TFL_Op<"arg_min", [NoSideEffect]> { return getResult().getType().cast().getElementType(). cast().getWidth() > 32 ? tflite::TensorType_INT64 : tflite::TensorType_INT32; + }], [{ + TypeAttr::get(getResult().getType().cast().getElementType()) }]>; } @@ -608,14 +661,14 @@ def TFL_ConcatenationOp : TFL_Op<"concatenation", let arguments = ( ins TFL_VariadicTensorOf< - [F32, I64, I32, I16, I8, QI8, QUI8, QI16, TFL_Uint8]>:$values, + [F32, I64, I32, I16, I8, QI8, QUI8, TFL_Uint8]>:$values, I32Attr:$axis, TFL_AFAttr:$fused_activation_function ); let results = (outs TFL_TensorOf< - [F32, I64, I32, I16, I8, QI8, QUI8, QI16, TFL_Uint8]>:$output + [F32, I64, I32, I16, I8, QI8, QUI8, TFL_Uint8]>:$output ); let hasOptions = 1; @@ -644,7 +697,7 @@ def TFL_ConstOp : Op ]; } @@ -804,9 +860,45 @@ def TFL_FullyConnectedOp : TFL_Op<"fully_connected", [ int GetChannelDimIndex() { return 0; } // SparseOpInterface: std::vector GetSparseOperands() { return {1}; } + std::vector> GetFloatBlockSize() { return {{1, 4}}; } + std::vector> GetQuantizedBlockSize() { return {{1, 16}}; } }]; } +def TFL_BatchMatMulOp : TFL_Op<"batch_matmul", [ + NoSideEffect, + TFL_OperandHasAtleastRank<0, 2>, + TFL_OperandHasAtleastRank<1, 2>, + SameOperandsAndResultElementType]> { + + let summary = "Batch Matrix Multiply Operator"; + + let description = [{ +Performs a batched matrix multiplication on the inputs. Follows the +conventions of TensorFlow BatchMatMulV2, with support for unknown dimensions +in the batch dimensions and broadcasting. 
+ + Inputs: + `inputs[0]`: required: input LHS + `inputs[1]`: required: input RHS + `adjoint_lhs`: optional: Transpose LHS (default false) + `adjoint_lhs`: optional: Transpose LHS (default false) + }]; + + let arguments = (ins + TFL_TensorOf<[F32]>:$x, + TFL_TensorOf<[F32]>:$y, + DefaultValuedAttr:$adj_x, + DefaultValuedAttr:$adj_y + ); + + let results = (outs + TFL_TensorOf<[F32]>:$output + ); + + let hasOptions = 1; +} + def TFL_GatherOp : TFL_Op<"gather", [ NoSideEffect, SameOperandsAndResultsScale, @@ -821,26 +913,29 @@ def TFL_GatherOp : TFL_Op<"gather", [ }]; let arguments = (ins - TFL_TensorOf<[F32, I1, I8, I32, I64, TFL_Str, QI8, QUI8, QI16]>:$params, + TFL_TensorOf<[F32, I1, I8, I32, I64, TFL_Str, UI8, QI8, QUI8]>:$params, TFL_TensorOf<[I32, I64]>:$indices, I32Attr:$axis ); let builders = [ - OpBuilder<"Builder *builder, OperationState &result, " + OpBuilder<"OpBuilder &builder, OperationState &result, " "Value params, Value indices, IntegerAttr axis", - [{ BuildGatherOp(builder, result, params, indices, axis); }]> + [{ BuildGatherOp(&builder, result, params, indices, axis); }]> ]; let results = (outs - TFL_TensorOf<[F32, I1, I8, I32, I64, TFL_Str, QI8, QUI8, QI16]>:$output + TFL_TensorOf<[F32, I1, I8, I32, I64, TFL_Str, UI8, QI8, QUI8]>:$output ); let hasOptions = 1; } -def TFL_GatherNdOp : TFL_Op<"gather_nd", [NoSideEffect]> { +def TFL_GatherNdOp : TFL_Op<"gather_nd", [ + NoSideEffect, + PredOpTrait<"params and output must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 0>>]> { let summary = "Gather_nd operator"; let description = [{ @@ -857,9 +952,41 @@ def TFL_GatherNdOp : TFL_Op<"gather_nd", [NoSideEffect]> { ); } +def TFL_ScatterNdOp : TFL_Op<"scatter_nd", [ + NoSideEffect, + TFL_OperandHasAtleastRank<0, 1>, + TFL_OperandHasAtleastRank<1, 1>, + PredOpTrait<"updates and output must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 1>> + ]> { + let summary = "Scatter_nd operator"; + + let description = [{ + Scatter `updates` into a new tensor according to `indices` + }]; + + let arguments = (ins + TFL_TensorOf<[I32]>:$indices, + TFL_TensorOf<[F32, I8, I64, I32, TFL_Uint8]>:$updates, + TFL_1DTensorOf<[I32]>:$shape + ); + + let results = (outs + TFL_TensorOf<[F32, I8, I64, I32, TFL_Uint8]>:$output + ); + + let verifier = [{ return Verify(*this); }]; + + let hasOptions = 1; +} + // Same type check of lhs and rhs is handled by the ResultsBroadcastableShape trait. 
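For the new tfl.batch_matmul op above, support for "unknown dimensions in the batch dimensions and broadcasting" means the leading dimensions broadcast NumPy-style while the trailing two are matrix-multiplied, with adj_x/adj_y transposing the last two dims of the corresponding operand. A standalone shape-inference sketch under those assumptions (illustrative only; -1 stands for an unknown dimension and is propagated conservatively):

#include <algorithm>
#include <cstdint>
#include <optional>
#include <utility>
#include <vector>

// Broadcast two batch dimensions; -1 means "unknown", broadcasts against 1,
// and is otherwise kept as unknown.
std::optional<int64_t> BroadcastDim(int64_t a, int64_t b) {
  if (a == b) return a;
  if (a == 1) return b;
  if (b == 1) return a;
  if (a == -1 || b == -1) return -1;
  return std::nullopt;  // incompatible static dimensions
}

// Infer the result shape of a BatchMatMulV2-style multiply of x [.., m, k]
// and y [.., k, n]; adj_x/adj_y swap the last two dims of that operand.
std::optional<std::vector<int64_t>> BatchMatMulShape(
    std::vector<int64_t> x, std::vector<int64_t> y, bool adj_x, bool adj_y) {
  if (x.size() < 2 || y.size() < 2) return std::nullopt;
  if (adj_x) std::swap(x[x.size() - 2], x[x.size() - 1]);
  if (adj_y) std::swap(y[y.size() - 2], y[y.size() - 1]);

  const int64_t m = x[x.size() - 2], kx = x[x.size() - 1];
  const int64_t ky = y[y.size() - 2], n = y[y.size() - 1];
  if (kx != -1 && ky != -1 && kx != ky) return std::nullopt;

  std::vector<int64_t> xb(x.begin(), x.end() - 2), yb(y.begin(), y.end() - 2);
  std::vector<int64_t> out(std::max(xb.size(), yb.size()), 1);
  // Right-align the batch dims and broadcast pairwise.
  for (size_t i = 0; i < out.size(); ++i) {
    int64_t a = i < out.size() - xb.size() ? 1 : xb[i - (out.size() - xb.size())];
    int64_t b = i < out.size() - yb.size() ? 1 : yb[i - (out.size() - yb.size())];
    auto d = BroadcastDim(a, b);
    if (!d) return std::nullopt;
    out[i] = *d;
  }
  out.push_back(m);
  out.push_back(n);
  return out;
}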
def TFL_LessEqualOp : TFL_Op<"less_equal", [ - ResultsBroadcastableShape, NoSideEffect, NoQuantizableResult]> { + ResultsBroadcastableShape, + BinaryOpSameElementTypeConstraint, + TFL_BinaryOperandsHaveSameShapesOrBroadcastableShape<0, 1, 4>, + NoSideEffect, + NoQuantizableResult]> { let summary = "Less_equal operator"; let description = [{ @@ -867,8 +994,8 @@ def TFL_LessEqualOp : TFL_Op<"less_equal", [ }]; let arguments = ( - ins TFL_TensorOf<[F32, I32, I64, I8, QI8, QUI8, TFL_Uint8]>:$lhs, - TFL_TensorOf<[F32, I32, I64, I8, QI8, QUI8, TFL_Uint8]>:$rhs); + ins TFL_TensorOf<[F32, I32, I64, QI8, QUI8]>:$lhs, + TFL_TensorOf<[F32, I32, I64, QI8, QUI8]>:$rhs); let results = (outs TFL_BoolTensor:$output); @@ -881,9 +1008,12 @@ def TFL_LessEqualOp : TFL_Op<"less_equal", [ let hasOptions = 0; } -def TFL_LocalResponseNormalizationOp : TFL_Op<"local_response_normalization", - [NoSideEffect]> { - let summary = "Local Response Normalization."; +def TFL_LocalResponseNormalizationOp : TFL_Op<"local_response_normalization", [ + TFL_OperandHasRank<0, 4>, + SameOperandsAndResultShape, + SameOperandsAndResultType, + NoSideEffect]> { + let summary = "Local Response Normalization."; let description = [{ The 4-D `input` tensor is treated as a 3-D array of 1-D vectors (along the last @@ -900,7 +1030,7 @@ convolutional neural networks (NIPS 2012)](http://papers.nips.cc/paper/4824-imag }]; let arguments = (ins - TFL_TensorOf<[F32, QI8, QUI8]>:$input, + TFL_FpTensor:$input, I32Attr:$radius, F32Attr:$bias, F32Attr:$alpha, @@ -908,14 +1038,17 @@ convolutional neural networks (NIPS 2012)](http://papers.nips.cc/paper/4824-imag ); let results = (outs - TFL_TensorOf<[F32, QI8, QUI8]>:$output + TFL_FpTensor:$output ); let hasOptions = 1; } def TFL_GreaterEqualOp : TFL_Op<"greater_equal", [ - ResultsBroadcastableShape, NoSideEffect, NoQuantizableResult]> { + TFL_BinaryOperandsHaveSameShapesOrBroadcastableShape<0, 1, 4>, + ResultsBroadcastableShape, + NoSideEffect, + NoQuantizableResult]> { let summary = "Greater_equal operator"; let description = [{ @@ -923,8 +1056,8 @@ def TFL_GreaterEqualOp : TFL_Op<"greater_equal", [ }]; let arguments = ( - ins AnyTensor:$lhs, - AnyTensor:$rhs); + ins TFL_TensorOf<[F32, I32, I64, QUI8, QI8]>:$lhs, + TFL_TensorOf<[F32, I32, I64, QUI8, QI8]>:$rhs); let results = (outs TFL_BoolTensor:$output); @@ -941,7 +1074,7 @@ def TFL_MatrixDiagOp : TFL_Op<"matrix_diag", [ NoSideEffect, TFL_OperandHasAtleastRank<0, 1>, PredOpTrait<"operand and result must have the same element type", - TCresVTEtIsSameAsOp<0, 0>>]> { + TFL_TCresVTEtIsSameAsOp<0, 0>>]> { let summary = [{ Returns a tensor with the provided diagonal and everything else padded with zeros. }]; @@ -954,17 +1087,21 @@ def TFL_MatrixDiagOp : TFL_Op<"matrix_diag", [ }]; let arguments = (ins - TFL_TensorOf<[F32, I8, I64, I32, TFL_Uint8]>:$diagonal + TFL_TensorOf<[F32, I8, I16, I32, I64, TFL_Uint8, QUI8, QI8, TFL_Quint8]>:$diagonal ); let results = (outs - TFL_TensorOf<[F32, I8, I64, I32, TFL_Uint8]>:$output + TFL_TensorOf<[F32, I8, I16, I32, I64, TFL_Uint8, QUI8, QI8, TFL_Quint8]>:$output ); let hasOptions = 0; } -def TFL_MatrixSetDiagOp : TFL_Op<"matrix_set_diag", [NoSideEffect]> { +def TFL_MatrixSetDiagOp : TFL_Op<"matrix_set_diag", [ + TFL_OperandHasAtleastRank<0, 2>, + PredOpTrait<"input and result must have the same element type", + TFL_TCresVTEtIsSameAsOp<0, 0>>, + NoSideEffect]> { let summary = [{ Returns a batched matrix tensor with new batched diagonal values. }]; @@ -976,12 +1113,12 @@ innermost matrices. 
These will be overwritten by the values in `diagonal`. }]; let arguments = (ins - TensorOf<[F32, I32, I64, I8, QI8, QI16, QUI8, TFL_Uint8, TFL_Quint8]>:$input, - TensorOf<[F32, I32, I64, I8, QI8, QI16, QUI8, TFL_Uint8, TFL_Quint8]>:$diagonal + TensorOf<[F32, I8, I16, I32, I64, UI8, QI8, QI16, QUI8, TFL_Quint8]>:$input, + TensorOf<[F32, I8, I16, I32, I64, UI8, QI8, QI16, QUI8, TFL_Quint8]>:$diagonal ); let results = (outs - TensorOf<[F32, I32, I64, I8, QI8, QI16, QUI8, TFL_Uint8, TFL_Quint8]>:$output + TensorOf<[F32, I8, I16, I32, I64, UI8, QI8, QI16, QUI8, TFL_Quint8]>:$result ); let hasOptions = 0; @@ -1099,7 +1236,12 @@ larger than 0. } def TFL_NotEqualOp : TFL_Op<"not_equal", [ - ResultsBroadcastableShape, Commutative, NoSideEffect, NoQuantizableResult]> { + TFL_BinaryOperandsHaveSameShapesOrBroadcastableShape<0, 1, 4>, + BinaryOpSameElementTypeConstraint, + ResultsBroadcastableShape, + Commutative, + NoSideEffect, + NoQuantizableResult]> { let summary = "Not_equal operator"; let description = [{ @@ -1107,17 +1249,17 @@ def TFL_NotEqualOp : TFL_Op<"not_equal", [ }]; let arguments = ( - ins AnyTensor:$lhs, - AnyTensor:$rhs); + ins TFL_TensorOf<[I1, F32, I32, I64, QUI8, QI8, TFL_Quint8, TFL_Str]>:$lhs, + TFL_TensorOf<[I1, F32, I32, I64, QUI8, QI8, TFL_Quint8, TFL_Str]>:$rhs); let results = (outs TFL_BoolTensor:$output); let builders = [ OpBuilder< - "Builder *builder, OperationState &result, Value lhs, Value rhs", + "OpBuilder &builder, OperationState &result, Value lhs, Value rhs", [{ - buildComparisonBinOp(builder, result, lhs, rhs); + buildComparisonBinOp(&builder, result, lhs, rhs); }]> ]; @@ -1175,7 +1317,9 @@ def TFL_EluOp: TFL_Op<"elu", [NoSideEffect, SameOperandsAndResultType]> { def TFL_EmbeddingLookupOp: TFL_Op<"embedding_lookup", [NoSideEffect, PredOpTrait<"value and output must have same element type", - TCresVTEtIsSameAsOp<0, 1>> + TFL_TCresVTEtIsSameAsOp<0, 1>>, + TFL_OperandHasRank<0, 1>, + TFL_OperandHasRankAtLeast<1, 2> ]> { let summary = "Embedding lookup operator"; @@ -1193,6 +1337,8 @@ def TFL_EmbeddingLookupOp: TFL_Op<"embedding_lookup", def TFL_EqualOp: TFL_Op<"equal", [Commutative, ResultsBroadcastableShape, NoQuantizableResult, + ResultsBroadcastableShape, + TFL_BinaryOperandsHaveSameShapesOrBroadcastableShape<0, 1, 4>, PredOpTrait<"Operands have same value type", TCopVTEtIsSameAs<0, 1>>]> { let summary = "Equal operator"; @@ -1202,8 +1348,8 @@ def TFL_EqualOp: TFL_Op<"equal", [Commutative, ResultsBroadcastableShape, let arguments = ( ins - TFL_TensorOf<[I1, F32, I32, I64, I8, QI8, QUI8, TFL_Uint8]>:$x, - TFL_TensorOf<[I1, F32, I32, I64, I8, QI8, QUI8, TFL_Uint8]>:$y + TFL_TensorOf<[I1, F32, I32, I64, QI8, QUI8, TFL_Uint8, TFL_Str]>:$x, + TFL_TensorOf<[I1, F32, I32, I64, QI8, QUI8, TFL_Uint8, TFL_Str]>:$y ); let results = (outs TFL_BoolTensor:$output); @@ -1228,7 +1374,10 @@ def TFL_ExpOp: TFL_Op<"exp", [NoSideEffect, } def TFL_ExpandDimsOp: TFL_Op<"expand_dims", [ - NoSideEffect, SameOperandsAndResultsScale]> { + NoSideEffect, + SameOperandsAndResultsScale, + PredOpTrait<"input and output must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 0>>]> { let summary = "Inserts a dimension of 1 into a tensor's shape."; let description = [{ @@ -1265,7 +1414,7 @@ size 1. }]; // TODO: Restriction on dim's size and valid range are not modeled here. 
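Since the TODO above notes that the valid range of `dim` for tfl.expand_dims is still not modeled, it may help to spell out the intended semantics: a size-1 dimension is inserted at index `dim`, and, following the TF op this mirrors, negative values count from the end with a valid range of [-(rank+1), rank]. A standalone sketch under that assumption:

#include <cstdint>
#include <optional>
#include <vector>

// Insert a size-1 dimension at `dim`; negative dims count from the back,
// so for a rank-r input the valid range is [-(r + 1), r].
std::optional<std::vector<int64_t>> ExpandDimsShape(
    std::vector<int64_t> shape, int64_t dim) {
  const int64_t rank = static_cast<int64_t>(shape.size());
  if (dim < -(rank + 1) || dim > rank) return std::nullopt;
  if (dim < 0) dim += rank + 1;
  shape.insert(shape.begin() + dim, 1);
  return shape;
}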
- let arguments = (ins AnyTensor:$input, TFL_IntTensor:$dim); + let arguments = (ins AnyTensor:$input, TFL_I32OrI64Tensor:$dim); let results = (outs AnyTensor:$output); @@ -1311,16 +1460,19 @@ shape(squeeze(t, [2, 4])) ==> [1, 2, 3, 1] let customOption = "SqueezeOptions"; } -def TFL_FillOp: TFL_Op<"fill", [NoSideEffect]> { +def TFL_FillOp: TFL_Op<"fill", [ + NoSideEffect, + PredOpTrait<"input and result must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 1>>]> { let summary = "Fill the tensor with given value."; let description = [{ Fill the tensor with given value. }]; let arguments = (ins TFL_I32OrI64Tensor:$dims, - AnyTensor:$value); + TFL_TensorOf<[F32, I32, I64, I1, TFL_Str]>:$input); - let results = (outs AnyTensor:$res); + let results = (outs TFL_TensorOf<[F32, I32, I64, I1, TFL_Str]>:$result); let hasOptions = 0; } @@ -1338,7 +1490,12 @@ def TFL_FloorOp: TFL_Op<"floor", [NoSideEffect, SameOperandsAndResultType]> { } def TFL_FloorDivOp : TFL_Op<"floor_div", [ - ResultsBroadcastableShape, NoSideEffect, BinaryOpSameElementTypeConstraint]> { + ResultsBroadcastableShape, + NoSideEffect, + BinaryOpSameElementTypeConstraint, + PredOpTrait<"lhs and output must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 0>>, + TFL_BinaryOperandsHaveSameShapesOrBroadcastableShape<0, 1, 4>]> { let summary = "Floor div operator"; let description = [{ @@ -1346,9 +1503,9 @@ def TFL_FloorDivOp : TFL_Op<"floor_div", [ }]; let arguments = ( - ins AnyTensor:$lhs, AnyTensor:$rhs); + ins TFL_TensorOf<[F32, I32]>:$lhs, TFL_TensorOf<[F32, I32]>:$rhs); - let results = (outs AnyTensor:$output); + let results = (outs TFL_TensorOf<[F32, I32]>:$output); let builders = [TFL_BroadcastableBinaryBuilder]; @@ -1357,7 +1514,13 @@ def TFL_FloorDivOp : TFL_Op<"floor_div", [ let printer = [{ return mlir::impl::printOneResultOp(getOperation(), p); }]; } -def TFL_FloorModOp : TFL_Op<"floor_mod", [ResultsBroadcastableShape, NoSideEffect]> { +def TFL_FloorModOp : TFL_Op<"floor_mod", [ + ResultsBroadcastableShape, + NoSideEffect, + BinaryOpSameElementTypeConstraint, + PredOpTrait<"lhs and output must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 0>>, + TFL_BinaryOperandsHaveSameShapesOrBroadcastableShape<0, 1, 4>]> { let summary = "Division reminder"; let description = [{ @@ -1374,7 +1537,11 @@ def TFL_FloorModOp : TFL_Op<"floor_mod", [ResultsBroadcastableShape, NoSideEffec } def TFL_GreaterOp : TFL_Op<"greater", [ - ResultsBroadcastableShape, NoSideEffect, NoQuantizableResult]> { + ResultsBroadcastableShape, + BinaryOpSameElementTypeConstraint, + TFL_BinaryOperandsHaveSameShapesOrBroadcastableShape<0, 1, 4>, + NoSideEffect, + NoQuantizableResult]> { let summary = "Greater operator"; let description = [{ @@ -1382,10 +1549,10 @@ def TFL_GreaterOp : TFL_Op<"greater", [ }]; let arguments = ( - ins AnyTensor:$lhs, - AnyTensor:$rhs); + ins TFL_TensorOf<[F32, I32, I64, QUI8, QI8, TFL_Quint8]>:$lhs, + TFL_TensorOf<[F32, I32, I64, QUI8, QI8, TFL_Quint8]>:$rhs); - let results = (outs AnyTensor:$output); + let results = (outs TFL_BoolTensor:$output); let builders = [TFL_ComparisonBinaryBuilder]; @@ -1394,9 +1561,12 @@ def TFL_GreaterOp : TFL_Op<"greater", [ let printer = [{ return mlir::impl::printOneResultOp(getOperation(), p); }]; } -def TFL_HardSwishOp: TFL_Op<"hard_swish", [NoSideEffect, - SameOperandsAndResultShape, - TFL_GpuTargetOp]> { +def TFL_HardSwishOp: TFL_Op<"hard_swish", [ + NoSideEffect, + SameOperandsAndResultShape, + PredOpTrait<"input and output must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 
0>>, + TFL_GpuTargetOp]> { let summary = "Hardswish activation function."; let description = [{ Computes hard-swish activation function @@ -1406,7 +1576,7 @@ def TFL_HardSwishOp: TFL_Op<"hard_swish", [NoSideEffect, let arguments = (ins TFL_TensorOf<[F32, QUI8, QI8]>:$input); - let results = (outs TFL_TensorOf<[F32, QUI8, QI8]>:$out); + let results = (outs TFL_TensorOf<[F32, QUI8, QI8]>:$output); let hasOptions = 0; } @@ -1435,29 +1605,35 @@ def TFL_L2NormalizationOp : TFL_Op<"l2_normalization", [NoSideEffect, let customOption = "L2NormOptions"; } -def TFL_LeakyReluOp: TFL_Op<"leaky_relu", [NoSideEffect, SameOperandsAndResultType]> { +def TFL_LeakyReluOp: TFL_Op<"leaky_relu", [ + SameOperandsAndResultShape, + NoSideEffect, + PredOpTrait<"input and output must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 0>>]> { let summary = "Leaky Relu operator"; - // TODO(jpienaar): Add type restriction. This op is only defined for - // restricted (floating point) types. let description = [{ Element-wise Leaky ReLU operator x -> x >= 0 ? x : (alpha * x) }]; let arguments = ( - ins AnyTensor:$input, + ins TFL_TensorOf<[F32, QUI8, QI8, TFL_Quint8]>:$input, // Slope of the activation function at x < 0. F32Attr:$alpha ); - let results = (outs AnyTensor:$output); + let results = (outs TFL_TensorOf<[F32, QUI8, QI8, TFL_Quint8]>:$output); let hasOptions = 0b1; } def TFL_LessOp : TFL_Op<"less", [ - ResultsBroadcastableShape, NoSideEffect, NoQuantizableResult]> { + ResultsBroadcastableShape, + BinaryOpSameElementTypeConstraint, + TFL_BinaryOperandsHaveSameShapesOrBroadcastableShape<0, 1, 4>, + NoSideEffect, + NoQuantizableResult]> { let summary = "Less operator"; let description = [{ @@ -1465,8 +1641,8 @@ def TFL_LessOp : TFL_Op<"less", [ }]; let arguments = ( - ins AnyTensor:$lhs, - AnyTensor:$rhs); + ins TFL_TensorOf<[F32, I32, I64, QUI8, QI8, TFL_Quint8]>:$lhs, + TFL_TensorOf<[F32, I32, I64, QUI8, QI8, TFL_Quint8]>:$rhs); let results = (outs TFL_BoolTensor:$output); @@ -1527,6 +1703,8 @@ def TFL_LogicalOrOp : TFL_Op<"logical_or", [NoSideEffect]> { def TFL_LogisticOp: TFL_Op<"logistic", [ NoSideEffect, + PredOpTrait<"x and y must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 0>>, SameOperandsAndResultShape, // zero_point = 0 // scale = 1. / (max_value + 1) @@ -1539,9 +1717,9 @@ def TFL_LogisticOp: TFL_Op<"logistic", [ Computes element-wise Sigmoid of input }]; - let arguments = (ins TFL_TensorOf<[F32, QI8, QUI8, QI16, QUI16]>:$x); + let arguments = (ins TFL_TensorOf<[F32, QI8, QUI8, QI16, TFL_Quint8]>:$x); - let results = (outs TFL_TensorOf<[F32, QI8, QUI8, QI16, QUI16]>:$y); + let results = (outs TFL_TensorOf<[F32, QI8, QUI8, QI16, TFL_Quint8]>:$y); } def TFL_LogOp: TFL_Op<"log", [ @@ -1562,10 +1740,11 @@ def TFL_LogOp: TFL_Op<"log", [ let hasFolder = 1; } -// TODO(b/130643170): Adds some constraint for the input/output element types. 
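The tfl.log_softmax definition that follows computes input - log(reduce_sum(exp(input), dim)). A standalone per-vector sketch, written in the usual max-subtracted form that is algebraically identical but avoids overflow in exp():

#include <algorithm>
#include <cmath>
#include <vector>

// log_softmax(x) = x - log(sum(exp(x))); subtracting the max first is the
// standard equivalent form that keeps exp() in a safe range.
std::vector<float> LogSoftmax(const std::vector<float>& x) {
  if (x.empty()) return {};
  const float max_x = *std::max_element(x.begin(), x.end());
  float sum = 0.0f;
  for (float v : x) sum += std::exp(v - max_x);
  const float log_sum = std::log(sum) + max_x;

  std::vector<float> y(x.size());
  for (size_t i = 0; i < x.size(); ++i) y[i] = x[i] - log_sum;
  return y;
}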
def TFL_LogSoftmaxOp : TFL_Op<"log_softmax", [ NoSideEffect, SameOperandsAndResultShape, + PredOpTrait<"x and y must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 0>>, // zero_point = max_value // scale = -log_softmax_output_min / (max_value + 1) FixedResultScale>, @@ -1578,9 +1757,9 @@ def TFL_LogSoftmaxOp : TFL_Op<"log_softmax", [ input - log(reduce_sum(exp(input), dim)) }]; - let arguments = (ins AnyTensor:$input); + let arguments = (ins TFL_TensorOf<[F32, QUI8, QI8, TFL_Quint8]>:$input); - let results = (outs AnyTensor:$output); + let results = (outs TFL_TensorOf<[F32, QUI8, QI8, TFL_Quint8]>:$output); let hasOptions = 1; } @@ -1599,6 +1778,9 @@ def MaxPoolOperandAndResultConstraints : PredOpTrait<"MaxPool2D operand and " TFL_TCresVTEtIsSameAsOp<0, 0>]>>; def TFL_MaxPool2DOp : TFL_Op<"max_pool_2d", [ + TFL_OperandHasRank<0, 4>, + PredOpTrait<"input and output must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 0>>, NoSideEffect, MaxPoolOperandAndResultConstraints, SameOperandsAndResultsScale, @@ -1613,7 +1795,7 @@ def TFL_MaxPool2DOp : TFL_Op<"max_pool_2d", [ }]; let arguments = ( - ins AnyTensor:$input, + ins TFL_TensorOf<[F32, QUI8, QI8, QI16, TFL_Quint8]>:$input, TFL_PaddingAttr:$padding, I32Attr:$stride_w, I32Attr:$stride_h, @@ -1622,70 +1804,13 @@ def TFL_MaxPool2DOp : TFL_Op<"max_pool_2d", [ TFL_AFAttr:$fused_activation_function ); - let results = (outs AnyTensor:$output); + let results = (outs TFL_TensorOf<[F32, QUI8, QI8, QI16, TFL_Quint8]>:$output); let hasOptions = 1; let customOption = "Pool2DOptions"; } -def TFL_MaxPoolingWithArgMax2DOp : - Op { - let summary = "Max Pool 2D with argmax op"; - - let description = [{ - Performs max pooling on the input and outputs both max values and indices. - Each index is a flatten index in a sub-array of "filter_w" x "filter_h" size - Note this is a custom op that is not supported in the standard runtime. - - Inputs: - `inputs[0]`: required: the input activation tensor - }]; - - let arguments = ( - ins AnyTensor:$input, - TFL_PaddingAttr:$padding, - I32Attr:$stride_w, - I32Attr:$stride_h, - I32Attr:$filter_w, - I32Attr:$filter_h - ); - - let results = (outs - AnyTensor:$value, - AnyTensor:$indices - ); -} - -def TFL_MaxUnpooling2DOp : - Op { - let summary = "Max Unpool 2D"; - - let description = [{ - Performs max unpool operation. - To some extent this is the reverse operation of max pooling: - the elements in the input activation tensor is stored into the position - specified by the input indices. - Note this is a custom op that is not supported in the standard runtime. 
- - Inputs: - `inputs[0]`: required: the input activation tensor - `inputs[1]`: required: the input indices - }]; - - let arguments = ( - ins AnyTensor:$input, - AnyTensor:$indices, - TFL_PaddingAttr:$padding, - I32Attr:$stride_w, - I32Attr:$stride_h, - I32Attr:$filter_w, - I32Attr:$filter_h - ); - - let results = (outs AnyTensor:$outputs); -} - def TFL_MaximumOp : TFL_Op<"maximum", [ ResultsBroadcastableShape, NoSideEffect, @@ -1711,7 +1836,11 @@ def TFL_MaximumOp : TFL_Op<"maximum", [ let hasOptions = 0; } -def TFL_MeanOp : TFL_Op<"mean", [NoSideEffect]> { +def TFL_MeanOp : TFL_Op<"mean", [ + PredOpTrait<"input and output must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 0>>, + NoSideEffect, + TFL_GpuTargetOp]> { let summary = "Mean operator"; let description = [{ @@ -1723,13 +1852,13 @@ def TFL_MeanOp : TFL_Op<"mean", [NoSideEffect]> { }]; let arguments = (ins - TFL_TensorOf<[F32, I8, I32, I64, QI8, QUI8, TFL_Uint8]>:$input, + TFL_TensorOf<[F32, I32, I64, QI8, QUI8, TFL_Uint8]>:$input, TFL_TensorOf<[I32, I64]>:$axis, BoolAttr:$keep_dims ); let results = (outs - TFL_TensorOf<[F32, I32, I64, I8, QI8, QUI8, TFL_Uint8]>:$output); + TFL_TensorOf<[F32, I32, I64, QI8, QUI8, TFL_Uint8]>:$output); let hasOptions = 1; let customOption = "ReducerOptions"; @@ -1750,14 +1879,14 @@ def TFL_OneHotOp : TFL_Op<"one_hot", [NoSideEffect]> { let arguments = (ins TFL_TensorOf<[I32, I64]>:$indices, TFL_I32Tensor:$depth, - TFL_TensorOf<[F32, I32, I64, I1]>:$on_value, - TFL_TensorOf<[F32, I32, I64, I1]>:$off_value, + TFL_TensorOf<[F32, I32, I64, I1, I8, UI8]>:$on_value, + TFL_TensorOf<[F32, I32, I64, I1, I8, UI8]>:$off_value, I32Attr:$axis ); let results = (outs - TFL_TensorOf<[F32, I32, I64, I1]>:$output + TFL_TensorOf<[F32, I32, I64, I1, I8, UI8]>:$output ); let hasOptions = 1; @@ -1771,11 +1900,11 @@ Rounds the values of a tensor to the nearest integer, element-wise. 
}]; let arguments = (ins - TFL_TensorOf<[F32]>:$x + TFL_FpTensor:$x ); let results = (outs - TFL_TensorOf<[F32]>:$y + TFL_FpTensor:$y ); } @@ -1808,6 +1937,8 @@ equivalent to setting: ); let verifier = [{ return Verify(*this); }]; + + let hasCanonicalizer = 1; } def TFL_SumOp: TFL_Op<"sum", [NoSideEffect]> { @@ -1916,6 +2047,8 @@ def TFL_MinimumOp : TFL_Op<"minimum", [ def TFL_MulOp : TFL_Op<"mul", [ResultsBroadcastableShape, NoSideEffect, Commutative, + BinaryOpSameElementTypeConstraint, + TFL_BinaryOperandsHaveSameShapesOrBroadcastableShape<0, 1, 5>, TFL_GpuTargetOp]> { let summary = "Multiplication operator"; @@ -1957,7 +2090,11 @@ def TFL_NegOp: TFL_Op<"neg", [NoSideEffect, SameOperandsAndResultType]> { let hasFolder = 1; } -def TFL_PackOp : TFL_Op<"pack", [NoSideEffect, SameOperandsAndResultsScale]> { +def TFL_PackOp : TFL_Op<"pack", [ + PredOpTrait<"values and output must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 0>>, + NoSideEffect, + SameOperandsAndResultsScale]> { let summary = "Packs a list of tensors along a dimension into one tensor"; let description = [{ @@ -1988,14 +2125,14 @@ def TFL_PackOp : TFL_Op<"pack", [NoSideEffect, SameOperandsAndResultsScale]> { }]; let arguments = (ins - TFL_VariadicTensorOf<[F32, I8, I16, I32, I64, QI8, QUI8, QI16]>:$values, + TFL_VariadicTensorOf<[F32, I8, I16, I32, I64, UI8, QI8, QUI8, QI16, TFL_Quint8]>:$values, - I32Attr:$values_count, + Confined:$values_count, I32Attr:$axis ); let results = (outs - TFL_TensorOf<[F32, I8, I16, I32, I64, QI8, QUI8, QI16]>:$output + TFL_TensorOf<[F32, I8, I16, I32, I64, UI8, QI8, QUI8, QI16, TFL_Quint8]>:$output ); let verifier = [{ return Verify(*this); }]; @@ -2006,8 +2143,11 @@ def TFL_PackOp : TFL_Op<"pack", [NoSideEffect, SameOperandsAndResultsScale]> { } def TFL_PadOp : TFL_Op<"pad", [ + PredOpTrait<"input and output must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 0>>, NoSideEffect, SameOperandsAndResultsScale, + TFL_OperandHasRankAtMost<0, 4>, TFL_OperandHasRank<1, 2>, TFL_OperandRankEquals1DimOfOperand<0, 1>, TFL_GpuTargetOp]> { @@ -2038,22 +2178,25 @@ def TFL_PadOp : TFL_Op<"pad", [ ``` }]; - let arguments = (ins TFL_TensorOf<[F32, I8, I32, I64, QI8, QUI8]>:$input, + let arguments = (ins TFL_TensorOf<[F32, I32, I64, QI8, QUI8, TFL_Quint8]>:$input, TFL_I32OrI64Tensor:$padding); - let results = (outs TFL_TensorOf<[F32, I8, I32, I64, QI8, QUI8]>:$output); + let results = (outs TFL_TensorOf<[F32, I32, I64, QI8, QUI8, TFL_Quint8]>:$output); let hasOptions = 1; } def TFL_PadV2Op : TFL_Op<"padv2", [ + PredOpTrait<"input and output must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 0>>, NoSideEffect, SameOperandsAndResultsScale, + TFL_OperandHasRankAtMost<0, 4>, TFL_OperandHasRank<1, 2>, TFL_OperandHasRank<2, 0>, TFL_OperandRankEquals1DimOfOperand<0, 1>, PredOpTrait<"input and constant value operands must have same element type", - TCopVTEtAreSameAt<[0, 2]>>]> { + TFL_TCopVTEtAreSameAt<0, 2>>]> { let summary = "Padding operator v2"; let description = [{ @@ -2084,11 +2227,11 @@ def TFL_PadV2Op : TFL_Op<"padv2", [ }]; let arguments = ( - ins TFL_TensorOf<[F32, I8, I32, I64, QI8, QUI8]>:$input, + ins TFL_TensorOf<[F32, I32, I64, UI8, QI8, QUI8, TFL_Quint8]>:$input, TFL_I32OrI64Tensor:$padding, - TFL_TensorOf<[F32, I8, I32, I64]>:$constant_values); + TFL_TensorOf<[F32, I32, I64, UI8, QI8, QUI8, TFL_Quint8]>:$constant_values); - let results = (outs TFL_TensorOf<[F32, I8, I32, I64, QI8, QUI8]>:$output); + let results = (outs TFL_TensorOf<[F32, I32, I64, UI8, QI8, QUI8, 
TFL_Quint8]>:$output); let hasOptions = 1; } @@ -2116,7 +2259,21 @@ def TFL_PowOp : TFL_Op<"pow", [ResultsBroadcastableShape, let builders = [TFL_BroadcastableBinaryBuilder]; } -def TFL_PReluOp : TFL_Op<"prelu", [NoSideEffect, TFL_GpuTargetOp]> { +def TFL_PReluOp : TFL_Op<"prelu", [ + NoSideEffect, + ResultsBroadcastableShape, + TFL_GpuTargetOp, + TFL_OperandHasRankAtMost<0, 4>, + TFL_OperandHasRankAtMost<1, 4>, + BinaryOpSameElementTypeConstraint, + PredOpTrait<"input and output must have the same element type", + TFL_TCresVTEtIsSameAsOp<0, 0>>, + PredOpTrait<"'alpha' should have one less rank than 'input'.", + Or<[TFL_OperandIsUnrankedPred<0>, + TFL_OperandIsUnrankedPred<1>, + CPred<"$_op.getOperand(0).getType().cast().getRank() == " + "$_op.getOperand(1).getType().cast().getRank() " + "+ 1">]>>]> { let summary = "Parameterized Relu operator"; let description = [{ @@ -2129,11 +2286,11 @@ def TFL_PReluOp : TFL_Op<"prelu", [NoSideEffect, TFL_GpuTargetOp]> { }]; let arguments = ( - ins TFL_TensorOf<[F32, QUI8]>:$input, - TFL_TensorOf<[F32, QUI8]>:$alpha + ins TFL_TensorOf<[F32, QI8, QUI8, TFL_Quint8]>:$input, + TFL_TensorOf<[F32, QI8, QUI8, TFL_Quint8]>:$alpha ); - let results = (outs TFL_TensorOf<[F32, QUI8]>:$output); + let results = (outs TFL_TensorOf<[F32, QI8, QUI8, TFL_Quint8]>:$output); let verifier = [{ return Verify(*this); }]; } @@ -2165,6 +2322,17 @@ def TFL_ReluOp: TFL_Op<"relu", [NoSideEffect, let arguments = (ins TFL_TensorOf<[F32, QUI8, I8]>:$x); let results = (outs TFL_TensorOf<[F32, QUI8, I8]>:$y); + + // This builder doesn't work with quantized type, so it can only be used by + // non-quantization tablegen patterns. Currently, it is used by the + // elementwise-move reordering pattern in the optimize_patterns.td + let builders = [OpBuilder< + "OpBuilder &, OperationState &state, Value input", + [{ + state.addOperands({input}); + state.addTypes(input.getType()); + }]> + ]; } def TFL_Relu6Op: TFL_Op<"relu6", [NoSideEffect, @@ -2181,6 +2349,17 @@ def TFL_Relu6Op: TFL_Op<"relu6", [NoSideEffect, let arguments = (ins TFL_TensorOf<[F32, QUI8, I8]>:$x); let results = (outs TFL_TensorOf<[F32, QUI8, I8]>:$y); + + // This builder doesn't work with quantized type, so it can only be used by + // non-quantization tablegen patterns. Currently, it is used by the + // elementwise-move reordering pattern in the optimize_patterns.td + let builders = [OpBuilder< + "OpBuilder &, OperationState &state, Value input", + [{ + state.addOperands({input}); + state.addTypes(input.getType()); + }]> + ]; } def TFL_Relu1Op: TFL_Op<"relu_n1_to_1", [NoSideEffect, @@ -2196,6 +2375,17 @@ def TFL_Relu1Op: TFL_Op<"relu_n1_to_1", [NoSideEffect, let arguments = (ins TFL_TensorOf<[F32, QUI8, I8]>:$x); let results = (outs TFL_TensorOf<[F32, QUI8, I8]>:$y); + + // This builder doesn't work with quantized type, so it can only be used by + // non-quantization tablegen patterns. 
Currently, it is used by the + // elementwise-move reordering pattern in the optimize_patterns.td + let builders = [OpBuilder< + "OpBuilder &, OperationState &state, Value input", + [{ + state.addOperands({input}); + state.addTypes(input.getType()); + }]> + ]; } def TFL_ReshapeOp: TFL_Op<"reshape", [ @@ -2257,9 +2447,9 @@ def TFL_RsqrtOp: TFL_Op<"rsqrt", [NoSideEffect, Computes element-wise reverse square root of input }]; - let arguments = (ins AnyTensor:$x); + let arguments = (ins TFL_FpTensor:$x); - let results = (outs AnyTensor:$y); + let results = (outs TFL_FpTensor:$y); let hasFolder = 1; } @@ -2360,7 +2550,7 @@ def TFL_SelectOp : TFL_Op<"select", [NoSideEffect, let results = (outs AnyTensor:$output); // TODO(jpienaar): autogenerate this. - let builders = [OpBuilder<"Builder *builder, OperationState &result, " + let builders = [OpBuilder<"OpBuilder &builder, OperationState &result, " "Value condition, Value x, Value y", [{ auto resultType = x.getType(); @@ -2388,10 +2578,10 @@ def TFL_SelectV2Op : TFL_Op<"select_v2", [NoSideEffect]> { TFL_TensorOf<[F32, I1, I8, I16, I32, I64, TFL_Uint8]>:$y); let results = (outs AnyTensor:$output); - let builders = [OpBuilder<"Builder *builder, OperationState &result, " + let builders = [OpBuilder<"OpBuilder &builder, OperationState &result, " "Value cond, Value x, Value y", [{ - BuildSelectV2Op(builder, result, cond, x, y); + BuildSelectV2Op(&builder, result, cond, x, y); }]>]; let hasOptions = 1; @@ -2538,7 +2728,8 @@ def TFL_TanhOp: TFL_Op<"tanh", [ // zero_point = central_value // scale = 1. / (central_value - min_value) FixedResultScale>, - FixedResultScale>]> { + FixedResultScale>, + TFL_GpuTargetOp]> { let summary = "Hyperbolic tangent operator"; let description = [{ @@ -2548,6 +2739,17 @@ def TFL_TanhOp: TFL_Op<"tanh", [ let arguments = (ins TFL_TensorOf<[F32, I16, I8, QI8, QUI8, QI16, QUI16, TFL_Uint8]>:$x); let results = (outs TFL_TensorOf<[F32, I16, I8, QI8, QUI8, QI16, QUI16, TFL_Uint8]>:$y); + + // This builder doesn't work with quantized type, so it can only be used by + // non-quantization tablegen patterns. 
Currently, it is used by the + // elementwise-move reordering pattern in the optimize_patterns.td + let builders = [OpBuilder< + "OpBuilder &, OperationState &state, Value input", + [{ + state.addOperands({input}); + state.addTypes(input.getType()); + }]> + ]; } def TFL_TileOp: TFL_Op<"tile", [NoSideEffect, SameOperandsAndResultsScale, @@ -2596,9 +2798,9 @@ def TFL_TopKV2Op: TFL_Op<"topk_v2", [NoSideEffect, TFL_OperandHasRank<1,0>, TFL_TensorOf<[F32, I8, I32, I64, TFL_Uint8, QI8, QUI8]>:$values, TFL_I32Tensor:$indices); - let builders = [OpBuilder<"Builder *builder, OperationState &result, " + let builders = [OpBuilder<"OpBuilder &builder, OperationState &result, " "Value input, Value k", - [{ BuildTopKOp(builder, result, input, k); }]>]; + [{ BuildTopKOp(&builder, result, input, k); }]>]; let hasOptions = 1; } @@ -2687,7 +2889,10 @@ def TFL_BatchToSpaceNdOp: TFL_Op<"batch_to_space_nd", [ NoSideEffect, SameOperandsAndResultsScale, PredOpTrait<"input and output must have same element type", - TCresVTEtIsSameAsOp<0, 0>> + TFL_TCresVTEtIsSameAsOp<0, 0>>, + TFL_OperandHasRankRange<0, 3, 4>, + TFL_OperandHasRank<1, 1>, + TFL_OperandHasRank<2, 2> ]> { let summary = "BatchToSpaceNd operator"; @@ -2696,13 +2901,13 @@ def TFL_BatchToSpaceNdOp: TFL_Op<"batch_to_space_nd", [ }]; let arguments = (ins - TFL_TensorOf<[F32, I8, I32, I64, QI8, QUI8]>:$input, + TFL_TensorOf<[F32, I8, I32, I64, UI8, QI8, QUI8]>:$input, TFL_TensorOf<[I32]>:$block_shape, TFL_TensorOf<[I32]>:$indices ); let results = (outs - TFL_TensorOf<[F32, I16, I32, I64, QI8, QUI8]>:$output + TFL_TensorOf<[F32, I16, I32, I64, UI8, QI8, QUI8]>:$output ); } @@ -2733,7 +2938,8 @@ def TFL_SpaceToDepthOp: TFL_Op<"space_to_depth", [ NoSideEffect, SameOperandsAndResultsScale, PredOpTrait<"input and output must have same element type", - TCresVTEtIsSameAsOp<0, 0>> + TCresVTEtIsSameAsOp<0, 0>>, + TFL_GpuTargetOp ]> { let summary = "SpaceToDepth operator"; @@ -2760,7 +2966,8 @@ def TFL_DepthToSpaceOp: TFL_Op<"depth_to_space", [ NoSideEffect, SameOperandsAndResultsScale, PredOpTrait<"input and output must have same element type", - TFL_TCresVTEtIsSameAsOp<0, 0>> + TFL_TCresVTEtIsSameAsOp<0, 0>>, + TFL_OperandHasRankAtMost<0, 4> ]> { let summary = "DepthToSpace operator"; @@ -2774,12 +2981,12 @@ def TFL_DepthToSpaceOp: TFL_Op<"depth_to_space", [ }]; let arguments = (ins - TFL_TensorOf<[F32, I8, I32, I64, TFL_Uint8, TFL_Quint8, QUI8]>:$input, - I32Attr:$block_size + TFL_TensorOf<[F32, I8, I32, I64, TFL_Quint8, TFL_Uint8, UI8, QI8, QUI8]>:$input, + Confined:$block_size ); let results = (outs - TFL_TensorOf<[F32, I8, I32, I64, TFL_Uint8, TFL_Quint8, QUI8]>:$output + TFL_TensorOf<[F32, I8, I32, I64, TFL_Quint8, TFL_Uint8, UI8, QI8, QUI8]>:$output ); let hasOptions = 1; @@ -2872,7 +3079,8 @@ def TFL_ResizeNearestNeighborOp : TFL_Op<"resize_nearest_neighbor", let arguments = (ins TFL_TensorOf<[F32, I8, TFL_Uint8, QUI8, QI8]>:$input, TFL_TensorOf<[I32]>:$size, - BoolAttr:$align_corners + BoolAttr:$align_corners, + DefaultValuedAttr:$half_pixel_centers ); let results = (outs @@ -2923,7 +3131,7 @@ def TFL_StridedSliceOp: TFL_Op<"strided_slice", [ NoSideEffect, PredOpTrait<"input and output must have same element type", - TFL_TCresVTEtIsSameAsOp<0, 0>>, + TFL_TCresVTEtIsSameAsOp<0, 0>>, SameOperandsAndResultsScale, TFL_GpuTargetOp ]> { @@ -3032,6 +3240,8 @@ in the unique output `y`. In other words: return getResult(1).getType().cast().getElementType(). cast().getWidth() > 32 ? 
tflite::TensorType_INT64 : tflite::TensorType_INT32; + }], [{ + TypeAttr::get(getResult(1).getType().cast().getElementType()) }]>; let hasOptions = 1; @@ -3048,9 +3258,9 @@ def TFL_DequantizeOp: TFL_Op<"dequantize", [NoQuantizableResult]> { quantization parameters. }]; - let arguments = (ins AnyTensor:$input); + let arguments = (ins TFL_TensorOf<[QI8, QUI8, QI16, F16]>:$input); - let results = (outs AnyTensor:$output); + let results = (outs TFL_FpTensor:$output); } def TFL_FakeQuantOp : TFL_Op<"fake_quant", [NoSideEffect]> { @@ -3062,17 +3272,17 @@ def TFL_FakeQuantOp : TFL_Op<"fake_quant", [NoSideEffect]> { }]; let arguments = ( - ins AnyTensor:$input, + ins TFL_FpTensor:$input, // The expected [min, max] range of values. F32Attr:$min, F32Attr:$max, // The bitwidth of the quantization; between 2 and 16, inclusive. - I32Attr:$num_bits, + Confined, IntMaxValue<16>]>:$num_bits, // Quantization range starts from 0 or 1; starts from 1 if true. - BoolAttr:$narrow_range); + Confined:$narrow_range); - let results = (outs AnyTensor:$output); + let results = (outs TFL_FpTensor:$output); let hasCanonicalizer = 0b1; @@ -3094,10 +3304,10 @@ def TFL_QConstOp : Op:$output); let builders = [OpBuilder< - "Builder *, OperationState &state, TypeAttr qtype, Attribute value", + "OpBuilder &, OperationState &state, TypeAttr qtype, Attribute value", [{ state.addAttribute("qtype", qtype); state.addAttribute("value", value); @@ -3119,19 +3329,21 @@ def TFL_SparseQConstOp : Op ]; } @@ -3153,18 +3365,20 @@ def TFL_QuantizeOp: TFL_Op<"quantize", [ let results = (outs AnyTensor:$output); } -def TFL_DensifyOp: TFL_Op<"densify", [NoSideEffect, - SameOperandsAndResultType, - NoQuantizableResult]> { +def TFL_DensifyOp: TFL_Op<"densify", [ + NoSideEffect, + PredOpTrait<"input and output must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 0>>, + NoQuantizableResult]> { let summary = "Densify operator"; let description = [{ Converts sparse tensor to dense format. }]; - let arguments = (ins AnyTensor:$input); + let arguments = (ins TFL_TensorOf<[F32, I8]>:$input); - let results = (outs AnyTensor:$output); + let results = (outs TFL_TensorOf<[F32, I8]>:$output); } //===----------------------------------------------------------------------===// @@ -3227,16 +3441,16 @@ def TFL_BasicLSTMOp : TFL_Op<"basic_lstm", [NoSideEffect, }]; let arguments = ( - ins TFL_TensorOf<[F32, I8, QI8, QUI8, QI16, QUI16]>:$data_input, - TFL_TensorOf<[F32, I8, QI8, QUI8, QI16, QUI16]>:$prev_activ_input, - TFL_TensorOf<[F32, I8, QI8, QUI8, QI16, QUI16]>:$weights_input, - TFL_TensorOf<[F32, QI32, QUI32]>:$biases_input, - TFL_TensorOf<[F32, I8, QI8, QUI8, QI16, QUI16]>:$prev_state_input, + ins TFL_TensorOf<[F32, QUI8]>:$data_input, + TFL_TensorOf<[F32, QUI8]>:$prev_activ_input, + TFL_TensorOf<[F32, QUI8]>:$weights_input, + TFL_TensorOf<[F32, QI32]>:$biases_input, + TFL_TensorOf<[F32, QI16]>:$prev_state_input, // Attributes DefaultValuedAttr:$fused_activation_function, - DefaultValuedAttr:$cell_clip, - DefaultValuedAttr:$proj_clip, + Confined, [TFL_FloatNonNegative]>:$cell_clip, + Confined, [TFL_FloatNonNegative]>:$proj_clip, // Since this op is the BASIC kernel only, constrain it. 
Confined< DefaultValuedAttr, @@ -3245,10 +3459,10 @@ def TFL_BasicLSTMOp : TFL_Op<"basic_lstm", [NoSideEffect, let hasOptions = 1; - let results = (outs TFL_2DTensorOf<[F32, I8, QI8, QUI8, QI16, QUI16]>:$activ_output, - TFL_2DTensorOf<[F32, I8, QI8, QUI8, QI16, QUI16]>:$state_output, - TFL_2DTensorOf<[F32, I8, QI8, QUI8, QI16, QUI16]>:$concat_temp, - TFL_2DTensorOf<[F32, I8, QI8, QUI8, QI16, QUI16]>:$activ_temp); + let results = (outs TFL_2DTensorOf<[F32, QUI8]>:$activ_output, + TFL_2DTensorOf<[F32, QUI16]>:$state_output, + TFL_2DTensorOf<[F32, QUI8]>:$concat_temp, + TFL_2DTensorOf<[F32, QUI16]>:$activ_temp); } // This is the FULL kernel type LSTM op. @@ -3478,6 +3692,41 @@ def TFL_BidirectionalSequenceLSTMOp : BidiLstmOptionalPeepholeWeightConstraint, BidiLstmProjectionWeightBiasConstraint, LstmResultConstraint, + TFL_OperandHasRank<0, 3>, // input + TFL_OperandHasRank<1, 2>, // fw_input_to_input_weights + TFL_OperandHasRank<2, 2>, // fw_input_to_forget_weights + TFL_OperandHasRank<3, 2>, // fw_input_to_cell_weights + TFL_OperandHasRank<4, 2>, // fw_input_to_output_weights + TFL_OperandHasRank<5, 2>, // fw_recurrent_to_input_weights + TFL_OperandHasRank<6, 2>, // fw_recurrent_to_forget_weights + TFL_OperandHasRank<7, 2>, // fw_recurrent_to_cell_weights + TFL_OperandHasRank<8, 2>, // fw_recurrent_to_output_weights + TFL_OperandHasRank<9, 1>, // fw_cell_to_input_weights + TFL_OperandHasRank<10, 1>, // fw_cell_to_forget_weights + TFL_OperandHasRank<11, 1>, // fw_cell_to_output_weights + TFL_OperandHasRank<12, 1>, // fw_input_gate_bias + TFL_OperandHasRank<13, 1>, // fw_forget_gate_bias + TFL_OperandHasRank<14, 1>, // fw_cell_bias + TFL_OperandHasRank<15, 1>, // fw_output_gate_bias + TFL_OperandHasRank<16, 2>, // fw_projection_weights + TFL_OperandHasRank<17, 1>, // fw_projection_bias + TFL_OperandHasRank<18, 2>, // bw_input_to_input_weights + TFL_OperandHasRank<19, 2>, // bw_input_to_forget_weights + TFL_OperandHasRank<20, 2>, // bw_input_to_cell_weights + TFL_OperandHasRank<21, 2>, // bw_input_to_output_weights + TFL_OperandHasRank<22, 2>, // bw_recurrent_to_input_weights + TFL_OperandHasRank<23, 2>, // bw_recurrent_to_forget_weights + TFL_OperandHasRank<24, 2>, // bw_recurrent_to_cell_weights + TFL_OperandHasRank<25, 2>, // bw_recurrent_to_output_weights + TFL_OperandHasRank<26, 1>, // bw_cell_to_input_weights + TFL_OperandHasRank<27, 1>, // bw_cell_to_forget_weights + TFL_OperandHasRank<28, 1>, // bw_cell_to_output_weights + TFL_OperandHasRank<29, 1>, // bw_input_gate_bias + TFL_OperandHasRank<30, 1>, // bw_forget_gate_bias + TFL_OperandHasRank<31, 1>, // bw_cell_bias + TFL_OperandHasRank<32, 1>, // bw_output_gate_bias + TFL_OperandHasRank<33, 2>, // bw_projection_weights + TFL_OperandHasRank<34, 1>, // bw_projection_bias TFL_StatefulOp]> { let summary = "Bidirectional sequence lstm operator"; @@ -3571,8 +3820,8 @@ def TFL_BidirectionalSequenceLSTMOp : // Attributes TFL_AFAttr:$fused_activation_function, - DefaultValuedAttr:$cell_clip, - DefaultValuedAttr:$proj_clip, + Confined, [TFL_FloatNonNegative]>:$cell_clip, + Confined, [TFL_FloatNonNegative]>:$proj_clip, BoolAttr:$merge_outputs, BoolAttr:$time_major ); @@ -3682,7 +3931,7 @@ def TFL_NumericVerifyOp : Op:$input, + TFL_TensorOf<[QI8, QUI8, QI16, F16, TFL_Quint8]>:$input, TFL_TensorOf<[F32]>:$ref, // Attributes @@ -3802,4 +4051,27 @@ def TFL_WhileOp : Op { + let summary = "Custom op"; + + let description = [{ + A generic op for any TFLite custom operation. + + input: A list of inputs in the original op. 
+ custom_code: A string used to identify which exactly this op is, which + corresponds to operator_codes.custom_code in the flatbuffer. + custom_option: a holder to save the op attributes in bytes fashion. + output: A list of outputs in the original op. + }]; + + let arguments = (ins + Variadic>:$input, + StrAttr:$custom_code, + OpaqueBytesAttr:$custom_option + ); + let results = (outs Variadic:$output); + + let verifier = [{ return Verify(*this); }]; +} + #endif // TFL_OPS diff --git a/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.cc b/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.cc index c338b723a4a..51fcbb97360 100644 --- a/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.cc +++ b/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.cc @@ -146,6 +146,10 @@ Status ConvertSavedModelToTFLiteFlatBuffer( saved_model_exported_names.begin(), saved_model_exported_names.end()); absl::Span exported_names(exported_names_in_vector); + if (exported_names.size() != 1) { + return errors::Unimplemented("Only support a single exported name."); + } + TF_ASSIGN_OR_RETURN(auto module, ImportSavedModel(model_flags.saved_model_dir(), model_flags.saved_model_version(), tags, diff --git a/tensorflow/compiler/mlir/lite/quantization/BUILD b/tensorflow/compiler/mlir/lite/quantization/BUILD index a63a1e4b1e5..23a65a88186 100644 --- a/tensorflow/compiler/mlir/lite/quantization/BUILD +++ b/tensorflow/compiler/mlir/lite/quantization/BUILD @@ -1,4 +1,4 @@ -load("//tensorflow:tensorflow.bzl", "tf_native_cc_binary") +load("//tensorflow:tensorflow.bzl", "tf_cc_test", "tf_native_cc_binary") load( "//tensorflow/core/platform:build_config.bzl", "tf_proto_library", @@ -115,11 +115,22 @@ tf_native_cc_binary( ], ) +cc_library( + name = "numerical_utils", + srcs = ["numerical_utils.cc"], + hdrs = ["numerical_utils.h"], + deps = [ + "@com_google_absl//absl/types:optional", + ], +) + cc_library( name = "device_target", srcs = ["device_target.cc"], hdrs = ["device_target.h"], deps = [ + ":numerical_utils", + "@com_google_absl//absl/types:optional", "@llvm-project//llvm:support", "@llvm-project//mlir:IR", "@llvm-project//mlir:QuantOps", @@ -142,3 +153,13 @@ cc_library( "@llvm-project//mlir:Support", ], ) + +tf_cc_test( + name = "numerical_utils_test", + srcs = ["numerical_utils_test.cc"], + deps = [ + ":numerical_utils", + "@com_google_absl//absl/types:optional", + "@com_google_googletest//:gtest_main", + ], +) diff --git a/tensorflow/compiler/mlir/lite/quantization/device_target.cc b/tensorflow/compiler/mlir/lite/quantization/device_target.cc index b1d72017657..6b5c894b7f5 100644 --- a/tensorflow/compiler/mlir/lite/quantization/device_target.cc +++ b/tensorflow/compiler/mlir/lite/quantization/device_target.cc @@ -15,17 +15,24 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/lite/quantization/device_target.h" +#include + +#include "absl/types/optional.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/raw_ostream.h" #include "mlir/Dialect/Quant/QuantOps.h" // from @llvm-project #include "mlir/Dialect/Quant/QuantTypes.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/quantization/numerical_utils.h" namespace mlir { namespace quant { constexpr int k8Bits = 8; +constexpr int k32Bits = 32; constexpr unsigned kSigned = quant::QuantizationFlags::Signed; DeviceTarget::DeviceTarget(MLIRContext* ctx) : ctx_(ctx) { @@ -33,49 +40,141 @@ DeviceTarget::DeviceTarget(MLIRContext* ctx) : ctx_(ctx) { i8_ = IntegerType::get(k8Bits, ctx_); i8_min_ = QuantizedType::getDefaultMinimumForInteger(kSigned, k8Bits); i8_max_ = QuantizedType::getDefaultMaximumForInteger(kSigned, k8Bits); + i32_ = IntegerType::get(k32Bits, ctx_); + i32_min_ = QuantizedType::getDefaultMinimumForInteger(kSigned, k32Bits); + i32_max_ = QuantizedType::getDefaultMaximumForInteger(kSigned, k32Bits); any_ = AnyQuantizedType(); qi8_ = AnyQuantizedType::get(kSigned, i8_, f32_, i8_min_, i8_max_); qi8n_ = AnyQuantizedType::get(kSigned, i8_, f32_, i8_min_ + 1, i8_max_); + qi32_ = AnyQuantizedType::get(kSigned, i32_, f32_, i32_min_, i32_max_); assert(qi8n_ == qi8n_); } -Optional DeviceTarget::Get(QuantizeRegionOp op) const { - auto kernel_specs_it = specs_.find(op.logical_kernel()); +Optional DeviceTarget::GetKernelSpec( + llvm::StringRef kernel, const KernelSpecs::Signature& signature) const { + auto kernel_specs_it = specs_.find(kernel); if (kernel_specs_it == specs_.end()) return llvm::None; - - KernelSpecs::Signature signature; - signature.reserve(op.input_specs().size() + op.output_specs().size()); - AppendToSignature(op.input_specs(), &signature); - AppendToSignature(op.output_specs(), &signature); return kernel_specs_it->getValue().Find(signature); } +ScaleDecomposeFn DeviceTarget::GetDecomposeFn(QuantizeRegionOp op) const { + auto kernel_specs_it = specs_.find(op.logical_kernel()); + if (kernel_specs_it == specs_.end()) return ScaleDecomposeFn(nullptr); + return kernel_specs_it->second.GetDecomposeFn(); +} + +void DeviceTarget::AppendToSignature(Type spec, + KernelSpecs::Signature* signature) { + if (auto quant = spec.dyn_cast_or_null()) { + signature->push_back(AnyQuantizedType::get( + quant.getFlags(), quant.getStorageType(), quant.getExpressedType(), + quant.getStorageTypeMin(), quant.getStorageTypeMax())); + } else if (auto any = spec.dyn_cast_or_null()) { + signature->push_back(any); + } else { // float + signature->push_back(AnyQuantizedType()); + } +} + LogicalResult DeviceTarget::RegisterKernel( llvm::StringRef kernel, const KernelSpecs::Signature& signature, - const ScaleFn& fn) { + const ScaleFn& fn, const ScaleDecomposeFn& dfn) { return specs_[kernel].Add(signature, {ScaleConstraintType::CustomScale, fn}); } +namespace ph = std::placeholders; + LogicalResult DeviceTarget::RegisterKernel( llvm::StringRef kernel, const KernelSpecs::Signature& signature, const ScaleConstraintType constraint) { - return specs_[kernel].Add(signature, {constraint, {}}); + if (failed(specs_[kernel].Add(signature, {constraint, {}}))) return failure(); + switch (constraint) { + case ScaleConstraintType::OutputInputSameScale: + 
specs_[kernel].WithImpl(std::bind(&DeviceTarget::DecomposeSameScale, + ph::_1, ph::_2, ph::_3, ph::_4)); + return success(); + default: + return failure(); + } } -void DeviceTarget::AppendToSignature(ArrayAttr specs_attr, - KernelSpecs::Signature* signature) const { - for (auto attr : specs_attr) { - Type spec = attr.cast().getValue(); - if (auto quant = spec.dyn_cast()) { - signature->push_back(AnyQuantizedType::get( - quant.getFlags(), quant.getStorageType(), quant.getExpressedType(), - quant.getStorageTypeMin(), quant.getStorageTypeMax())); - } else if (auto any = spec.dyn_cast()) { - signature->push_back(any); - } else { // float - signature->push_back({}); - } +LogicalResult DeviceTarget::DecomposeMultiplyAccumulateScale( + Operation* op, quant::QuantizedMultipliers* input_multipliers, + quant::QuantizedMultipliers* output_multipliers, + quant::QuantizedRanges* output_ranges) { + auto rop = llvm::dyn_cast(op); + if (!rop) return failure(); + + llvm::SmallVector input_specs, out_specs; + for (auto spec : rop.input_specs()) { + input_specs.push_back(spec.cast().getValue()); } + for (auto spec : rop.output_specs()) { + out_specs.push_back(spec.cast().getValue()); + } + + auto in_spec = input_specs[0].dyn_cast(); + // TODO(fengliuai): handles the PerAxis QuantizedType. + auto w_spec = input_specs[1].dyn_cast(); + auto b_spec = input_specs[2].dyn_cast(); + auto o_spec = out_specs[0].dyn_cast(); + if (!in_spec || !w_spec || !b_spec || !o_spec) return failure(); + + double scale_product = in_spec.getScale() * w_spec.getScale(); + if (fabs(scale_product - b_spec.getScale()) >= 1e-6) return failure(); + + // input multipliers + input_multipliers->append(3, kUnitQuantizedMultiplier); + + // output multipliers + double real_multiplier = scale_product / o_spec.getScale(); + output_multipliers->push_back(quant::QuantizeMultiplier(real_multiplier)); + + // output ranges + auto min = rop.getAttrOfType("min"); + auto max = rop.getAttrOfType("max"); + output_ranges->push_back(quant::CalculateQuantizedRange( + o_spec.getScale(), o_spec.getZeroPoint(), + (min ? absl::optional(min.getValueAsDouble()) : absl::nullopt), + (max ? absl::optional(max.getValueAsDouble()) : absl::nullopt), + o_spec.getStorageTypeMin(), o_spec.getStorageTypeMax())); + + return success(); +} + +LogicalResult DeviceTarget::DecomposeSameScale( + Operation* op, quant::QuantizedMultipliers* input_multipliers, + quant::QuantizedMultipliers* output_multipliers, + quant::QuantizedRanges* output_ranges) { + auto rop = llvm::dyn_cast(op); + if (!rop) return failure(); + + // input multipliers + for (int i = 0; i < op->getNumOperands(); ++i) { + input_multipliers->push_back(kUnitQuantizedMultiplier); + } + + // output multipliers + for (int i = 0; i < op->getNumResults(); ++i) { + output_multipliers->push_back(kUnitQuantizedMultiplier); + } + + auto o_spec = rop.output_specs()[0] + .cast() + .getValue() + .dyn_cast(); + if (!o_spec) return failure(); + + // output ranges + auto min = rop.getAttrOfType("min"); + auto max = rop.getAttrOfType("max"); + output_ranges->push_back(quant::CalculateQuantizedRange( + o_spec.getScale(), o_spec.getZeroPoint(), + (min ? absl::optional(min.getValueAsDouble()) : absl::nullopt), + (max ? 
absl::optional(max.getValueAsDouble()) : absl::nullopt), + o_spec.getStorageTypeMin(), o_spec.getStorageTypeMax())); + + return success(); } } // namespace quant diff --git a/tensorflow/compiler/mlir/lite/quantization/device_target.h b/tensorflow/compiler/mlir/lite/quantization/device_target.h index ee5f1fe7a4c..8ed43157df8 100644 --- a/tensorflow/compiler/mlir/lite/quantization/device_target.h +++ b/tensorflow/compiler/mlir/lite/quantization/device_target.h @@ -17,13 +17,13 @@ limitations under the License. #define TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_DEVICE_TARGET_H_ #include -#include #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/Hashing.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "mlir/Dialect/Quant/QuantOps.h" // from @llvm-project @@ -33,6 +33,7 @@ limitations under the License. #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/IR/Types.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/quantization/numerical_utils.h" namespace mlir { namespace quant { @@ -40,9 +41,17 @@ namespace quant { class QuantizeContext; using AdjacentOperations = llvm::SmallVectorImpl; +using QuantizedMultipliers = llvm::SmallVector; +using QuantizedRanges = llvm::SmallVector; using ScaleFn = std::function; +using ScaleDecomposeFn = + std::function; + +static const QuantizedMultiplier kUnitQuantizedMultiplier{1, 0}; + enum class ScaleConstraintType { OutputInputSameScale, OutputInputFreeScale, @@ -73,12 +82,25 @@ class KernelSpecs { } } + ScaleDecomposeFn GetDecomposeFn() const { return decompose_fn_; } + // Adds the kernel signature with the kernel specification. LogicalResult Add(const Signature& signature, const KernelSpec& spec) { if (all_signatures_.insert({signature, spec}).second) return success(); return failure(); } + KernelSpecs& WithSignature(const KernelSpecs::Signature& signature, + const ScaleFn& fn) { + Add(signature, {ScaleConstraintType::CustomScale, fn}); + return *this; + } + + KernelSpecs& WithImpl(const ScaleDecomposeFn& dfn) { + decompose_fn_ = dfn; + return *this; + } + private: // The signature is pattern match based. struct SignatureInfo : public llvm::DenseMapInfo { @@ -101,6 +123,10 @@ class KernelSpecs { // Maps the signature to the kernel spec. Note that the matching is // pattern match based. llvm::DenseMap all_signatures_; + + // A method to compute the effective multipliers. This is independent on the + // bits of the ports, thus all the signature shares the same here. + ScaleDecomposeFn decompose_fn_; }; class DeviceTarget { @@ -108,31 +134,51 @@ class DeviceTarget { explicit DeviceTarget(MLIRContext* ctx); // Retrieves the kernel spec for the quant region op. - Optional Get(quant::QuantizeRegionOp op) const; + Optional GetKernelSpec( + llvm::StringRef kernel, const KernelSpecs::Signature& signature) const; + + // Retrieves the scale decomposition function for the quant region op. + ScaleDecomposeFn GetDecomposeFn(quant::QuantizeRegionOp op) const; + + // converts specification to signature: + // - UniformedQuantizedType -> AnyQuantizedType + // - AnyQuantizedType (int) -> AnyQuantizedType + // - Float -> {} + static void AppendToSignature(Type spec, KernelSpecs::Signature* signature); protected: // Adds the kernel spec with the custom scale function for the kernel. 
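  // A sketch of a typical call, with an illustrative kernel name, a signature
  // built from the members below, and caller-supplied scale functions:
  //   RegisterKernel("generic.matmul", {qi8_, qi8_, qi32_}, scale_fn, decompose_fn);
  // The constraint-based overload is analogous, e.g. passing
  //   ScaleConstraintType::OutputInputSameScale
  // for reshape-like kernels whose output must reuse the input scale.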
LogicalResult RegisterKernel(llvm::StringRef kernel, const KernelSpecs::Signature& signature, - const ScaleFn& fn); + const ScaleFn& fn, const ScaleDecomposeFn& dfn); // Adds the kernel spec with the scale constraint type for the kernel. LogicalResult RegisterKernel(llvm::StringRef kernel, const KernelSpecs::Signature& signature, const ScaleConstraintType constraint); - // converts specification to signature: - // - UniformedQuantizedType -> AnyQuantizedType - // - AnyQuantizedType (int) -> AnyQuantizedType - // - Float -> {} - void AppendToSignature(ArrayAttr specs_attr, - KernelSpecs::Signature* signature) const; + // Adds the kernel with the name. Retrun an existing one if it has been + // added before. + KernelSpecs& RegisterKernel(llvm::StringRef kernel) { return specs_[kernel]; } + + // For "mulmat->add" type of kernels, convert the scales of all the ports to + // multipliers. + static LogicalResult DecomposeMultiplyAccumulateScale( + Operation* op, quant::QuantizedMultipliers* input_multipliers, + quant::QuantizedMultipliers* output_multipliers, + quant::QuantizedRanges* output_ranges); + + // For "reshape" type of kernels. + static LogicalResult DecomposeSameScale( + Operation* op, quant::QuantizedMultipliers* input_multipliers, + quant::QuantizedMultipliers* output_multipliers, + quant::QuantizedRanges* output_ranges); // A set of parameters are required to build the signatures. FloatType f32_; - IntegerType i8_; - int64_t i8_min_, i8_max_; - AnyQuantizedType any_, qi8_, qi8n_; + IntegerType i8_, i32_; + int64_t i8_min_, i8_max_, i32_min_, i32_max_; + AnyQuantizedType any_, qi8_, qi8n_, qi32_; private: // Maps the kernel names to all the available kernels. diff --git a/tensorflow/compiler/mlir/lite/quantization/import_quant_stats_pass.cc b/tensorflow/compiler/mlir/lite/quantization/import_quant_stats_pass.cc index 9d5aa167ff4..d924a3e82ac 100644 --- a/tensorflow/compiler/mlir/lite/quantization/import_quant_stats_pass.cc +++ b/tensorflow/compiler/mlir/lite/quantization/import_quant_stats_pass.cc @@ -33,7 +33,6 @@ limitations under the License. #include "mlir/IR/PatternMatch.h" // from @llvm-project #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project -#include "mlir/Support/Functional.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/quantization/quantization_info.pb.h" #include "tensorflow/compiler/mlir/lite/quantization/quantization_passes.h" diff --git a/tensorflow/compiler/mlir/lite/quantization/lite/BUILD b/tensorflow/compiler/mlir/lite/quantization/lite/BUILD index 1504f7d3a1b..b4fddceb580 100644 --- a/tensorflow/compiler/mlir/lite/quantization/lite/BUILD +++ b/tensorflow/compiler/mlir/lite/quantization/lite/BUILD @@ -72,5 +72,6 @@ tf_cc_binary( "//tensorflow/lite/schema:schema_fbs", "@com_google_absl//absl/strings", "@llvm-project//llvm:support", + "@llvm-project//mlir:AllPassesAndDialects", ], ) diff --git a/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.cc b/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.cc index 9b49757fd3f..a2e3c065113 100644 --- a/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.cc +++ b/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.cc @@ -30,6 +30,7 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/lite/utils/convert_type.h" #include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" #include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/lite/schema/schema_generated.h" namespace mlir { namespace lite { @@ -38,7 +39,9 @@ namespace lite { TfLiteStatus QuantizeModel( const tflite::ModelT& input_model, const tflite::TensorType& input_type, const tflite::TensorType& output_type, - const std::unordered_set& operator_names, bool fully_quantize, + const tflite::TensorType& inference_type, + const std::unordered_set& operator_names, + bool disable_per_channel, bool fully_quantize, flatbuffers::FlatBufferBuilder* builder, tflite::ErrorReporter* error_reporter) { // TODO(b/142502494): remove this restriction by improving the `emit_adaptor` @@ -72,15 +75,18 @@ TfLiteStatus QuantizeModel( // Apply quantization passes PassManager pm(module->getContext()); TFL::QuantizationSpecs quant_specs; - quant_specs.inference_type = tensorflow::DT_QINT8; + quant_specs.inference_type = tflite::TflTypeToTfType(inference_type); quant_specs.post_training_quantization = true; + quant_specs.disable_per_channel = disable_per_channel; bool emit_adaptor = false; auto input_tf_type = tflite::TflTypeToTfType(input_type); if (input_tf_type == tensorflow::DT_FLOAT) { emit_adaptor = true; - } else if (input_tf_type == tensorflow::DT_UINT8) { - quant_specs.inference_type = tensorflow::DT_QUINT8; + } else if (input_tf_type == tensorflow::DT_UINT8 || + input_tf_type == tensorflow::DT_INT8 || + input_tf_type == tensorflow::DT_INT16) { + quant_specs.inference_type = input_tf_type; } pm.addPass(TFL::CreatePrepareQuantizePass(quant_specs)); diff --git a/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.h b/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.h index 473e97e07df..d60df56b473 100644 --- a/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.h +++ b/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.h @@ -26,12 +26,15 @@ namespace mlir { namespace lite { // Quantize the `input_model` and write the result to a flatbuffer `builder`. -// The `input_type` and `output_type` can be float32/qint8/int8. +// The `input_type`, `output_type` and `inference_type` can be +// float32/qint8/int8/int16. // Return partially quantized model if `fully_quantize` is false. 
TfLiteStatus QuantizeModel( const tflite::ModelT& input_model, const tflite::TensorType& input_type, const tflite::TensorType& output_type, - const std::unordered_set& operator_names, bool fully_quantize, + const tflite::TensorType& inference_type, + const std::unordered_set& operator_names, + bool disable_per_channel, bool fully_quantize, flatbuffers::FlatBufferBuilder* builder, tflite::ErrorReporter* error_reporter); } // namespace lite diff --git a/tensorflow/compiler/mlir/lite/quantization/lite/tfl_quantizer.cc b/tensorflow/compiler/mlir/lite/quantization/lite/tfl_quantizer.cc index 7530cdf008f..5bd1b71e631 100644 --- a/tensorflow/compiler/mlir/lite/quantization/lite/tfl_quantizer.cc +++ b/tensorflow/compiler/mlir/lite/quantization/lite/tfl_quantizer.cc @@ -46,7 +46,9 @@ TfLiteStatus QuantizeAnnotatedModel(llvm::StringRef buffer, tflite::StderrReporter error_reporter; return mlir::lite::QuantizeModel( - *model, tflite::TensorType_INT8, tflite::TensorType_INT8, {}, + *model, tflite::TensorType_INT8, tflite::TensorType_INT8, + tflite::TensorType_INT8, {}, + /*disable_per_channel=*/false, /*fully_quantize=*/true, builder, &error_reporter); } diff --git a/tensorflow/compiler/mlir/lite/quantization/numerical_utils.cc b/tensorflow/compiler/mlir/lite/quantization/numerical_utils.cc new file mode 100644 index 00000000000..417013f5f84 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/quantization/numerical_utils.cc @@ -0,0 +1,82 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/compiler/mlir/lite/quantization/numerical_utils.h" + +#include + +#include +#include +#include + +#include "absl/types/optional.h" + +namespace mlir { +namespace quant { + +// This method is adopted from TFLite: +// ["tensorflow/lite/kernels/internal/quantization_util.cc"] +QuantizedMultiplier QuantizeMultiplier(double double_multiplier) { + if (double_multiplier < 1e-6) { + return {0, 0}; + } + + int32_t shift; + const double q = frexp(double_multiplier, &shift); + auto q_fixed = static_cast(round(q * (1ll << 31))); + assert(q_fixed <= (1ll << 31)); + if (q_fixed == (1ll << 31)) { + q_fixed /= 2; + ++shift; + } + assert(q_fixed <= std::numeric_limits::max()); + // A shift amount smaller than -31 would cause all bits to be shifted out + // and thus all results would be zero. We implement that instead with + // q_fixed==0, so as to avoid hitting issues with right-shift + // operations with shift amounts greater than 31. Note that this happens + // roughly when abs(double_multiplier) < 2^-31 and the present handling means + // that we're effectively flushing tiny double_multiplier's to zero. + // We could conceivably handle values in the range (roughly) [32, 63] + // as 'denormals' i.e. (shift==0, q_fixed < 2^30). In that point of view + // the present handling is just doing 'flush denormals to zero'. 
We could + // reconsider and actually generate nonzero denormals if a need arises. + if (shift < -31) { + shift = 0; + q_fixed = 0; + } + return {static_cast(q_fixed), shift}; +} + +QuantizedRange CalculateQuantizedRange(double scale, int32_t zero_point, + absl::optional rmin, + absl::optional rmax, + int32_t qmin, int32_t qmax) { + auto quantize = [scale, zero_point](float f) { + return zero_point + static_cast(std::round(f / scale)); + }; + + if (rmin.has_value() && rmax.has_value()) { + return {std::max(qmin, quantize(rmin.value())), + std::min(qmax, quantize(rmax.value()))}; + } else if (rmin.has_value()) { + return {std::max(qmin, quantize(rmin.value())), qmax}; + } else if (rmax.has_value()) { + return {qmin, std::min(qmax, quantize(rmax.value()))}; + } else { + return {qmin, qmax}; + } +} + +} // namespace quant +} // namespace mlir diff --git a/tensorflow/compiler/mlir/lite/quantization/numerical_utils.h b/tensorflow/compiler/mlir/lite/quantization/numerical_utils.h new file mode 100644 index 00000000000..9a818dbbe0e --- /dev/null +++ b/tensorflow/compiler/mlir/lite/quantization/numerical_utils.h @@ -0,0 +1,45 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_NUMERICAL_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_NUMERICAL_UTILS_H_ + +#include +#include + +#include "absl/types/optional.h" + +namespace mlir { +namespace quant { + +using QuantizedMultiplier = std::pair; +using QuantizedRange = std::pair; + +// Decompose double precision multiplier to integer multiplier and exponent. +// double_multiplier = int_multiplier * 2 ^ (-31 + exponent) +// int_multiplier will be range of (2^31, 2^30]. +QuantizedMultiplier QuantizeMultiplier(double double_multiplier); + +// Calculate the effective quantized value range for the scale, zero point. The +// range is the minimum range defined by [rmin, rmax] and [qmin, qmax]. +QuantizedRange CalculateQuantizedRange(double scale, int32_t zero_point, + absl::optional rmin, + absl::optional rmax, + int32_t qmin, int32_t qmax); + +} // namespace quant +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_NUMERICAL_UTILS_H_ diff --git a/tensorflow/compiler/mlir/lite/quantization/numerical_utils_test.cc b/tensorflow/compiler/mlir/lite/quantization/numerical_utils_test.cc new file mode 100644 index 00000000000..05b38a8ae0c --- /dev/null +++ b/tensorflow/compiler/mlir/lite/quantization/numerical_utils_test.cc @@ -0,0 +1,114 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/lite/quantization/numerical_utils.h" + +#include + +#include +#include +#include "absl/types/optional.h" + +namespace mlir { +namespace quant { + +namespace { + +double ComposeScale(const QuantizedMultiplier& input) { + return input.first * exp2(-31 + input.second); +} + +TEST(NumericalUtils, QuantizeMultiplier) { + // Decompose multiplier larger than 1. + ASSERT_FLOAT_EQ(ComposeScale(QuantizeMultiplier(1.0e6)), 1.0e6); + ASSERT_FLOAT_EQ(ComposeScale(QuantizeMultiplier(1.0e3)), 1.0e3); + ASSERT_FLOAT_EQ(ComposeScale(QuantizeMultiplier(10.)), 10.); + ASSERT_FLOAT_EQ(ComposeScale(QuantizeMultiplier(5.)), 5.); + ASSERT_FLOAT_EQ(ComposeScale(QuantizeMultiplier(2.)), 2.); + + // Decompose multiplier between 1.0 and 1e-6. + ASSERT_FLOAT_EQ(ComposeScale(QuantizeMultiplier(0.0)), 0.0); + ASSERT_FLOAT_EQ(ComposeScale(QuantizeMultiplier(1.0)), 1.0); + ASSERT_FLOAT_EQ(ComposeScale(QuantizeMultiplier(1.0e-1)), 1.0e-1); + ASSERT_FLOAT_EQ(ComposeScale(QuantizeMultiplier(1.0e-2)), 1.0e-2); + ASSERT_FLOAT_EQ(ComposeScale(QuantizeMultiplier(1.0e-3)), 1.0e-3); + ASSERT_FLOAT_EQ(ComposeScale(QuantizeMultiplier(1.0e-4)), 1.0e-4); + ASSERT_FLOAT_EQ(ComposeScale(QuantizeMultiplier(1.0e-5)), 1.0e-5); + ASSERT_FLOAT_EQ(ComposeScale(QuantizeMultiplier(1.0e-6)), 1.0e-6); + + // When scale is smaller than 1.0e-6, it is decomposed to {0, 0}. 
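+  // For reference, the identity being exercised is roughly
+  //   double_multiplier ~= int_multiplier * 2^(-31 + exponent),
+  // e.g. 2.0 decomposes to {1 << 30, 2}. Below the 1e-6 cutoff the result is
+  // {0, 0}, so composing it back gives 0.0: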
+ ASSERT_FLOAT_EQ(ComposeScale(QuantizeMultiplier(1.0e-7)), 0.0); + ASSERT_FLOAT_EQ(ComposeScale(QuantizeMultiplier(1.0e-8)), 0.0); +} + +TEST(NumericalUtils, ActivationRange) { + // zero point = 0 + auto a = + CalculateQuantizedRange(1e-6, 0, absl::nullopt, absl::nullopt, -128, 127); + ASSERT_EQ(a.first, -128); + ASSERT_EQ(a.second, 127); + + auto b = CalculateQuantizedRange(1e-6, 0, 0.0, absl::nullopt, -128, 127); + ASSERT_EQ(b.first, 0); + ASSERT_EQ(b.second, 127); + + auto c = CalculateQuantizedRange(1e-6, 0, -1.0, 1.0, -128, 127); + ASSERT_EQ(c.first, -128); + ASSERT_EQ(c.second, 127); + + auto d = CalculateQuantizedRange(1e-6, 0, 0.0, 6.0, -128, 127); + ASSERT_EQ(d.first, 0); + ASSERT_EQ(d.second, 127); + + // zero point = 100 + auto e = CalculateQuantizedRange(1e-6, 100, absl::nullopt, absl::nullopt, + -128, 127); + ASSERT_EQ(e.first, -128); + ASSERT_EQ(e.second, 127); + + auto f = CalculateQuantizedRange(1e-6, 100, 0.0, absl::nullopt, -128, 127); + ASSERT_EQ(f.first, 100); + ASSERT_EQ(f.second, 127); + + auto g = CalculateQuantizedRange(1e-6, 100, -1.0, 1.0, -128, 127); + ASSERT_EQ(g.first, -128); + ASSERT_EQ(g.second, 127); + + auto h = CalculateQuantizedRange(1e-6, 100, 0.0, 6.0, -128, 127); + ASSERT_EQ(h.first, 100); + ASSERT_EQ(h.second, 127); + + // zero point = -100 + auto i = CalculateQuantizedRange(1e-6, -100, absl::nullopt, absl::nullopt, + -128, 127); + ASSERT_EQ(i.first, -128); + ASSERT_EQ(i.second, 127); + + auto j = CalculateQuantizedRange(1e-6, -100, 0.0, absl::nullopt, -128, 127); + ASSERT_EQ(j.first, -100); + ASSERT_EQ(j.second, 127); + + auto k = CalculateQuantizedRange(1e-6, -100, -1.0, 1.0, -128, 127); + ASSERT_EQ(k.first, -128); + ASSERT_EQ(k.second, 127); + + auto l = CalculateQuantizedRange(1e-6, -100, 0.0, 6.0, -128, 127); + ASSERT_EQ(l.first, -100); + ASSERT_EQ(l.second, 127); +} + +} // namespace +} // namespace quant +} // namespace mlir diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization_config.h b/tensorflow/compiler/mlir/lite/quantization/quantization_config.h index 5b1c73e7887..2ffba579548 100644 --- a/tensorflow/compiler/mlir/lite/quantization/quantization_config.h +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_config.h @@ -46,6 +46,12 @@ struct QuantizationSpecs { // post-training quantization. We need to deprecate the `weight_quantization`. bool post_training_quantization = false; + // When set to true, quantization will be done per-tensor. Currently, this + // option is only valid when the quantization parameters need to be created by + // scanning the constant content (post-training quantization or QAT without + // weight FakeQuant). + bool disable_per_channel = false; + // The node type when the model is exported. Currently this is limited to // DT_FLOAT, DT_HALF, DT_QINT8, and DT_QUINT8. When DT_HALF is used, the // `weight_quantization` flag needs to set to true. When DT_QUINT8 is used, @@ -84,7 +90,7 @@ struct QuantizationSpecs { bool RunWeightQuantization() const { return weight_quantization; } // Whether this inference type represents a signed storage type. - bool IsSignedInferenceType() { + bool IsSignedInferenceType() const { switch (inference_type) { case tensorflow::DT_QUINT8: case tensorflow::DT_QUINT16: @@ -96,7 +102,7 @@ struct QuantizationSpecs { // Gets the width of this quantization type. Returns 0 if it isn't a // quantization type. 
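  // (For example, 8 for tensorflow::DT_QINT8 and DT_QUINT8, and 0 for a
  // non-quantized type such as tensorflow::DT_FLOAT.)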
- int64_t GetQuantizationTypeWidth() { + int64_t GetQuantizationTypeWidth() const { switch (inference_type) { case tensorflow::DT_QINT8: case tensorflow::DT_QUINT8: diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization_context.cc b/tensorflow/compiler/mlir/lite/quantization/quantization_context.cc index 50e3771d467..bcfd06cf06c 100644 --- a/tensorflow/compiler/mlir/lite/quantization/quantization_context.cc +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_context.cc @@ -64,10 +64,23 @@ std::vector QuantizeContext::GetAllOps() { return all_ops; } +KernelSpecs::Signature QuantizeContext::GetSignature(QuantizeRegionOp op) { + KernelSpecs::Signature signature; + signature.reserve(op.input_specs().size() + op.output_specs().size()); + for (int i = 0; i < op.getNumOperands(); ++i) { + DeviceTarget::AppendToSignature(GetOperandParams(op, i), &signature); + } + for (int i = 0; i < op.getNumResults(); ++i) { + DeviceTarget::AppendToSignature(GetResultParams(op, i), &signature); + } + return signature; +} + LogicalResult QuantizeContext::Handle( quant::QuantizeRegionOp op, llvm::SmallVectorImpl *new_items, bool *changed) { - auto spec = target_spec_.Get(op); + auto signature = GetSignature(op); + auto spec = target_spec_.GetKernelSpec(op.logical_kernel(), signature); if (!spec.hasValue()) { op.emitWarning( "Couldn't find kernel from the registeration for quantization."); diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization_context.h b/tensorflow/compiler/mlir/lite/quantization/quantization_context.h index 0d460fd9a50..0c5137eb1a2 100644 --- a/tensorflow/compiler/mlir/lite/quantization/quantization_context.h +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_context.h @@ -107,6 +107,9 @@ class QuantizeContext { return states_manager_.GetOperandParams(op, index); } + // Return the signature of the op. + KernelSpecs::Signature GetSignature(QuantizeRegionOp op); + // A heuristic to get quantization parameters satisfies the same scale // constraints: // - If there are immutable states, diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization_utils.h b/tensorflow/compiler/mlir/lite/quantization/quantization_utils.h index 27ccc7d2b22..d4512509f6b 100644 --- a/tensorflow/compiler/mlir/lite/quantization/quantization_utils.h +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_utils.h @@ -22,6 +22,7 @@ limitations under the License. #include #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Twine.h" #include "llvm/Support/raw_ostream.h" #include "mlir/Dialect/Quant/FakeQuantSupport.h" // from @llvm-project #include "mlir/Dialect/Quant/QuantOps.h" // from @llvm-project @@ -35,6 +36,7 @@ limitations under the License. #include "mlir/IR/PatternMatch.h" // from @llvm-project #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/quantization/quantization_traits.h" namespace mlir { @@ -363,6 +365,54 @@ struct ConvertUnsignedToSigned : public OpRewritePattern { } }; +// Fold Extra Requantize ops if the preceding ops has free scale requirement. 
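+// Roughly, given a producer whose output scale is free to change:
+//   %0 = "some.op"(%arg) : ... -> tensor<2x!quant.uniform<...>>  // free scale
+//   %1 = "tfl.quantize"(%0) {qtype = ...}                        // trivial rescale
+// the pattern below removes the rescale and rebuilds the producer so that it
+// yields the requantized type directly. (The op names above are illustrative;
+// RQ is whatever requantize-like op the pattern is instantiated with.)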
+template +struct FoldTrivalRequantizeOp : public OpRewritePattern { + explicit FoldTrivalRequantizeOp(MLIRContext* context) + : OpRewritePattern(context, 1) {} + + LogicalResult matchAndRewrite(RQ op, + PatternRewriter& rewriter) const override { + Value pre_quantized = op.input(); + auto pre_quantized_type = + quant::QuantizedType::getQuantizedElementType(pre_quantized.getType()); + if (!pre_quantized_type) return failure(); + + Operation* def = pre_quantized.getDefiningOp(); + if (!def) return failure(); + if (def->hasTrait() || + def->hasTrait()) { + return failure(); + } + + op.emitWarning("Remove trivial `rescale` op. Please fix the source graph."); + + llvm::SmallVector new_output_types; + for (auto result : def->getResults()) { + result.getUsers().begin()->dump(); + op.dump(); + if (result.hasOneUse() && *result.getUsers().begin() == op) { + new_output_types.push_back(op.qtype()); + } else { + new_output_types.push_back(result.getType()); + } + } + + // Remove this rescale op. + rewriter.replaceOp(op, {pre_quantized}); + + // Replace the output scale of the preceding op. + rewriter.setInsertionPointAfter(def); + OperationState new_state(def->getLoc(), def->getName().getStringRef(), + def->getOperands(), new_output_types, + def->getAttrs()); + Operation* new_op = rewriter.createOperation(new_state); + + rewriter.replaceOp(def, new_op->getResults()); + return success(); + } +}; + // Given a quantized type `input`, magnifying its scales by the factor stored in // `factor`. If `input` isn't a quantized type or the `factor` doesn't match the // dimension size of `input` or isn't floating-point, nullptr will be returned. diff --git a/tensorflow/compiler/mlir/lite/sparsity/BUILD b/tensorflow/compiler/mlir/lite/sparsity/BUILD index 7ed29173d05..6d0fa671bd2 100644 --- a/tensorflow/compiler/mlir/lite/sparsity/BUILD +++ b/tensorflow/compiler/mlir/lite/sparsity/BUILD @@ -25,7 +25,7 @@ cc_library( deps = [ "//tensorflow/compiler/mlir/lite:common", "//tensorflow/compiler/mlir/lite:flatbuffer_translate_lib", - "//tensorflow/compiler/mlir/lite/quantization:quantization_config", + "//tensorflow/compiler/mlir/lite:tensorflow_lite_d2s", "//tensorflow/compiler/mlir/tensorflow:error_util", "//tensorflow/core:protos_all_cc", "//tensorflow/lite:framework", diff --git a/tensorflow/compiler/mlir/lite/sparsity/sparsify_model.cc b/tensorflow/compiler/mlir/lite/sparsity/sparsify_model.cc index a96c65cd450..8d9228e93b5 100644 --- a/tensorflow/compiler/mlir/lite/sparsity/sparsify_model.cc +++ b/tensorflow/compiler/mlir/lite/sparsity/sparsify_model.cc @@ -25,6 +25,7 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/lite/common/tfl_pass_config.h" #include "tensorflow/compiler/mlir/lite/flatbuffer_export.h" #include "tensorflow/compiler/mlir/lite/flatbuffer_import.h" +#include "tensorflow/compiler/mlir/lite/transforms/passes.h" #include "tensorflow/compiler/mlir/lite/utils/convert_type.h" #include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" #include "tensorflow/core/framework/types.pb.h" @@ -57,6 +58,7 @@ TfLiteStatus SparsifyModel(const tflite::ModelT& input_model, } PassManager pm(module->getContext()); + pm.addPass(TFL::CreateDenseToSparsePass()); if (failed(pm.run(module.get()))) { const std::string& err = statusHandler.ConsumeStatus().error_message(); diff --git a/tensorflow/compiler/mlir/lite/tests/BUILD b/tensorflow/compiler/mlir/lite/tests/BUILD index 0d612cec961..58d5afb5864 100644 --- a/tensorflow/compiler/mlir/lite/tests/BUILD +++ b/tensorflow/compiler/mlir/lite/tests/BUILD @@ -5,6 +5,7 @@ package(licenses = ["notice"]) glob_lit_tests( data = [":test_utilities"], driver = "@llvm-project//mlir:run_lit.sh", + exclude = ["load-quantization-recipe.mlir"], tags_override = { "legalize-tf.mlir": ["no_rocm"], "optimize.mlir": ["no_rocm"], diff --git a/tensorflow/compiler/mlir/lite/tests/canonicalize.mlir b/tensorflow/compiler/mlir/lite/tests/canonicalize.mlir index c94eb1bf087..5c69130c939 100644 --- a/tensorflow/compiler/mlir/lite/tests/canonicalize.mlir +++ b/tensorflow/compiler/mlir/lite/tests/canonicalize.mlir @@ -11,9 +11,9 @@ func @reshape_removeAdjacent(tensor<4x4x4xf32>) -> tensor<64xf32> { return %1 : tensor<64xf32> // CHECK-LABEL: func @reshape_removeAdjacent -// CHECK: %cst = constant dense<64> : tensor<1xi32> -// CHECK: %0 = "tfl.reshape"(%arg0, %cst) : (tensor<4x4x4xf32>, tensor<1xi32>) -> tensor<64xf32> -// CHECK: return +// CHECK: %[[CST:.*]] = constant dense<64> : tensor<1xi32> +// CHECK: %[[RESHAPE:.*]] = "tfl.reshape"(%arg0, %[[CST]]) : (tensor<4x4x4xf32>, tensor<1xi32>) -> tensor<64xf32> +// CHECK: return %[[RESHAPE]] } // Checks that tfl.reshape should be removed if its output has more than one @@ -29,11 +29,11 @@ func @reshape_removeAdjacentWithMultipleUse(tensor<4x4x4xf32>) -> tensor<64xf32> return %3 : tensor<64xf32> // CHECK-LABEL: func @reshape_removeAdjacentWithMultipleUse -// CHECK: %cst = constant dense<64> : tensor<1xi32> -// CHECK: %0 = "tfl.reshape"(%arg0, %cst) : (tensor<4x4x4xf32>, tensor<1xi32>) -> tensor<64xf32> -// CHECK: %1 = "tfl.reshape"(%arg0, %cst) : (tensor<4x4x4xf32>, tensor<1xi32>) -> tensor<64xf32> -// CHECK: %2 = addf %0, %1 -// CHECK: return %2 +// CHECK: %[[CST:.*]] = constant dense<64> : tensor<1xi32> +// CHECK: %[[RESHAPE_1:.*]] = "tfl.reshape"(%arg0, %[[CST]]) : (tensor<4x4x4xf32>, tensor<1xi32>) -> tensor<64xf32> +// CHECK: %[[RESHAPE_2:.*]] = "tfl.reshape"(%arg0, %[[CST]]) : (tensor<4x4x4xf32>, tensor<1xi32>) -> tensor<64xf32> +// CHECK: %[[RESULT:.*]] = addf %[[RESHAPE_1]], %[[RESHAPE_2]] +// CHECK: return %[[RESULT]] } // Checks that tfl.reshape should be kept if its output has more than one @@ -47,11 +47,11 @@ func @reshape_keepAdjacentWithMultipleUse(tensor<4x4x4xf32>) -> (tensor<16x4xf32 return %0, %1 : tensor<16x4xf32>, tensor<64xf32> // CHECK-LABEL: func @reshape_keepAdjacentWithMultipleUse -// CHECK: %cst = constant dense<[16, 4]> : tensor<2xi32> -// CHECK: %cst_0 = constant dense<64> : tensor<1xi32> -// CHECK: %0 = "tfl.reshape"(%arg0, %cst) : (tensor<4x4x4xf32>, tensor<2xi32>) -> tensor<16x4xf32> -// CHECK: %1 = "tfl.reshape"(%arg0, %cst_0) : (tensor<4x4x4xf32>, tensor<1xi32>) -> 
tensor<64xf32>
-// CHECK: return %0, %1
+// CHECK: %[[CST:.*]] = constant dense<[16, 4]> : tensor<2xi32>
+// CHECK: %[[CST_0:.*]] = constant dense<64> : tensor<1xi32>
+// CHECK: %[[RESHAPE_1:.*]] = "tfl.reshape"(%arg0, %[[CST]]) : (tensor<4x4x4xf32>, tensor<2xi32>) -> tensor<16x4xf32>
+// CHECK: %[[RESHAPE_2:.*]] = "tfl.reshape"(%arg0, %[[CST_0]]) : (tensor<4x4x4xf32>, tensor<1xi32>) -> tensor<64xf32>
+// CHECK: return %[[RESHAPE_1]], %[[RESHAPE_2]]
 }
 
 // Checks that tfl.reshape should be removed if its output type is the same
@@ -98,3 +98,16 @@ func @RemoveRedundantPack(%arg0: tensor<2x5xf32>) -> (tensor<2x5xf32>, tensor<5x
 // CHECK-NOT: pack
 // CHECK: return %arg0, %[[UNPACK]]#0 : tensor<2x5xf32>, tensor<5xf32>
 }
+
+// -----
+
+func @Int64SliceBeginSize(%arg0: tensor<4x128x32xf32>) -> tensor<1x128x32xf32> {
+  %0 = "tfl.pseudo_const"() {value = dense<0> : tensor<3xi64>} : () -> tensor<3xi64>
+  %1 = "tfl.pseudo_const"() {value = dense<[1, 128, 32]> : tensor<3xi64>} : () -> tensor<3xi64>
+  %2 = "tfl.slice"(%arg0, %0, %1) : (tensor<4x128x32xf32>, tensor<3xi64>, tensor<3xi64>) -> tensor<1x128x32xf32>
+  return %2 : tensor<1x128x32xf32>
+
+// CHECK: [[VAL_1:%.*]] = constant dense<0> : tensor<3xi32>
+// CHECK: [[VAL_2:%.*]] = constant dense<[1, 128, 32]> : tensor<3xi32>
+// CHECK: [[VAL_3:%.*]] = "tfl.slice"(%arg0, [[VAL_1]], [[VAL_2]]) : (tensor<4x128x32xf32>, tensor<3xi32>, tensor<3xi32>) -> tensor<1x128x32xf32>
+}
diff --git a/tensorflow/compiler/mlir/lite/tests/const-fold.mlir b/tensorflow/compiler/mlir/lite/tests/const-fold.mlir
index 4b8993e2b26..a8463d51c7e 100644
--- a/tensorflow/compiler/mlir/lite/tests/const-fold.mlir
+++ b/tensorflow/compiler/mlir/lite/tests/const-fold.mlir
@@ -8,13 +8,13 @@ func @add_float() -> (tensor<f32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>,
   %2 = constant dense< 3.5> : tensor<4xf32>
   %3 = constant dense<-0.5> : tensor<4xf32>
 
-  // CHECK: %cst = constant dense<3.500000e+00> : tensor<4xf32>
-  // CHECK: %cst_0 = constant dense<-5.000000e-01> : tensor<4xf32>
-  // CHECK: %cst_1 = constant dense<6.000000e+00> : tensor<f32>
-  // CHECK: %cst_2 = constant dense<4.000000e+00> : tensor<4xf32>
-  // CHECK: %cst_3 = constant dense<5.000000e+00> : tensor<4xf32>
-  // CHECK: %cst_4 = constant dense<3.000000e+00> : tensor<4xf32>
-  // CHECK: %0 = tfl.add %cst, %cst_0 {fused_activation_function = "SIGN_BIT"} : tensor<4xf32>
+  // CHECK: %[[CST:.*]] = constant dense<3.500000e+00> : tensor<4xf32>
+  // CHECK: %[[CST_0:.*]] = constant dense<-5.000000e-01> : tensor<4xf32>
+  // CHECK: %[[CST_1:.*]] = constant dense<6.000000e+00> : tensor<f32>
+  // CHECK: %[[CST_2:.*]] = constant dense<4.000000e+00> : tensor<4xf32>
+  // CHECK: %[[CST_3:.*]] = constant dense<5.000000e+00> : tensor<4xf32>
+  // CHECK: %[[CST_4:.*]] = constant dense<3.000000e+00> : tensor<4xf32>
+  // CHECK: %0 = tfl.add %[[CST]], %[[CST_0]] {fused_activation_function = "SIGN_BIT"} : tensor<4xf32>
 
   %5 = "tfl.add"(%0, %1) {fused_activation_function = "NONE"} : (tensor< f32>, tensor< f32>) -> tensor< f32>
   %6 = "tfl.add"(%0, %3) {fused_activation_function = "NONE"} : (tensor< f32>, tensor<4xf32>) -> tensor<4xf32>
@@ -33,10 +33,10 @@ func @add_int() -> (tensor<i32>, tensor<4xi32>, tensor<4xi32>, tensor<4xi32>) {
   %2 = constant dense< 4> : tensor<4xi32>
   %3 = constant dense<-2> : tensor<4xi32>
 
-  // CHECK: %cst = constant dense<9> : tensor<i32>
-  // CHECK: %cst_0 = constant dense<6> : tensor<4xi32>
-  // CHECK: %cst_1 = constant dense<5> : tensor<4xi32>
-  // CHECK: %cst_2 = constant dense<2> : tensor<4xi32>
+  // CHECK: %[[CST:.*]] = constant dense<9> : tensor<i32>
+  // CHECK: %[[CST_0:.*]] = constant dense<6> : tensor<4xi32>
+  // CHECK: %[[CST_1:.*]] = constant dense<5> : tensor<4xi32>
+  // CHECK: %[[CST_2:.*]] = constant dense<2> : tensor<4xi32>
 
   %5 = "tfl.add"(%0, %1) {fused_activation_function = "NONE"} : (tensor< i32>, tensor< i32>) -> tensor< i32>
   %6 = "tfl.add"(%0, %3) {fused_activation_function = "NONE"} : (tensor< i32>, tensor<4xi32>) -> tensor<4xi32>
@@ -54,10 +54,10 @@ func @sub_float() -> (tensor<f32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>)
   %2 = constant dense< 3.5> : tensor<4xf32>
   %3 = constant dense<-0.5> : tensor<4xf32>
 
-  // CHECK: %cst = constant dense<3.000000e+00> : tensor<f32>
-  // CHECK: %cst_0 = constant dense<5.000000e+00> : tensor<4xf32>
-  // CHECK: %cst_1 = constant dense<2.000000e+00> : tensor<4xf32>
-  // CHECK: %cst_2 = constant dense<4.000000e+00> : tensor<4xf32>
+  // CHECK: %[[CST:.*]] = constant dense<3.000000e+00> : tensor<f32>
+  // CHECK: %[[CST_0:.*]] = constant dense<5.000000e+00> : tensor<4xf32>
+  // CHECK: %[[CST_1:.*]] = constant dense<2.000000e+00> : tensor<4xf32>
+  // CHECK: %[[CST_2:.*]] = constant dense<4.000000e+00> : tensor<4xf32>
 
   %5 = "tfl.sub"(%0, %1) {fused_activation_function = "NONE"} : (tensor< f32>, tensor< f32>) -> tensor< f32>
   %6 = "tfl.sub"(%0, %3) {fused_activation_function = "NONE"} : (tensor< f32>, tensor<4xf32>) -> tensor<4xf32>
@@ -75,10 +75,10 @@ func @sub_int() -> (tensor<i32>, tensor<4xi32>, tensor<4xi32>, tensor<4xi32>) {
   %2 = constant dense< 4> : tensor<4xi32>
   %3 = constant dense<-2> : tensor<4xi32>
 
-  // CHECK: %cst = constant dense<7> : tensor<i32>
-  // CHECK: %cst_0 = constant dense<10> : tensor<4xi32>
-  // CHECK: %cst_1 = constant dense<3> : tensor<4xi32>
-  // CHECK: %cst_2 = constant dense<6> : tensor<4xi32>
+  // CHECK: %[[CST:.*]] = constant dense<7> : tensor<i32>
+  // CHECK: %[[CST_0:.*]] = constant dense<10> : tensor<4xi32>
+  // CHECK: %[[CST_1:.*]] = constant dense<3> : tensor<4xi32>
+  // CHECK: %[[CST_2:.*]] = constant dense<6> : tensor<4xi32>
 
   %5 = "tfl.sub"(%0, %1) {fused_activation_function = "NONE"} : (tensor< i32>, tensor< i32>) -> tensor< i32>
   %6 = "tfl.sub"(%0, %3) {fused_activation_function = "NONE"} : (tensor< i32>, tensor<4xi32>) -> tensor<4xi32>
@@ -96,10 +96,10 @@ func @mul_float() -> (tensor<f32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>)
   %2 = constant dense< 3.5> : tensor<4xf32>
   %3 = constant dense<-0.5> : tensor<4xf32>
 
-  // CHECK: %cst = constant dense<6.750000e+00> : tensor<f32>
-  // CHECK: %cst_0 = constant dense<-2.250000e+00> : tensor<4xf32>
-  // CHECK: %cst_1 = constant dense<5.250000e+00> : tensor<4xf32>
-  // CHECK: %cst_2 = constant dense<-1.750000e+00> : tensor<4xf32>
+  // CHECK: %[[CST:.*]] = constant dense<6.750000e+00> : tensor<f32>
+  // CHECK: %[[CST_0:.*]] = constant dense<-2.250000e+00> : tensor<4xf32>
+  // CHECK: %[[CST_1:.*]] = constant dense<5.250000e+00> : tensor<4xf32>
+  // CHECK: %[[CST_2:.*]] = constant dense<-1.750000e+00> : tensor<4xf32>
 
   %5 = "tfl.mul"(%0, %1) {fused_activation_function = "NONE"} : (tensor< f32>, tensor< f32>) -> tensor< f32>
   %6 = "tfl.mul"(%0, %3) {fused_activation_function = "NONE"} : (tensor< f32>, tensor<4xf32>) -> tensor<4xf32>
@@ -170,8 +170,8 @@ func @add_dense_splat_int() -> tensor<4xi32> {
   return %2 : tensor<4xi32>
 
-// CHECK: %cst = constant dense<[-5, 4, 47, 105]> : tensor<4xi32>
-// CHECK: return %cst
+// CHECK: %[[CST:.*]] = constant dense<[-5, 4, 47, 105]> : tensor<4xi32>
+// CHECK: return %[[CST]]
 }
 
 // CHECK-LABEL: @add_splat_dense_int
@@ -183,8 +183,8 @@ func @add_splat_dense_int() -> tensor<4xi32> {
   return %2 : tensor<4xi32>
 
-// CHECK: %cst =
constant dense<[-5, 4, 47, 105]> : tensor<4xi32> -// CHECK: return %cst +// CHECK: %[[CST:.*]] = constant dense<[-5, 4, 47, 105]> : tensor<4xi32> +// CHECK: return %[[CST]] } // CHECK-LABEL: @add_dense_dense_int_same_shape @@ -196,8 +196,8 @@ func @add_dense_dense_int_same_shape() -> tensor<4xi32> { return %2 : tensor<4xi32> -// CHECK: %cst = constant dense<[5, 22, -2, 98]> : tensor<4xi32> -// CHECK: return %cst +// CHECK: %[[CST:.*]] = constant dense<[5, 22, -2, 98]> : tensor<4xi32> +// CHECK: return %[[CST]] } // CHECK-LABEL: @add_dense_dense_int_trailing_dim @@ -212,10 +212,10 @@ func @add_dense_dense_int_trailing_dim() -> (tensor<2x2xi32>, tensor<2x2x2xi32>, return %0, %1, %2 : tensor<2x2xi32>, tensor<2x2x2xi32>, tensor<2x2x2xi32> -// CHECK: %cst = constant dense<{{\[\[}}11, 22], [13, 24]]> : tensor<2x2xi32> -// CHECK: %cst_0 = constant dense<{{\[\[\[}}2, 3], [5, 6]], {{\[\[}}4, 5], [7, 8]]]> : tensor<2x2x2xi32> -// CHECK: %cst_1 = constant dense<{{\[\[\[}}11, 21], [12, 22]], {{\[\[}}13, 23], [14, 24]]]> : tensor<2x2x2xi32> -// CHECK: return %cst, %cst_0, %cst_1 +// CHECK: %[[CST:.*]] = constant dense<{{\[\[}}11, 22], [13, 24]]> : tensor<2x2xi32> +// CHECK: %[[CST_0:.*]] = constant dense<{{\[\[\[}}2, 3], [5, 6]], {{\[\[}}4, 5], [7, 8]]]> : tensor<2x2x2xi32> +// CHECK: %[[CST_1:.*]] = constant dense<{{\[\[\[}}11, 21], [12, 22]], {{\[\[}}13, 23], [14, 24]]]> : tensor<2x2x2xi32> +// CHECK: return %[[CST]], %[[CST_0]], %[[CST_1]] } // CHECK-LABEL: @add_dense_dense_int_mixing_1_n @@ -226,8 +226,8 @@ func @add_dense_dense_int_mixing_1_n() -> tensor<2x2xi32> { %0 = "tfl.add"(%cst_0, %cst_1) {fused_activation_function = "NONE"} : (tensor<1x2xi32>, tensor<2x1xi32>) -> tensor<2x2xi32> return %0 : tensor<2x2xi32> -// CHECK: %cst = constant dense<{{\[\[}}4, 5], [5, 6]]> : tensor<2x2xi32> -// CHECK: return %cst +// CHECK: %[[CST:.*]] = constant dense<{{\[\[}}4, 5], [5, 6]]> : tensor<2x2xi32> +// CHECK: return %[[CST]] } // CHECK-LABEL: @add_dense_splat_float @@ -239,8 +239,8 @@ func @add_dense_splat_float() -> tensor<4xf32> { return %2 : tensor<4xf32> -// CHECK: %cst = constant dense<[-6.500000e+00, 2.000000e+00, 4.550000e+01, 1.075000e+01]> : tensor<4xf32> -// CHECK: return %cst +// CHECK: %[[CST:.*]] = constant dense<[-6.500000e+00, 2.000000e+00, 4.550000e+01, 1.075000e+01]> : tensor<4xf32> +// CHECK: return %[[CST]] } // CHECK-LABEL: @add_splat_dense_float @@ -252,8 +252,8 @@ func @add_splat_dense_float() -> tensor<4xf32> { return %2 : tensor<4xf32> -// CHECK: %cst = constant dense<[-6.500000e+00, 2.000000e+00, 4.550000e+01, 1.075000e+01]> : tensor<4xf32> -// CHECK: return %cst +// CHECK: %[[CST:.*]] = constant dense<[-6.500000e+00, 2.000000e+00, 4.550000e+01, 1.075000e+01]> : tensor<4xf32> +// CHECK: return %[[CST]] } // CHECK-LABEL: @add_dense_dense_float_same_shape @@ -265,8 +265,8 @@ func @add_dense_dense_float_same_shape() -> (tensor<4xf32>) { return %2 : tensor<4xf32> -// CHECK: %cst = constant dense<[-8.89999961, 1.000000e+00, 3.800000e+01, 9.800000e+01]> : tensor<4xf32> -// CHECK: return %cst +// CHECK: %[[CST:.*]] = constant dense<[-8.89999961, 1.000000e+00, 3.800000e+01, 9.800000e+01]> : tensor<4xf32> +// CHECK: return %[[CST]] } // CHECK-LABEL: @add_dense_dense_float_trailing_dim @@ -281,10 +281,10 @@ func @add_dense_dense_float_trailing_dim() -> (tensor<2x2xf32>, tensor<2x2x2xf32 return %0, %1, %2 : tensor<2x2xf32>, tensor<2x2x2xf32>, tensor<2x2x2xf32> -// CHECK: %cst = constant dense<{{\[\[}}-4.500000e+00, -2.500000e+00], [8.500000e+00, -8.500000e+00]]> : tensor<2x2xf32> -// CHECK: 
%cst_0 = constant dense<{{\[\[\[}}-4.500000e+00, 2.500000e+00], [9.500000e+00, -2.500000e+00]], {{\[\[}}-2.500000e+00, 4.500000e+00], [1.150000e+01, -5.000000e-01]]]> : tensor<2x2x2xf32> -// CHECK: %cst_1 = constant dense<{{\[\[\[}}2.000000e+00, -3.000000e+00], [3.000000e+00, -2.000000e+00]], {{\[\[}}4.000000e+00, -1.000000e+00], [5.000000e+00, 0.000000e+00]]]> : tensor<2x2x2xf32> -// CHECK: return %cst, %cst_0, %cst_1 +// CHECK: %[[CST:.*]] = constant dense<{{\[\[}}-4.500000e+00, -2.500000e+00], [8.500000e+00, -8.500000e+00]]> : tensor<2x2xf32> +// CHECK: %[[CST_0:.*]] = constant dense<{{\[\[\[}}-4.500000e+00, 2.500000e+00], [9.500000e+00, -2.500000e+00]], {{\[\[}}-2.500000e+00, 4.500000e+00], [1.150000e+01, -5.000000e-01]]]> : tensor<2x2x2xf32> +// CHECK: %[[CST_1:.*]] = constant dense<{{\[\[\[}}2.000000e+00, -3.000000e+00], [3.000000e+00, -2.000000e+00]], {{\[\[}}4.000000e+00, -1.000000e+00], [5.000000e+00, 0.000000e+00]]]> : tensor<2x2x2xf32> +// CHECK: return %[[CST]], %[[CST_0]], %[[CST_1]] } // CHECK-LABEL: @add_dense_dense_float_mixfng_1_n @@ -296,24 +296,24 @@ func @add_dense_dense_float_mixfng_1_n() -> tensor<2x2xf32> { return %0 : tensor<2x2xf32> -// CHECK: %cst = constant dense<{{\[\[}}-1.500000e+00, -5.500000e+00], [5.500000e+00, 1.500000e+00]]> : tensor<2x2xf32> -// CHECK: return %cst +// CHECK: %[[CST:.*]] = constant dense<{{\[\[}}-1.500000e+00, -5.500000e+00], [5.500000e+00, 1.500000e+00]]> : tensor<2x2xf32> +// CHECK: return %[[CST]] } // CHECK-LABEL: @rank func @rank() -> tensor<1xi32> { %cst = constant dense<[[1], [2]]> : tensor<2x1xi32> - // CHECK: [[cst:%.*]] = constant dense<2> : tensor<1xi32> - // CHECK: return [[cst]] + // CHECK: %[[CST:.*]] = constant dense<2> : tensor<1xi32> + // CHECK: return %[[CST]] %0 = "tfl.rank"(%cst) : (tensor<2x1xi32>) -> tensor<1xi32> return %0 : tensor<1xi32> } // CHECK-LABEL: @rank_input_known_rank func @rank_input_known_rank(%arg0 : tensor<2x1xi32>) -> tensor<1xi32> { - // CHECK: [[cst:%.*]] = constant dense<2> : tensor<1xi32> - // CHECK: return [[cst]] + // CHECK: %[[CST:.*]] = constant dense<2> : tensor<1xi32> + // CHECK: return %[[CST]] %0 = "tfl.rank"(%arg0) : (tensor<2x1xi32>) -> tensor<1xi32> return %0 : tensor<1xi32> } @@ -323,8 +323,8 @@ func @reshape() -> tensor<4xi32> { %input = constant dense<[[1, 2], [3, 4]]> : tensor<2x2xi32> %shape = constant dense<[4]> : tensor<1xi32> - // CHECK: [[cst:%.*]] = constant dense<[1, 2, 3, 4]> : tensor<4xi32> - // CHECK: return [[cst]] + // CHECK: %[[CST:.*]] = constant dense<[1, 2, 3, 4]> : tensor<4xi32> + // CHECK: return %[[CST]] %0 = "tfl.reshape"(%input, %shape) : (tensor<2x2xi32>, tensor<1xi32>) -> tensor<4xi32> return %0 : tensor<4xi32> } @@ -334,8 +334,8 @@ func @reshape_dynamic_output() -> tensor { %input = constant dense<[[1, 2], [3, 4]]> : tensor<2x2xi32> %shape = constant dense<[4]> : tensor<1xi32> - // CHECK: [[cst:%.*]] = "tfl.pseudo_const"() {value = dense<[1, 2, 3, 4]> : tensor<4xi32>} : () -> tensor - // CHECK: return [[cst]] + // CHECK: %[[CST:.*]] = "tfl.pseudo_const"() {value = dense<[1, 2, 3, 4]> : tensor<4xi32>} : () -> tensor + // CHECK: return %[[CST]] %0 = "tfl.reshape"(%input, %shape) : (tensor<2x2xi32>, tensor<1xi32>) -> tensor return %0 : tensor } @@ -343,8 +343,8 @@ func @reshape_dynamic_output() -> tensor { // CHECK-LABEL: @pseudo_const func @pseudo_const() -> tensor { - // CHECK: [[cst:%.*]] = constant dense<1> : tensor - // CHECK: return [[cst]] + // CHECK: %[[CST:.*]] = constant dense<1> : tensor + // CHECK: return %[[CST]] %0 = "tfl.pseudo_const"() {value = 
dense<1> : tensor} : () -> tensor return %0 : tensor } @@ -356,8 +356,8 @@ func @range_int() -> tensor { %cst_1 = constant dense<4> : tensor %cst_2 = constant dense<1> : tensor - // CHECK: [[cst:%.*]] = "tfl.pseudo_const"() {value = dense<[0, 1, 2, 3]> : tensor<4xi32>} : () -> tensor - // CHECK: return [[cst]] + // CHECK: %[[CST:.*]] = "tfl.pseudo_const"() {value = dense<[0, 1, 2, 3]> : tensor<4xi32>} : () -> tensor + // CHECK: return %[[CST]] %0 = "tfl.range"(%cst, %cst_1, %cst_2) : (tensor, tensor, tensor) -> tensor return %0 : tensor } @@ -368,8 +368,8 @@ func @range_float() -> tensor { %cst_1 = constant dense<4.0> : tensor %cst_2 = constant dense<1.0> : tensor - // CHECK: [[cst:%.*]] = "tfl.pseudo_const"() {value = dense<[0.000000e+00, 1.000000e+00, 2.000000e+00, 3.000000e+00]> : tensor<4xf32>} : () -> tensor - // CHECK: return [[cst]] + // CHECK: %[[CST:.*]] = "tfl.pseudo_const"() {value = dense<[0.000000e+00, 1.000000e+00, 2.000000e+00, 3.000000e+00]> : tensor<4xf32>} : () -> tensor + // CHECK: return %[[CST]] %0 = "tfl.range"(%cst, %cst_1, %cst_2) : (tensor, tensor, tensor) -> tensor return %0 : tensor } @@ -381,8 +381,8 @@ func @range_float_neg_delta() -> tensor { %cst_1 = constant dense<-4.0> : tensor %cst_2 = constant dense<-1.0> : tensor - // CHECK: [[cst:%.*]] = "tfl.pseudo_const"() {value = dense<[0.000000e+00, -1.000000e+00, -2.000000e+00, -3.000000e+00]> : tensor<4xf32>} : () -> tensor - // CHECK: return [[cst]] + // CHECK: %[[CST:.*]] = "tfl.pseudo_const"() {value = dense<[0.000000e+00, -1.000000e+00, -2.000000e+00, -3.000000e+00]> : tensor<4xf32>} : () -> tensor + // CHECK: return %[[CST]] %0 = "tfl.range"(%cst, %cst_1, %cst_2) : (tensor, tensor, tensor) -> tensor return %0 : tensor } @@ -393,8 +393,8 @@ func @range_float_nonzero_base() -> tensor { %cst_1 = constant dense<7.0> : tensor %cst_2 = constant dense<1.5> : tensor - // CHECK: [[cst:%.*]] = "tfl.pseudo_const"() {value = dense<[2.000000e+00, 3.500000e+00, 5.000000e+00, 6.500000e+00]> : tensor<4xf32>} : () -> tensor - // CHECK: return [[cst]] + // CHECK: %[[CST:.*]] = "tfl.pseudo_const"() {value = dense<[2.000000e+00, 3.500000e+00, 5.000000e+00, 6.500000e+00]> : tensor<4xf32>} : () -> tensor + // CHECK: return %[[CST]] %0 = "tfl.range"(%cst, %cst_1, %cst_2) : (tensor, tensor, tensor) -> tensor return %0 : tensor } @@ -414,8 +414,8 @@ func @transpose_1d() -> tensor<3xi32> { %cst = constant dense<[1, 2, 3]> : tensor<3xi32> %cst_perm = constant dense<0> : tensor<1xi32> - // CHECK: [[cst:%.*]] = constant dense<{{\[}}1, 2, 3]> : tensor<3xi32> - // CHECK: return [[cst]] + // CHECK: %[[CST:.*]] = constant dense<{{\[}}1, 2, 3]> : tensor<3xi32> + // CHECK: return %[[CST]] %0 = "tfl.transpose"(%cst, %cst_perm) : (tensor<3xi32>, tensor<1xi32>) -> tensor<3xi32> return %0 : tensor<3xi32> } @@ -425,8 +425,8 @@ func @transpose_dynamic() -> tensor { %cst = constant dense<[1, 2, 3]> : tensor<3xi32> %cst_perm = constant dense<0> : tensor<1xi32> - // CHECK: [[cst:%.*]] = "tfl.pseudo_const"() {value = dense<{{\[}}1, 2, 3]> : tensor<3xi32>} : () -> tensor - // CHECK: return [[cst]] + // CHECK: %[[CST:.*]] = "tfl.pseudo_const"() {value = dense<{{\[}}1, 2, 3]> : tensor<3xi32>} : () -> tensor + // CHECK: return %[[CST]] %0 = "tfl.transpose"(%cst, %cst_perm) : (tensor<3xi32>, tensor<1xi32>) -> tensor return %0 : tensor } @@ -436,8 +436,8 @@ func @transpose_2d() -> tensor<2x2xi32> { %cst = constant dense<[[0, 1], [2, 3]]> : tensor<2x2xi32> %cst_perm = constant dense<[1, 0]> : tensor<2xi32> - // CHECK: [[cst:%.*]] = constant dense<{{\[\[}}0, 
2], {{\[}}1, 3]]> : tensor<2x2xi32> - // CHECK: return [[cst]] + // CHECK: %[[CST:.*]] = constant dense<{{\[\[}}0, 2], {{\[}}1, 3]]> : tensor<2x2xi32> + // CHECK: return %[[CST]] %0 = "tfl.transpose"(%cst, %cst_perm) : (tensor<2x2xi32>, tensor<2xi32>) -> tensor<2x2xi32> return %0 : tensor<2x2xi32> } @@ -447,8 +447,8 @@ func @transpose_2d_identity() -> tensor<2x2xi32> { %cst = constant dense<[[0, 1], [2, 3]]> : tensor<2x2xi32> %cst_perm = constant dense<[0, 1]> : tensor<2xi32> - // CHECK: [[cst:%.*]] = constant dense<{{\[\[}}0, 1], {{\[}}2, 3]]> : tensor<2x2xi32> - // CHECK: return [[cst]] + // CHECK: %[[CST:.*]] = constant dense<{{\[\[}}0, 1], {{\[}}2, 3]]> : tensor<2x2xi32> + // CHECK: return %[[CST]] %0 = "tfl.transpose"(%cst, %cst_perm) : (tensor<2x2xi32>, tensor<2xi32>) -> tensor<2x2xi32> return %0 : tensor<2x2xi32> } @@ -460,8 +460,8 @@ func @transpose_3d() -> tensor<4x2x3xi32> { %cst = constant dense<[[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11]], [[12, 13, 14, 15], [16, 17, 18, 19], [20, 21, 22, 23]]]> : tensor<2x3x4xi32> %cst_perm = constant dense<[2, 0, 1]> : tensor<3xi32> - // CHECK: [[cst:%.*]] = constant dense<{{\[\[\[}}0, 4, 8], {{\[}}12, 16, 20]], {{\[\[}}1, 5, 9], {{\[}}13, 17, 21]], {{\[\[}}2, 6, 10], {{\[}}14, 18, 22]], {{\[\[}}3, 7, 11], {{\[}}15, 19, 23]]]> : tensor<4x2x3xi32> - // CHECK: return [[cst]] + // CHECK: %[[CST:.*]] = constant dense<{{\[\[\[}}0, 4, 8], {{\[}}12, 16, 20]], {{\[\[}}1, 5, 9], {{\[}}13, 17, 21]], {{\[\[}}2, 6, 10], {{\[}}14, 18, 22]], {{\[\[}}3, 7, 11], {{\[}}15, 19, 23]]]> : tensor<4x2x3xi32> + // CHECK: return %[[CST]] %0 = "tfl.transpose"(%cst, %cst_perm) : (tensor<2x3x4xi32>, tensor<3xi32>) -> tensor<4x2x3xi32> return %0 : tensor<4x2x3xi32> } @@ -473,8 +473,8 @@ func @ConstantFoldBinaryOpDynamicOutput() -> tensor { %87 = "tfl.sub"(%cst_0, %cst) {fused_activation_function = "NONE"} : (tensor, tensor) -> tensor return %87 : tensor - // CHECK: [[cst:%.*]] = "tfl.pseudo_const"() {value = dense<[-5, 0]> : tensor<2xi32>} : () -> tensor - // CHECK: return [[cst]] + // CHECK: %[[CST:.*]] = "tfl.pseudo_const"() {value = dense<[-5, 0]> : tensor<2xi32>} : () -> tensor + // CHECK: return %[[CST]] } // CHECK-LABEL: @add_dense_dense_int_same_shape_dynamic @@ -486,8 +486,8 @@ func @add_dense_dense_int_same_shape_dynamic() -> tensor { return %2 : tensor - // CHECK: [[cst:%.*]] = "tfl.pseudo_const"() {value = dense<[5, 22, -2, 98]> : tensor<4xi32>} : () -> tensor - // CHECK: return [[cst]] + // CHECK: %[[CST:.*]] = "tfl.pseudo_const"() {value = dense<[5, 22, -2, 98]> : tensor<4xi32>} : () -> tensor + // CHECK: return %[[CST]] } // CHECK-LABEL: @concat_2_tensors_1_empty @@ -497,8 +497,8 @@ func @concat_2_tensors_1_empty() -> tensor<2xi32> { %3 = "tfl.concatenation"(%1, %2) {axis = 0 : i32, fused_activation_function = "NONE"} : (tensor<2xi32>, tensor<0xi32>) -> tensor<2xi32> return %3 : tensor<2xi32> - // CHECK: [[cst:%.*]] = constant dense<1> : tensor<2xi32> - // CHECK: return [[cst]] : tensor<2xi32> + // CHECK: %[[CST:.*]] = constant dense<1> : tensor<2xi32> + // CHECK: return %[[CST]] : tensor<2xi32> } // CHECK-LABEL: @concat_3_tensors_1_empty @@ -509,7 +509,7 @@ func @concat_3_tensors_1_empty() -> tensor { %3 = "tfl.concatenation"(%0, %1, %2) {axis = 0 : i32, fused_activation_function = "NONE"} : (tensor<2xi32>, tensor<2xi32>, tensor<0xi32>) -> tensor return %3 : tensor - // CHECK: %0 = "tfl.concatenation"(%cst, %cst) {axis = 0 : i32, fused_activation_function = "NONE"} + // CHECK: %0 = "tfl.concatenation"(%[[CST]], %[[CST]]) {axis = 0 : i32, 
fused_activation_function = "NONE"} // CHECK: return %0 : tensor } @@ -520,10 +520,10 @@ func @concatConstantTensorsFirstDim() -> tensor<2x2x3xi32> { %0 = "tfl.concatenation"(%cst_0, %cst_1) {axis = 0 : i32, fused_activation_function = "NONE"} : (tensor<1x2x3xi32>, tensor<1x2x3xi32>) -> tensor<2x2x3xi32> return %0 : tensor<2x2x3xi32> - // CHECK: [[cst:%.*]] = constant dense<[{{\[}}{{\[}}0, 0, 0], {{\[}}0, 0, 0]], {{\[}}{{\[}}1, 1, 1], {{\[}}1, 1, 1]]]> : tensor<2x2x3xi32> + // CHECK: %[[CST:.*]] = constant dense<[{{\[}}{{\[}}0, 0, 0], {{\[}}0, 0, 0]], {{\[}}{{\[}}1, 1, 1], {{\[}}1, 1, 1]]]> : tensor<2x2x3xi32> // CHECK-NOT: constant-dense // CHECK-NOT: "tfl.concatenation" - // CHECK: return [[cst]] + // CHECK: return %[[CST]] } // CHECK-LABEL: @concatConstantTensorsMiddleDim @@ -533,10 +533,10 @@ func @concatConstantTensorsMiddleDim() -> tensor<1x4x3xi32> { %0 = "tfl.concatenation"(%cst_0, %cst_1) {axis = 1 : i32, fused_activation_function = "NONE"} : (tensor<1x2x3xi32>, tensor<1x2x3xi32>) -> tensor<1x4x3xi32> return %0 : tensor<1x4x3xi32> - // CHECK: [[cst:%.*]] = constant dense<[{{\[}}{{\[}}0, 0, 0], {{\[}}0, 0, 0], {{\[}}1, 1, 1], {{\[}}1, 1, 1]]]> : tensor<1x4x3xi32> + // CHECK: %[[CST:.*]] = constant dense<[{{\[}}{{\[}}0, 0, 0], {{\[}}0, 0, 0], {{\[}}1, 1, 1], {{\[}}1, 1, 1]]]> : tensor<1x4x3xi32> // CHECK-NOT: constant-dense // CHECK-NOT: "tfl.concatenation" - // CHECK: return [[cst]] + // CHECK: return %[[CST]] } // CHECK-LABEL: @concatConstantTensorsLastDim @@ -546,10 +546,10 @@ func @concatConstantTensorsLastDim() -> tensor<1x2x6xi32> { %0 = "tfl.concatenation"(%cst_0, %cst_1) {axis = 2 : i32, fused_activation_function = "NONE"} : (tensor<1x2x3xi32>, tensor<1x2x3xi32>) -> tensor<1x2x6xi32> return %0 : tensor<1x2x6xi32> - // CHECK: [[cst:%.*]] = constant dense<[{{\[}}{{\[}}0, 0, 0, 1, 1, 1], {{\[}}0, 0, 0, 1, 1, 1]]]> : tensor<1x2x6xi32> + // CHECK: %[[CST:.*]] = constant dense<[{{\[}}{{\[}}0, 0, 0, 1, 1, 1], {{\[}}0, 0, 0, 1, 1, 1]]]> : tensor<1x2x6xi32> // CHECK-NOT: constant-dense // CHECK-NOT: "tfl.concatenation" - // CHECK: return [[cst]] + // CHECK: return %[[CST]] } // CHECK-LABEL: @div_dense_dense_float_mixfng_1_n @@ -561,8 +561,8 @@ func @div_dense_dense_float_mixfng_1_n() -> tensor<2x2xf32> { return %0 : tensor<2x2xf32> -// CHECK: %cst = constant dense<{{\[\[}}-5.000000e-01, 0.833333313], [3.750000e-01, -6.250000e-01]]> : tensor<2x2xf32> -// CHECK: return %cst +// CHECK: %[[CST:.*]] = constant dense<{{\[\[}}-5.000000e-01, 0.833333313], [3.750000e-01, -6.250000e-01]]> : tensor<2x2xf32> +// CHECK: return %[[CST]] } // CHECK-LABEL: @div_dense_different_rank @@ -574,6 +574,6 @@ func @div_dense_different_rank() -> tensor<1x2x2xf32> { return %0 : tensor<1x2x2xf32> -// CHECK: %cst = constant dense<[{{\[}}{{\[}}5.000000e-01, 0.333333343], [1.000000e+00, 0.666666686]]]> : tensor<1x2x2xf32> -// CHECK: return %cst +// CHECK: %[[CST:.*]] = constant dense<[{{\[}}{{\[}}5.000000e-01, 0.333333343], [1.000000e+00, 0.666666686]]]> : tensor<1x2x2xf32> +// CHECK: return %[[CST]] } diff --git a/tensorflow/compiler/mlir/lite/tests/end2end/BUILD b/tensorflow/compiler/mlir/lite/tests/end2end/BUILD index 9d768fec0ab..cf584987d2d 100644 --- a/tensorflow/compiler/mlir/lite/tests/end2end/BUILD +++ b/tensorflow/compiler/mlir/lite/tests/end2end/BUILD @@ -12,7 +12,6 @@ glob_lit_tests( "add.pbtxt": ["no_rocm"], "conv_2d.pbtxt": ["no_rocm"], "fake_quant_per_channel.pbtxt": ["no_rocm"], - "ophint_lstm.pbtxt": ["no_rocm"], }, test_file_exts = [ "pbtxt", diff --git 
a/tensorflow/compiler/mlir/lite/tests/end2end/fake_quant_per_channel.pbtxt b/tensorflow/compiler/mlir/lite/tests/end2end/fake_quant_per_channel.pbtxt index adfcd93b4bc..3e03de09d47 100644 --- a/tensorflow/compiler/mlir/lite/tests/end2end/fake_quant_per_channel.pbtxt +++ b/tensorflow/compiler/mlir/lite/tests/end2end/fake_quant_per_channel.pbtxt @@ -1,4 +1,4 @@ -# RUN: tf_tfl_translate -tf-input-arrays=input -tf-input-shapes=1,1,1,256 -tf-input-data-types=DT_FLOAT -tf-inference-type=DT_QINT8 -tf-input-min-values='-33.614346' -tf-input-max-values='21.54917' -tf-output-arrays=output %s -o - --output-mlir 2>&1 | FileCheck --check-prefix=MLIR %s +# RUN: tf_tfl_translate -tf-input-arrays=input -tf-input-shapes=1,1,1,256 -tf-input-data-types=DT_FLOAT -tf-inference-type=DT_QINT8 -tf-input-min-values='-33.614346' -tf-input-max-values='21.54917' -tf-output-arrays=output %s -o - --output-mlir 2>&1 | FileCheck --check-prefix=MLIR %s --dump-input-on-failure # RUN: tf_tfl_translate -tf-input-arrays=input -tf-input-shapes=1,1,1,256 -tf-input-data-types=DT_FLOAT -tf-inference-type=DT_QINT8 -tf-input-min-values='-33.614346' -tf-input-max-values='21.54917' -tf-output-arrays=output %s -o - | flatbuffer_to_string - | FileCheck %s node { diff --git a/tensorflow/compiler/mlir/lite/tests/end2end/graph_with_placeholder_with_default.pbtxt b/tensorflow/compiler/mlir/lite/tests/end2end/graph_with_placeholder_with_default.pbtxt index 82e843517a3..95d483f4e91 100644 --- a/tensorflow/compiler/mlir/lite/tests/end2end/graph_with_placeholder_with_default.pbtxt +++ b/tensorflow/compiler/mlir/lite/tests/end2end/graph_with_placeholder_with_default.pbtxt @@ -142,7 +142,7 @@ versions { # CHECK-SAME: control_outputs = "" # CHECK-SAME: inputs = "unranked" # CHECK-SAME: outputs = "unranked,static,static_10" -# CHECK: [[VAL_1:%.*]] = constant dense<0> : tensor # CHECK: [[VAL_2:%.*]] = constant dense<0> : tensor<10xi32> +# CHECK: [[VAL_1:%.*]] = constant dense<0> : tensor # CHECK: return [[VAL_0]], [[VAL_1]], [[VAL_2]] : tensor<1x8x8x2xi32>, tensor, tensor<10xi32> # CHECK: } diff --git a/tensorflow/compiler/mlir/lite/tests/end2end/ophint_lstm.pbtxt b/tensorflow/compiler/mlir/lite/tests/end2end/ophint_lstm.pbtxt deleted file mode 100644 index 1b42b60acf7..00000000000 --- a/tensorflow/compiler/mlir/lite/tests/end2end/ophint_lstm.pbtxt +++ /dev/null @@ -1,7822 +0,0 @@ -# RUN: tf_tfl_translate -tf-input-arrays=INPUT -tf-input-shapes=1,3,3 -tf-input-data-types=DT_FLOAT -tf-output-arrays=OUTPUT %s -o - --output-mlir | FileCheck %s - -node { - name: "INPUT" - op: "Placeholder" - attr { - key: "dtype" - value { - type: DT_FLOAT - } - } - attr { - key: "shape" - value { - shape { - dim { - size: -1 - } - dim { - size: 3 - } - dim { - size: 3 - } - } - } - } -} -node { - name: "unstack" - op: "Unpack" - input: "INPUT" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "axis" - value { - i: 1 - } - } - attr { - key: "num" - value { - i: 3 - } - } -} -node { - name: "rnn/Shape" - op: "Shape" - input: "unstack" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "out_type" - value { - type: DT_INT32 - } - } -} -node { - name: "rnn/strided_slice/stack" - op: "Const" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - dim { - size: 1 - } - } - int_val: 0 - } - } - } -} -node { - name: "rnn/strided_slice/stack_1" - op: "Const" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - 
tensor { - dtype: DT_INT32 - tensor_shape { - dim { - size: 1 - } - } - int_val: 1 - } - } - } -} -node { - name: "rnn/strided_slice/stack_2" - op: "Const" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - dim { - size: 1 - } - } - int_val: 1 - } - } - } -} -node { - name: "rnn/strided_slice" - op: "StridedSlice" - input: "rnn/Shape" - input: "rnn/strided_slice/stack" - input: "rnn/strided_slice/stack_1" - input: "rnn/strided_slice/stack_2" - attr { - key: "Index" - value { - type: DT_INT32 - } - } - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "begin_mask" - value { - i: 0 - } - } - attr { - key: "ellipsis_mask" - value { - i: 0 - } - } - attr { - key: "end_mask" - value { - i: 0 - } - } - attr { - key: "new_axis_mask" - value { - i: 0 - } - } - attr { - key: "shrink_axis_mask" - value { - i: 1 - } - } -} -node { - name: "rnn/TFLiteLSTMCellZeroState/ExpandDims/dim" - op: "Const" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 0 - } - } - } -} -node { - name: "rnn/TFLiteLSTMCellZeroState/ExpandDims" - op: "ExpandDims" - input: "rnn/strided_slice" - input: "rnn/TFLiteLSTMCellZeroState/ExpandDims/dim" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "Tdim" - value { - type: DT_INT32 - } - } -} -node { - name: "rnn/TFLiteLSTMCellZeroState/Const" - op: "Const" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - dim { - size: 1 - } - } - int_val: 3 - } - } - } -} -node { - name: "rnn/TFLiteLSTMCellZeroState/concat/axis" - op: "Const" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 0 - } - } - } -} -node { - name: "rnn/TFLiteLSTMCellZeroState/concat" - op: "ConcatV2" - input: "rnn/TFLiteLSTMCellZeroState/ExpandDims" - input: "rnn/TFLiteLSTMCellZeroState/Const" - input: "rnn/TFLiteLSTMCellZeroState/concat/axis" - attr { - key: "N" - value { - i: 2 - } - } - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "Tidx" - value { - type: DT_INT32 - } - } -} -node { - name: "rnn/TFLiteLSTMCellZeroState/zeros/Const" - op: "Const" - attr { - key: "dtype" - value { - type: DT_FLOAT - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_FLOAT - tensor_shape { - } - float_val: 0.0 - } - } - } -} -node { - name: "rnn/TFLiteLSTMCellZeroState/zeros" - op: "Fill" - input: "rnn/TFLiteLSTMCellZeroState/concat" - input: "rnn/TFLiteLSTMCellZeroState/zeros/Const" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "index_type" - value { - type: DT_INT32 - } - } -} -node { - name: "rnn/TFLiteLSTMCellZeroState/ExpandDims_2/dim" - op: "Const" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 0 - } - } - } -} -node { - name: "rnn/TFLiteLSTMCellZeroState/ExpandDims_2" - op: "ExpandDims" - input: "rnn/strided_slice" - input: "rnn/TFLiteLSTMCellZeroState/ExpandDims_2/dim" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "Tdim" - value { - type: DT_INT32 - } - } -} -node { - name: "rnn/TFLiteLSTMCellZeroState/Const_2" - op: "Const" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - 
tensor { - dtype: DT_INT32 - tensor_shape { - dim { - size: 1 - } - } - int_val: 3 - } - } - } -} -node { - name: "rnn/TFLiteLSTMCellZeroState/concat_1/axis" - op: "Const" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 0 - } - } - } -} -node { - name: "rnn/TFLiteLSTMCellZeroState/concat_1" - op: "ConcatV2" - input: "rnn/TFLiteLSTMCellZeroState/ExpandDims_2" - input: "rnn/TFLiteLSTMCellZeroState/Const_2" - input: "rnn/TFLiteLSTMCellZeroState/concat_1/axis" - attr { - key: "N" - value { - i: 2 - } - } - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "Tidx" - value { - type: DT_INT32 - } - } -} -node { - name: "rnn/TFLiteLSTMCellZeroState/zeros_1/Const" - op: "Const" - attr { - key: "dtype" - value { - type: DT_FLOAT - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_FLOAT - tensor_shape { - } - float_val: 0.0 - } - } - } -} -node { - name: "rnn/TFLiteLSTMCellZeroState/zeros_1" - op: "Fill" - input: "rnn/TFLiteLSTMCellZeroState/concat_1" - input: "rnn/TFLiteLSTMCellZeroState/zeros_1/Const" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "index_type" - value { - type: DT_INT32 - } - } -} -node { - name: "rnn/TFLiteLSTMCellZeroState_1/ExpandDims/dim" - op: "Const" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 0 - } - } - } -} -node { - name: "rnn/TFLiteLSTMCellZeroState_1/ExpandDims" - op: "ExpandDims" - input: "rnn/strided_slice" - input: "rnn/TFLiteLSTMCellZeroState_1/ExpandDims/dim" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "Tdim" - value { - type: DT_INT32 - } - } -} -node { - name: "rnn/TFLiteLSTMCellZeroState_1/Const" - op: "Const" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - dim { - size: 1 - } - } - int_val: 3 - } - } - } -} -node { - name: "rnn/TFLiteLSTMCellZeroState_1/concat/axis" - op: "Const" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 0 - } - } - } -} -node { - name: "rnn/TFLiteLSTMCellZeroState_1/concat" - op: "ConcatV2" - input: "rnn/TFLiteLSTMCellZeroState_1/ExpandDims" - input: "rnn/TFLiteLSTMCellZeroState_1/Const" - input: "rnn/TFLiteLSTMCellZeroState_1/concat/axis" - attr { - key: "N" - value { - i: 2 - } - } - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "Tidx" - value { - type: DT_INT32 - } - } -} -node { - name: "rnn/TFLiteLSTMCellZeroState_1/zeros/Const" - op: "Const" - attr { - key: "dtype" - value { - type: DT_FLOAT - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_FLOAT - tensor_shape { - } - float_val: 0.0 - } - } - } -} -node { - name: "rnn/TFLiteLSTMCellZeroState_1/zeros" - op: "Fill" - input: "rnn/TFLiteLSTMCellZeroState_1/concat" - input: "rnn/TFLiteLSTMCellZeroState_1/zeros/Const" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "index_type" - value { - type: DT_INT32 - } - } -} -node { - name: "rnn/TFLiteLSTMCellZeroState_1/ExpandDims_2/dim" - op: "Const" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 0 - } - } - } -} -node { - name: "rnn/TFLiteLSTMCellZeroState_1/ExpandDims_2" - op: "ExpandDims" 
- input: "rnn/strided_slice" - input: "rnn/TFLiteLSTMCellZeroState_1/ExpandDims_2/dim" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "Tdim" - value { - type: DT_INT32 - } - } -} -node { - name: "rnn/TFLiteLSTMCellZeroState_1/Const_2" - op: "Const" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - dim { - size: 1 - } - } - int_val: 3 - } - } - } -} -node { - name: "rnn/TFLiteLSTMCellZeroState_1/concat_1/axis" - op: "Const" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 0 - } - } - } -} -node { - name: "rnn/TFLiteLSTMCellZeroState_1/concat_1" - op: "ConcatV2" - input: "rnn/TFLiteLSTMCellZeroState_1/ExpandDims_2" - input: "rnn/TFLiteLSTMCellZeroState_1/Const_2" - input: "rnn/TFLiteLSTMCellZeroState_1/concat_1/axis" - attr { - key: "N" - value { - i: 2 - } - } - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "Tidx" - value { - type: DT_INT32 - } - } -} -node { - name: "rnn/TFLiteLSTMCellZeroState_1/zeros_1/Const" - op: "Const" - attr { - key: "dtype" - value { - type: DT_FLOAT - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_FLOAT - tensor_shape { - } - float_val: 0.0 - } - } - } -} -node { - name: "rnn/TFLiteLSTMCellZeroState_1/zeros_1" - op: "Fill" - input: "rnn/TFLiteLSTMCellZeroState_1/concat_1" - input: "rnn/TFLiteLSTMCellZeroState_1/zeros_1/Const" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "index_type" - value { - type: DT_INT32 - } - } -} -node { - name: "rnn/rnn1/input_to_input_w" - op: "Const" - attr { - key: "dtype" - value { - type: DT_FLOAT - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_FLOAT - tensor_shape { - dim { - size: 3 - } - dim { - size: 3 - } - } - tensor_content: "p\217k>@\254:\276\270W\264\276\014\033N\277p\226a\276\220d+\277\330\277\216>\240VN\276\010\253 \277" - } - } - } -} -node { - name: "rnn/rnn1/input_to_input_w/Read/ReadVariableOp" - op: "Identity" - input: "rnn/rnn1/input_to_input_w" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_class" - value { - list { - s: "loc:@rnn/rnn1/input_to_input_w" - } - } - } -} -node { - name: "rnn/rnn1/input_to_input_w/Read/Identity" - op: "Identity" - input: "rnn/rnn1/input_to_input_w/Read/ReadVariableOp" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-1-None-input_to_input_w" - op: "Identity" - input: "rnn/rnn1/input_to_input_w/Read/Identity" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tflite_function_input_index" - value { - i: 1 - } - } - attr { - key: "_tflite_function_name" - value { - s: "UnidirectionalSequenceLstm" - } - } - attr { - key: "_tflite_function_uuid" - value { - s: "47eb6ae2de2411e9a4834201c0a80701" - } - } - attr { - key: "_tflite_ophint_level" - value { - i: 1 - } - } -} -node { - name: "rnn/rnn1/input_to_forget_w" - op: "Const" - attr { - key: "dtype" - value { - type: DT_FLOAT - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_FLOAT - tensor_shape { - dim { - size: 3 - } - dim { - size: 3 - } - } - tensor_content: "4X\003?\304g1\277\374H\014?@\341\205=\314\264\023?\324{w?\000.V\370Y\242>" - } - } - } -} -node { - name: "rnn/rnn1/input_to_forget_w/Read/ReadVariableOp" - op: "Identity" - input: "rnn/rnn1/input_to_forget_w" 
- attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_class" - value { - list { - s: "loc:@rnn/rnn1/input_to_forget_w" - } - } - } -} -node { - name: "rnn/rnn1/input_to_forget_w/Read/Identity" - op: "Identity" - input: "rnn/rnn1/input_to_forget_w/Read/ReadVariableOp" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-2-None-input_to_forget_w" - op: "Identity" - input: "rnn/rnn1/input_to_forget_w/Read/Identity" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tflite_function_input_index" - value { - i: 2 - } - } - attr { - key: "_tflite_function_name" - value { - s: "UnidirectionalSequenceLstm" - } - } - attr { - key: "_tflite_function_uuid" - value { - s: "47eb6ae2de2411e9a4834201c0a80701" - } - } - attr { - key: "_tflite_ophint_level" - value { - i: 1 - } - } -} -node { - name: "rnn/rnn1/input_to_cell_w" - op: "Const" - attr { - key: "dtype" - value { - type: DT_FLOAT - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_FLOAT - tensor_shape { - dim { - size: 3 - } - dim { - size: 3 - } - } - tensor_content: "p\205\r\276@\321\336\2750_\n\276H\256r?\340\017_\277\220\326J\277\2001\013=T\021\n\277\250\000d?" - } - } - } -} -node { - name: "rnn/rnn1/input_to_cell_w/Read/ReadVariableOp" - op: "Identity" - input: "rnn/rnn1/input_to_cell_w" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_class" - value { - list { - s: "loc:@rnn/rnn1/input_to_cell_w" - } - } - } -} -node { - name: "rnn/rnn1/input_to_cell_w/Read/Identity" - op: "Identity" - input: "rnn/rnn1/input_to_cell_w/Read/ReadVariableOp" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-3-None-input_to_cell_w" - op: "Identity" - input: "rnn/rnn1/input_to_cell_w/Read/Identity" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tflite_function_input_index" - value { - i: 3 - } - } - attr { - key: "_tflite_function_name" - value { - s: "UnidirectionalSequenceLstm" - } - } - attr { - key: "_tflite_function_uuid" - value { - s: "47eb6ae2de2411e9a4834201c0a80701" - } - } - attr { - key: "_tflite_ophint_level" - value { - i: 1 - } - } -} -node { - name: "rnn/rnn1/input_to_output_w" - op: "Const" - attr { - key: "dtype" - value { - type: DT_FLOAT - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_FLOAT - tensor_shape { - dim { - size: 3 - } - dim { - size: 3 - } - } - tensor_content: "`\222T\276l\273A\277 oZ\277\310\335\211\276\300\310?=H\303\264\276\000\367\217\275@\203\224=DXQ\277" - } - } - } -} -node { - name: "rnn/rnn1/input_to_output_w/Read/ReadVariableOp" - op: "Identity" - input: "rnn/rnn1/input_to_output_w" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_class" - value { - list { - s: "loc:@rnn/rnn1/input_to_output_w" - } - } - } -} -node { - name: "rnn/rnn1/input_to_output_w/Read/Identity" - op: "Identity" - input: "rnn/rnn1/input_to_output_w/Read/ReadVariableOp" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-4-None-input_to_output_w" - op: "Identity" - input: "rnn/rnn1/input_to_output_w/Read/Identity" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tflite_function_input_index" - value { - i: 4 - } - } - attr { - key: 
"_tflite_function_name" - value { - s: "UnidirectionalSequenceLstm" - } - } - attr { - key: "_tflite_function_uuid" - value { - s: "47eb6ae2de2411e9a4834201c0a80701" - } - } - attr { - key: "_tflite_ophint_level" - value { - i: 1 - } - } -} -node { - name: "rnn/rnn1/cell_to_input_w" - op: "Const" - attr { - key: "dtype" - value { - type: DT_FLOAT - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_FLOAT - tensor_shape { - dim { - size: 3 - } - dim { - size: 3 - } - } - tensor_content: "\310\326\374\27609\310\276\250\036\263\276\200\231\256\274L\362\016?\230\337\003\277\350\023\333>\324;\036?p\026@\276" - } - } - } -} -node { - name: "rnn/rnn1/cell_to_input_w/Read/ReadVariableOp" - op: "Identity" - input: "rnn/rnn1/cell_to_input_w" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_class" - value { - list { - s: "loc:@rnn/rnn1/cell_to_input_w" - } - } - } -} -node { - name: "rnn/rnn1/cell_to_input_w/Read/Identity" - op: "Identity" - input: "rnn/rnn1/cell_to_input_w/Read/ReadVariableOp" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-5-None-cell_to_input_w" - op: "Identity" - input: "rnn/rnn1/cell_to_input_w/Read/Identity" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tflite_function_input_index" - value { - i: 5 - } - } - attr { - key: "_tflite_function_name" - value { - s: "UnidirectionalSequenceLstm" - } - } - attr { - key: "_tflite_function_uuid" - value { - s: "47eb6ae2de2411e9a4834201c0a80701" - } - } - attr { - key: "_tflite_ophint_level" - value { - i: 1 - } - } -} -node { - name: "rnn/rnn1/cell_to_forget_w" - op: "Const" - attr { - key: "dtype" - value { - type: DT_FLOAT - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_FLOAT - tensor_shape { - dim { - size: 3 - } - dim { - size: 3 - } - } - tensor_content: "\210\334b?\024,\033\277\230\r\347\276\030\257\246>\364\0071?\020\036-\277\000\023a>LD ?\024\374\030\277" - } - } - } -} -node { - name: "rnn/rnn1/cell_to_forget_w/Read/ReadVariableOp" - op: "Identity" - input: "rnn/rnn1/cell_to_forget_w" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_class" - value { - list { - s: "loc:@rnn/rnn1/cell_to_forget_w" - } - } - } -} -node { - name: "rnn/rnn1/cell_to_forget_w/Read/Identity" - op: "Identity" - input: "rnn/rnn1/cell_to_forget_w/Read/ReadVariableOp" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-6-None-cell_to_forget_w" - op: "Identity" - input: "rnn/rnn1/cell_to_forget_w/Read/Identity" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tflite_function_input_index" - value { - i: 6 - } - } - attr { - key: "_tflite_function_name" - value { - s: "UnidirectionalSequenceLstm" - } - } - attr { - key: "_tflite_function_uuid" - value { - s: "47eb6ae2de2411e9a4834201c0a80701" - } - } - attr { - key: "_tflite_ophint_level" - value { - i: 1 - } - } -} -node { - name: "rnn/rnn1/cell_to_cell_w" - op: "Const" - attr { - key: "dtype" - value { - type: DT_FLOAT - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_FLOAT - tensor_shape { - dim { - size: 3 - } - dim { - size: 3 - } - } - tensor_content: "\010\341\314\276P6=?p\253N>\364\266-?H;\244>\214*s?\\\307N\277HP\010\277 \226\027>" - } - } - } -} -node { - name: "rnn/rnn1/cell_to_cell_w/Read/ReadVariableOp" - op: "Identity" - 
input: "rnn/rnn1/cell_to_cell_w" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_class" - value { - list { - s: "loc:@rnn/rnn1/cell_to_cell_w" - } - } - } -} -node { - name: "rnn/rnn1/cell_to_cell_w/Read/Identity" - op: "Identity" - input: "rnn/rnn1/cell_to_cell_w/Read/ReadVariableOp" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-7-None-cell_to_cell_w" - op: "Identity" - input: "rnn/rnn1/cell_to_cell_w/Read/Identity" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tflite_function_input_index" - value { - i: 7 - } - } - attr { - key: "_tflite_function_name" - value { - s: "UnidirectionalSequenceLstm" - } - } - attr { - key: "_tflite_function_uuid" - value { - s: "47eb6ae2de2411e9a4834201c0a80701" - } - } - attr { - key: "_tflite_ophint_level" - value { - i: 1 - } - } -} -node { - name: "rnn/rnn1/cell_to_output_w" - op: "Const" - attr { - key: "dtype" - value { - type: DT_FLOAT - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_FLOAT - tensor_shape { - dim { - size: 3 - } - dim { - size: 3 - } - } - tensor_content: "\350\177\343>\300\212\010\276x\357V?\340\r\344>t[\022\277X\330\021?\330\025\356> s}\277L\352!\277" - } - } - } -} -node { - name: "rnn/rnn1/cell_to_output_w/Read/ReadVariableOp" - op: "Identity" - input: "rnn/rnn1/cell_to_output_w" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_class" - value { - list { - s: "loc:@rnn/rnn1/cell_to_output_w" - } - } - } -} -node { - name: "rnn/rnn1/cell_to_output_w/Read/Identity" - op: "Identity" - input: "rnn/rnn1/cell_to_output_w/Read/ReadVariableOp" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-8-None-cell_to_output_w" - op: "Identity" - input: "rnn/rnn1/cell_to_output_w/Read/Identity" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tflite_function_input_index" - value { - i: 8 - } - } - attr { - key: "_tflite_function_name" - value { - s: "UnidirectionalSequenceLstm" - } - } - attr { - key: "_tflite_function_uuid" - value { - s: "47eb6ae2de2411e9a4834201c0a80701" - } - } - attr { - key: "_tflite_ophint_level" - value { - i: 1 - } - } -} -node { - name: "rnn/rnn1/input_bias" - op: "Const" - attr { - key: "dtype" - value { - type: DT_FLOAT - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_FLOAT - tensor_shape { - dim { - size: 3 - } - } - tensor_content: "\000\000\000\000\000\000\000\000\000\000\000\000" - } - } - } -} -node { - name: "rnn/rnn1/input_bias/Read/ReadVariableOp" - op: "Identity" - input: "rnn/rnn1/input_bias" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_class" - value { - list { - s: "loc:@rnn/rnn1/input_bias" - } - } - } -} -node { - name: "rnn/rnn1/input_bias/Read/Identity" - op: "Identity" - input: "rnn/rnn1/input_bias/Read/ReadVariableOp" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-12-None-input_bias" - op: "Identity" - input: "rnn/rnn1/input_bias/Read/Identity" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tflite_function_input_index" - value { - i: 12 - } - } - attr { - key: "_tflite_function_name" - value { - s: "UnidirectionalSequenceLstm" - } - } - attr { - key: 
"_tflite_function_uuid" - value { - s: "47eb6ae2de2411e9a4834201c0a80701" - } - } - attr { - key: "_tflite_ophint_level" - value { - i: 1 - } - } -} -node { - name: "rnn/rnn1/forget_bias" - op: "Const" - attr { - key: "dtype" - value { - type: DT_FLOAT - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_FLOAT - tensor_shape { - dim { - size: 3 - } - } - tensor_content: "\000\000\200?\000\000\200?\000\000\200?" - } - } - } -} -node { - name: "rnn/rnn1/forget_bias/Read/ReadVariableOp" - op: "Identity" - input: "rnn/rnn1/forget_bias" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_class" - value { - list { - s: "loc:@rnn/rnn1/forget_bias" - } - } - } -} -node { - name: "rnn/rnn1/forget_bias/Read/Identity" - op: "Identity" - input: "rnn/rnn1/forget_bias/Read/ReadVariableOp" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-13-None-forget_bias" - op: "Identity" - input: "rnn/rnn1/forget_bias/Read/Identity" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tflite_function_input_index" - value { - i: 13 - } - } - attr { - key: "_tflite_function_name" - value { - s: "UnidirectionalSequenceLstm" - } - } - attr { - key: "_tflite_function_uuid" - value { - s: "47eb6ae2de2411e9a4834201c0a80701" - } - } - attr { - key: "_tflite_ophint_level" - value { - i: 1 - } - } -} -node { - name: "rnn/rnn1/cell_bias" - op: "Const" - attr { - key: "dtype" - value { - type: DT_FLOAT - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_FLOAT - tensor_shape { - dim { - size: 3 - } - } - tensor_content: "\000\000\000\000\000\000\000\000\000\000\000\000" - } - } - } -} -node { - name: "rnn/rnn1/cell_bias/Read/ReadVariableOp" - op: "Identity" - input: "rnn/rnn1/cell_bias" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_class" - value { - list { - s: "loc:@rnn/rnn1/cell_bias" - } - } - } -} -node { - name: "rnn/rnn1/cell_bias/Read/Identity" - op: "Identity" - input: "rnn/rnn1/cell_bias/Read/ReadVariableOp" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-14-None-cell_bias" - op: "Identity" - input: "rnn/rnn1/cell_bias/Read/Identity" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tflite_function_input_index" - value { - i: 14 - } - } - attr { - key: "_tflite_function_name" - value { - s: "UnidirectionalSequenceLstm" - } - } - attr { - key: "_tflite_function_uuid" - value { - s: "47eb6ae2de2411e9a4834201c0a80701" - } - } - attr { - key: "_tflite_ophint_level" - value { - i: 1 - } - } -} -node { - name: "rnn/rnn1/output_bias" - op: "Const" - attr { - key: "dtype" - value { - type: DT_FLOAT - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_FLOAT - tensor_shape { - dim { - size: 3 - } - } - tensor_content: "\000\000\000\000\000\000\000\000\000\000\000\000" - } - } - } -} -node { - name: "rnn/rnn1/output_bias/Read/ReadVariableOp" - op: "Identity" - input: "rnn/rnn1/output_bias" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_class" - value { - list { - s: "loc:@rnn/rnn1/output_bias" - } - } - } -} -node { - name: "rnn/rnn1/output_bias/Read/Identity" - op: "Identity" - input: "rnn/rnn1/output_bias/Read/ReadVariableOp" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: 
"rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-15-None-output_bias" - op: "Identity" - input: "rnn/rnn1/output_bias/Read/Identity" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tflite_function_input_index" - value { - i: 15 - } - } - attr { - key: "_tflite_function_name" - value { - s: "UnidirectionalSequenceLstm" - } - } - attr { - key: "_tflite_function_uuid" - value { - s: "47eb6ae2de2411e9a4834201c0a80701" - } - } - attr { - key: "_tflite_ophint_level" - value { - i: 1 - } - } -} -node { - name: "rnn/rnn1/w_f_diag" - op: "Const" - attr { - key: "dtype" - value { - type: DT_FLOAT - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_FLOAT - tensor_shape { - dim { - size: 3 - } - } - tensor_content: "\020o/> \030\035\276\364|\027?" - } - } - } -} -node { - name: "rnn/rnn1/w_f_diag/Read/ReadVariableOp" - op: "Identity" - input: "rnn/rnn1/w_f_diag" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_class" - value { - list { - s: "loc:@rnn/rnn1/w_f_diag" - } - } - } -} -node { - name: "rnn/rnn1/w_f_diag/Read/Identity" - op: "Identity" - input: "rnn/rnn1/w_f_diag/Read/ReadVariableOp" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-10-None-w_f_diag" - op: "Identity" - input: "rnn/rnn1/w_f_diag/Read/Identity" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tflite_function_input_index" - value { - i: 10 - } - } - attr { - key: "_tflite_function_name" - value { - s: "UnidirectionalSequenceLstm" - } - } - attr { - key: "_tflite_function_uuid" - value { - s: "47eb6ae2de2411e9a4834201c0a80701" - } - } - attr { - key: "_tflite_ophint_level" - value { - i: 1 - } - } -} -node { - name: "rnn/rnn1/w_i_diag" - op: "Const" - attr { - key: "dtype" - value { - type: DT_FLOAT - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_FLOAT - tensor_shape { - dim { - size: 3 - } - } - tensor_content: "\324\331+\277h\331\322>\250z\017?" 
- } - } - } -} -node { - name: "rnn/rnn1/w_i_diag/Read/ReadVariableOp" - op: "Identity" - input: "rnn/rnn1/w_i_diag" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_class" - value { - list { - s: "loc:@rnn/rnn1/w_i_diag" - } - } - } -} -node { - name: "rnn/rnn1/w_i_diag/Read/Identity" - op: "Identity" - input: "rnn/rnn1/w_i_diag/Read/ReadVariableOp" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-9-None-w_i_diag" - op: "Identity" - input: "rnn/rnn1/w_i_diag/Read/Identity" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tflite_function_input_index" - value { - i: 9 - } - } - attr { - key: "_tflite_function_name" - value { - s: "UnidirectionalSequenceLstm" - } - } - attr { - key: "_tflite_function_uuid" - value { - s: "47eb6ae2de2411e9a4834201c0a80701" - } - } - attr { - key: "_tflite_ophint_level" - value { - i: 1 - } - } -} -node { - name: "rnn/rnn1/w_o_diag" - op: "Const" - attr { - key: "dtype" - value { - type: DT_FLOAT - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_FLOAT - tensor_shape { - dim { - size: 3 - } - } - tensor_content: "\230\316\316>\210\316a\277\210\373d\277" - } - } - } -} -node { - name: "rnn/rnn1/w_o_diag/Read/ReadVariableOp" - op: "Identity" - input: "rnn/rnn1/w_o_diag" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_class" - value { - list { - s: "loc:@rnn/rnn1/w_o_diag" - } - } - } -} -node { - name: "rnn/rnn1/w_o_diag/Read/Identity" - op: "Identity" - input: "rnn/rnn1/w_o_diag/Read/ReadVariableOp" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-11-None-w_o_diag" - op: "Identity" - input: "rnn/rnn1/w_o_diag/Read/Identity" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tflite_function_input_index" - value { - i: 11 - } - } - attr { - key: "_tflite_function_name" - value { - s: "UnidirectionalSequenceLstm" - } - } - attr { - key: "_tflite_function_uuid" - value { - s: "47eb6ae2de2411e9a4834201c0a80701" - } - } - attr { - key: "_tflite_ophint_level" - value { - i: 1 - } - } -} -node { - name: "rnn/rnn2/input_to_input_w" - op: "Const" - attr { - key: "dtype" - value { - type: DT_FLOAT - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_FLOAT - tensor_shape { - dim { - size: 3 - } - dim { - size: 3 - } - } - tensor_content: "\220\305\000\2760;\245>HV\372>P\356\270>\324u{?\010\265\345\276\370bw?\300[D\2770\212\344>" - } - } - } -} -node { - name: "rnn/rnn2/input_to_input_w/Read/ReadVariableOp" - op: "Identity" - input: "rnn/rnn2/input_to_input_w" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_class" - value { - list { - s: "loc:@rnn/rnn2/input_to_input_w" - } - } - } -} -node { - name: "rnn/rnn2/input_to_input_w/Read/Identity" - op: "Identity" - input: "rnn/rnn2/input_to_input_w/Read/ReadVariableOp" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-1-None-input_to_input_w" - op: "Identity" - input: "rnn/rnn2/input_to_input_w/Read/Identity" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tflite_function_input_index" - value { - i: 1 - } - } - attr { - key: "_tflite_function_name" - value { - s: "UnidirectionalSequenceLstm" - } - } - attr { - key: 
"_tflite_function_uuid" - value { - s: "47eb6ae3de2411e9a4834201c0a80701" - } - } - attr { - key: "_tflite_ophint_level" - value { - i: 1 - } - } -} -node { - name: "rnn/rnn2/input_to_forget_w" - op: "Const" - attr { - key: "dtype" - value { - type: DT_FLOAT - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_FLOAT - tensor_shape { - dim { - size: 3 - } - dim { - size: 3 - } - } - tensor_content: "\354\037d?\000\254\216\276\374\210w?\020;J\277\200bm=P\270^>\234\2702\277$\300{\277\370\231U\277" - } - } - } -} -node { - name: "rnn/rnn2/input_to_forget_w/Read/ReadVariableOp" - op: "Identity" - input: "rnn/rnn2/input_to_forget_w" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_class" - value { - list { - s: "loc:@rnn/rnn2/input_to_forget_w" - } - } - } -} -node { - name: "rnn/rnn2/input_to_forget_w/Read/Identity" - op: "Identity" - input: "rnn/rnn2/input_to_forget_w/Read/ReadVariableOp" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-2-None-input_to_forget_w" - op: "Identity" - input: "rnn/rnn2/input_to_forget_w/Read/Identity" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tflite_function_input_index" - value { - i: 2 - } - } - attr { - key: "_tflite_function_name" - value { - s: "UnidirectionalSequenceLstm" - } - } - attr { - key: "_tflite_function_uuid" - value { - s: "47eb6ae3de2411e9a4834201c0a80701" - } - } - attr { - key: "_tflite_ophint_level" - value { - i: 1 - } - } -} -node { - name: "rnn/rnn2/input_to_cell_w" - op: "Const" - attr { - key: "dtype" - value { - type: DT_FLOAT - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_FLOAT - tensor_shape { - dim { - size: 3 - } - dim { - size: 3 - } - } - tensor_content: ",AH?\200\3616\275,7Y?\024@\024\277p\305\320\276\350\200\342>\000\236\271;\3500\031?T>!?" - } - } - } -} -node { - name: "rnn/rnn2/input_to_cell_w/Read/ReadVariableOp" - op: "Identity" - input: "rnn/rnn2/input_to_cell_w" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_class" - value { - list { - s: "loc:@rnn/rnn2/input_to_cell_w" - } - } - } -} -node { - name: "rnn/rnn2/input_to_cell_w/Read/Identity" - op: "Identity" - input: "rnn/rnn2/input_to_cell_w/Read/ReadVariableOp" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-3-None-input_to_cell_w" - op: "Identity" - input: "rnn/rnn2/input_to_cell_w/Read/Identity" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tflite_function_input_index" - value { - i: 3 - } - } - attr { - key: "_tflite_function_name" - value { - s: "UnidirectionalSequenceLstm" - } - } - attr { - key: "_tflite_function_uuid" - value { - s: "47eb6ae3de2411e9a4834201c0a80701" - } - } - attr { - key: "_tflite_ophint_level" - value { - i: 1 - } - } -} -node { - name: "rnn/rnn2/input_to_output_w" - op: "Const" - attr { - key: "dtype" - value { - type: DT_FLOAT - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_FLOAT - tensor_shape { - dim { - size: 3 - } - dim { - size: 3 - } - } - tensor_content: "HO[\2770\355L\277@\2007?\324Q\t?$\251\n?@\221\266\276\370mK\277\240\356\014>\300\2440?" 
- } - } - } -} -node { - name: "rnn/rnn2/input_to_output_w/Read/ReadVariableOp" - op: "Identity" - input: "rnn/rnn2/input_to_output_w" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_class" - value { - list { - s: "loc:@rnn/rnn2/input_to_output_w" - } - } - } -} -node { - name: "rnn/rnn2/input_to_output_w/Read/Identity" - op: "Identity" - input: "rnn/rnn2/input_to_output_w/Read/ReadVariableOp" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-4-None-input_to_output_w" - op: "Identity" - input: "rnn/rnn2/input_to_output_w/Read/Identity" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tflite_function_input_index" - value { - i: 4 - } - } - attr { - key: "_tflite_function_name" - value { - s: "UnidirectionalSequenceLstm" - } - } - attr { - key: "_tflite_function_uuid" - value { - s: "47eb6ae3de2411e9a4834201c0a80701" - } - } - attr { - key: "_tflite_ophint_level" - value { - i: 1 - } - } -} -node { - name: "rnn/rnn2/cell_to_input_w" - op: "Const" - attr { - key: "dtype" - value { - type: DT_FLOAT - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_FLOAT - tensor_shape { - dim { - size: 3 - } - dim { - size: 3 - } - } - tensor_content: "\274;\002\277\250\302\026\277`\234\361>\220\r\002\277\000\255\200\274\334\332M\277t\225z\277\000(\322:\024\201z\277" - } - } - } -} -node { - name: "rnn/rnn2/cell_to_input_w/Read/ReadVariableOp" - op: "Identity" - input: "rnn/rnn2/cell_to_input_w" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_class" - value { - list { - s: "loc:@rnn/rnn2/cell_to_input_w" - } - } - } -} -node { - name: "rnn/rnn2/cell_to_input_w/Read/Identity" - op: "Identity" - input: "rnn/rnn2/cell_to_input_w/Read/ReadVariableOp" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-5-None-cell_to_input_w" - op: "Identity" - input: "rnn/rnn2/cell_to_input_w/Read/Identity" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tflite_function_input_index" - value { - i: 5 - } - } - attr { - key: "_tflite_function_name" - value { - s: "UnidirectionalSequenceLstm" - } - } - attr { - key: "_tflite_function_uuid" - value { - s: "47eb6ae3de2411e9a4834201c0a80701" - } - } - attr { - key: "_tflite_ophint_level" - value { - i: 1 - } - } -} -node { - name: "rnn/rnn2/cell_to_forget_w" - op: "Const" - attr { - key: "dtype" - value { - type: DT_FLOAT - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_FLOAT - tensor_shape { - dim { - size: 3 - } - dim { - size: 3 - } - } - tensor_content: "03:>\014\273\035?\020\333+\276\334\371;?HVu?0\310`\27782\275>\304\020x\277,\212a\277" - } - } - } -} -node { - name: "rnn/rnn2/cell_to_forget_w/Read/ReadVariableOp" - op: "Identity" - input: "rnn/rnn2/cell_to_forget_w" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_class" - value { - list { - s: "loc:@rnn/rnn2/cell_to_forget_w" - } - } - } -} -node { - name: "rnn/rnn2/cell_to_forget_w/Read/Identity" - op: "Identity" - input: "rnn/rnn2/cell_to_forget_w/Read/ReadVariableOp" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-6-None-cell_to_forget_w" - op: "Identity" - input: "rnn/rnn2/cell_to_forget_w/Read/Identity" - attr { - key: "T" - 
value { - type: DT_FLOAT - } - } - attr { - key: "_tflite_function_input_index" - value { - i: 6 - } - } - attr { - key: "_tflite_function_name" - value { - s: "UnidirectionalSequenceLstm" - } - } - attr { - key: "_tflite_function_uuid" - value { - s: "47eb6ae3de2411e9a4834201c0a80701" - } - } - attr { - key: "_tflite_ophint_level" - value { - i: 1 - } - } -} -node { - name: "rnn/rnn2/cell_to_cell_w" - op: "Const" - attr { - key: "dtype" - value { - type: DT_FLOAT - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_FLOAT - tensor_shape { - dim { - size: 3 - } - dim { - size: 3 - } - } - tensor_content: "\244\251o\277\230xo\277\340\222\223>\2409y\276|\327 \277pA\364\276\200\325\003\277\300Lg\277\274=,?" - } - } - } -} -node { - name: "rnn/rnn2/cell_to_cell_w/Read/ReadVariableOp" - op: "Identity" - input: "rnn/rnn2/cell_to_cell_w" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_class" - value { - list { - s: "loc:@rnn/rnn2/cell_to_cell_w" - } - } - } -} -node { - name: "rnn/rnn2/cell_to_cell_w/Read/Identity" - op: "Identity" - input: "rnn/rnn2/cell_to_cell_w/Read/ReadVariableOp" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-7-None-cell_to_cell_w" - op: "Identity" - input: "rnn/rnn2/cell_to_cell_w/Read/Identity" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tflite_function_input_index" - value { - i: 7 - } - } - attr { - key: "_tflite_function_name" - value { - s: "UnidirectionalSequenceLstm" - } - } - attr { - key: "_tflite_function_uuid" - value { - s: "47eb6ae3de2411e9a4834201c0a80701" - } - } - attr { - key: "_tflite_ophint_level" - value { - i: 1 - } - } -} -node { - name: "rnn/rnn2/cell_to_output_w" - op: "Const" - attr { - key: "dtype" - value { - type: DT_FLOAT - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_FLOAT - tensor_shape { - dim { - size: 3 - } - dim { - size: 3 - } - } - tensor_content: "\274\345\035\277`\202d?\364\333+?8\246W\2778X\267\276\024ER?4TJ?\254T6? 
g\215=" - } - } - } -} -node { - name: "rnn/rnn2/cell_to_output_w/Read/ReadVariableOp" - op: "Identity" - input: "rnn/rnn2/cell_to_output_w" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_class" - value { - list { - s: "loc:@rnn/rnn2/cell_to_output_w" - } - } - } -} -node { - name: "rnn/rnn2/cell_to_output_w/Read/Identity" - op: "Identity" - input: "rnn/rnn2/cell_to_output_w/Read/ReadVariableOp" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-8-None-cell_to_output_w" - op: "Identity" - input: "rnn/rnn2/cell_to_output_w/Read/Identity" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tflite_function_input_index" - value { - i: 8 - } - } - attr { - key: "_tflite_function_name" - value { - s: "UnidirectionalSequenceLstm" - } - } - attr { - key: "_tflite_function_uuid" - value { - s: "47eb6ae3de2411e9a4834201c0a80701" - } - } - attr { - key: "_tflite_ophint_level" - value { - i: 1 - } - } -} -node { - name: "rnn/rnn2/input_bias" - op: "Const" - attr { - key: "dtype" - value { - type: DT_FLOAT - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_FLOAT - tensor_shape { - dim { - size: 3 - } - } - tensor_content: "\000\000\000\000\000\000\000\000\000\000\000\000" - } - } - } -} -node { - name: "rnn/rnn2/input_bias/Read/ReadVariableOp" - op: "Identity" - input: "rnn/rnn2/input_bias" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_class" - value { - list { - s: "loc:@rnn/rnn2/input_bias" - } - } - } -} -node { - name: "rnn/rnn2/input_bias/Read/Identity" - op: "Identity" - input: "rnn/rnn2/input_bias/Read/ReadVariableOp" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-12-None-input_bias" - op: "Identity" - input: "rnn/rnn2/input_bias/Read/Identity" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tflite_function_input_index" - value { - i: 12 - } - } - attr { - key: "_tflite_function_name" - value { - s: "UnidirectionalSequenceLstm" - } - } - attr { - key: "_tflite_function_uuid" - value { - s: "47eb6ae3de2411e9a4834201c0a80701" - } - } - attr { - key: "_tflite_ophint_level" - value { - i: 1 - } - } -} -node { - name: "rnn/rnn2/forget_bias" - op: "Const" - attr { - key: "dtype" - value { - type: DT_FLOAT - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_FLOAT - tensor_shape { - dim { - size: 3 - } - } - tensor_content: "\000\000\200?\000\000\200?\000\000\200?" 
- } - } - } -} -node { - name: "rnn/rnn2/forget_bias/Read/ReadVariableOp" - op: "Identity" - input: "rnn/rnn2/forget_bias" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_class" - value { - list { - s: "loc:@rnn/rnn2/forget_bias" - } - } - } -} -node { - name: "rnn/rnn2/forget_bias/Read/Identity" - op: "Identity" - input: "rnn/rnn2/forget_bias/Read/ReadVariableOp" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-13-None-forget_bias" - op: "Identity" - input: "rnn/rnn2/forget_bias/Read/Identity" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tflite_function_input_index" - value { - i: 13 - } - } - attr { - key: "_tflite_function_name" - value { - s: "UnidirectionalSequenceLstm" - } - } - attr { - key: "_tflite_function_uuid" - value { - s: "47eb6ae3de2411e9a4834201c0a80701" - } - } - attr { - key: "_tflite_ophint_level" - value { - i: 1 - } - } -} -node { - name: "rnn/rnn2/cell_bias" - op: "Const" - attr { - key: "dtype" - value { - type: DT_FLOAT - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_FLOAT - tensor_shape { - dim { - size: 3 - } - } - tensor_content: "\000\000\000\000\000\000\000\000\000\000\000\000" - } - } - } -} -node { - name: "rnn/rnn2/cell_bias/Read/ReadVariableOp" - op: "Identity" - input: "rnn/rnn2/cell_bias" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_class" - value { - list { - s: "loc:@rnn/rnn2/cell_bias" - } - } - } -} -node { - name: "rnn/rnn2/cell_bias/Read/Identity" - op: "Identity" - input: "rnn/rnn2/cell_bias/Read/ReadVariableOp" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-14-None-cell_bias" - op: "Identity" - input: "rnn/rnn2/cell_bias/Read/Identity" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tflite_function_input_index" - value { - i: 14 - } - } - attr { - key: "_tflite_function_name" - value { - s: "UnidirectionalSequenceLstm" - } - } - attr { - key: "_tflite_function_uuid" - value { - s: "47eb6ae3de2411e9a4834201c0a80701" - } - } - attr { - key: "_tflite_ophint_level" - value { - i: 1 - } - } -} -node { - name: "rnn/rnn2/output_bias" - op: "Const" - attr { - key: "dtype" - value { - type: DT_FLOAT - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_FLOAT - tensor_shape { - dim { - size: 3 - } - } - tensor_content: "\000\000\000\000\000\000\000\000\000\000\000\000" - } - } - } -} -node { - name: "rnn/rnn2/output_bias/Read/ReadVariableOp" - op: "Identity" - input: "rnn/rnn2/output_bias" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_class" - value { - list { - s: "loc:@rnn/rnn2/output_bias" - } - } - } -} -node { - name: "rnn/rnn2/output_bias/Read/Identity" - op: "Identity" - input: "rnn/rnn2/output_bias/Read/ReadVariableOp" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-15-None-output_bias" - op: "Identity" - input: "rnn/rnn2/output_bias/Read/Identity" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tflite_function_input_index" - value { - i: 15 - } - } - attr { - key: "_tflite_function_name" - value { - s: "UnidirectionalSequenceLstm" - } - } - attr { - key: "_tflite_function_uuid" - value { - s: 
"47eb6ae3de2411e9a4834201c0a80701" - } - } - attr { - key: "_tflite_ophint_level" - value { - i: 1 - } - } -} -node { - name: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-0-0-input" - op: "Identity" - input: "unstack" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tflite_function_aggregate" - value { - s: "stack" - } - } - attr { - key: "_tflite_function_input_index" - value { - i: 0 - } - } - attr { - key: "_tflite_function_name" - value { - s: "UnidirectionalSequenceLstm" - } - } - attr { - key: "_tflite_function_sort_index" - value { - i: 0 - } - } - attr { - key: "_tflite_function_uuid" - value { - s: "47eb6ae2de2411e9a4834201c0a80701" - } - } - attr { - key: "_tflite_ophint_level" - value { - i: 1 - } - } -} -node { - name: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-19-0-c_prev" - op: "Identity" - input: "rnn/TFLiteLSTMCellZeroState/zeros" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tflite_function_aggregate" - value { - s: "first" - } - } - attr { - key: "_tflite_function_input_index" - value { - i: 19 - } - } - attr { - key: "_tflite_function_name" - value { - s: "UnidirectionalSequenceLstm" - } - } - attr { - key: "_tflite_function_sort_index" - value { - i: 0 - } - } - attr { - key: "_tflite_function_uuid" - value { - s: "47eb6ae2de2411e9a4834201c0a80701" - } - } - attr { - key: "_tflite_ophint_level" - value { - i: 1 - } - } -} -node { - name: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-18-0-m_prev" - op: "Identity" - input: "rnn/TFLiteLSTMCellZeroState/zeros_1" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tflite_function_aggregate" - value { - s: "first" - } - } - attr { - key: "_tflite_function_input_index" - value { - i: 18 - } - } - attr { - key: "_tflite_function_name" - value { - s: "UnidirectionalSequenceLstm" - } - } - attr { - key: "_tflite_function_sort_index" - value { - i: 0 - } - } - attr { - key: "_tflite_function_uuid" - value { - s: "47eb6ae2de2411e9a4834201c0a80701" - } - } - attr { - key: "_tflite_ophint_level" - value { - i: 1 - } - } -} -node { - name: "rnn/stacked_rnn_cells/concat/axis" - op: "Const" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 1 - } - } - } -} -node { - name: "rnn/stacked_rnn_cells/concat" - op: "ConcatV2" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-0-0-input" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-18-0-m_prev" - input: "rnn/stacked_rnn_cells/concat/axis" - attr { - key: "N" - value { - i: 2 - } - } - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "Tidx" - value { - type: DT_INT32 - } - } -} -node { - name: "rnn/stacked_rnn_cells/concat_1/axis" - op: "Const" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 1 - } - } - } -} -node { - name: "rnn/stacked_rnn_cells/concat_1" - op: "ConcatV2" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-1-None-input_to_input_w" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-5-None-cell_to_input_w" - input: 
"rnn/stacked_rnn_cells/concat_1/axis" - attr { - key: "N" - value { - i: 2 - } - } - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "Tidx" - value { - type: DT_INT32 - } - } -} -node { - name: "rnn/stacked_rnn_cells/MatMul" - op: "MatMul" - input: "rnn/stacked_rnn_cells/concat" - input: "rnn/stacked_rnn_cells/concat_1" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "transpose_a" - value { - b: false - } - } - attr { - key: "transpose_b" - value { - b: true - } - } -} -node { - name: "rnn/stacked_rnn_cells/BiasAdd" - op: "BiasAdd" - input: "rnn/stacked_rnn_cells/MatMul" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-12-None-input_bias" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "data_format" - value { - s: "NHWC" - } - } -} -node { - name: "rnn/stacked_rnn_cells/concat_2/axis" - op: "Const" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 1 - } - } - } -} -node { - name: "rnn/stacked_rnn_cells/concat_2" - op: "ConcatV2" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-2-None-input_to_forget_w" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-6-None-cell_to_forget_w" - input: "rnn/stacked_rnn_cells/concat_2/axis" - attr { - key: "N" - value { - i: 2 - } - } - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "Tidx" - value { - type: DT_INT32 - } - } -} -node { - name: "rnn/stacked_rnn_cells/MatMul_1" - op: "MatMul" - input: "rnn/stacked_rnn_cells/concat" - input: "rnn/stacked_rnn_cells/concat_2" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "transpose_a" - value { - b: false - } - } - attr { - key: "transpose_b" - value { - b: true - } - } -} -node { - name: "rnn/stacked_rnn_cells/BiasAdd_1" - op: "BiasAdd" - input: "rnn/stacked_rnn_cells/MatMul_1" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-13-None-forget_bias" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "data_format" - value { - s: "NHWC" - } - } -} -node { - name: "rnn/stacked_rnn_cells/concat_3/axis" - op: "Const" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 1 - } - } - } -} -node { - name: "rnn/stacked_rnn_cells/concat_3" - op: "ConcatV2" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-4-None-input_to_output_w" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-8-None-cell_to_output_w" - input: "rnn/stacked_rnn_cells/concat_3/axis" - attr { - key: "N" - value { - i: 2 - } - } - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "Tidx" - value { - type: DT_INT32 - } - } -} -node { - name: "rnn/stacked_rnn_cells/MatMul_2" - op: "MatMul" - input: "rnn/stacked_rnn_cells/concat" - input: "rnn/stacked_rnn_cells/concat_3" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "transpose_a" - value { - b: false - } - } - attr { - key: "transpose_b" - value { - b: true - } - } -} -node { - name: "rnn/stacked_rnn_cells/BiasAdd_2" - op: "BiasAdd" - input: "rnn/stacked_rnn_cells/MatMul_2" - input: 
"rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-15-None-output_bias" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "data_format" - value { - s: "NHWC" - } - } -} -node { - name: "rnn/stacked_rnn_cells/concat_4/axis" - op: "Const" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 1 - } - } - } -} -node { - name: "rnn/stacked_rnn_cells/concat_4" - op: "ConcatV2" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-3-None-input_to_cell_w" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-7-None-cell_to_cell_w" - input: "rnn/stacked_rnn_cells/concat_4/axis" - attr { - key: "N" - value { - i: 2 - } - } - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "Tidx" - value { - type: DT_INT32 - } - } -} -node { - name: "rnn/stacked_rnn_cells/MatMul_3" - op: "MatMul" - input: "rnn/stacked_rnn_cells/concat" - input: "rnn/stacked_rnn_cells/concat_4" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "transpose_a" - value { - b: false - } - } - attr { - key: "transpose_b" - value { - b: true - } - } -} -node { - name: "rnn/stacked_rnn_cells/BiasAdd_3" - op: "BiasAdd" - input: "rnn/stacked_rnn_cells/MatMul_3" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-14-None-cell_bias" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "data_format" - value { - s: "NHWC" - } - } -} -node { - name: "rnn/stacked_rnn_cells/mul" - op: "Mul" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-10-None-w_f_diag" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-19-0-c_prev" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells/add" - op: "Add" - input: "rnn/stacked_rnn_cells/BiasAdd_1" - input: "rnn/stacked_rnn_cells/mul" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells/Sigmoid" - op: "Sigmoid" - input: "rnn/stacked_rnn_cells/add" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells/mul_1" - op: "Mul" - input: "rnn/stacked_rnn_cells/Sigmoid" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-19-0-c_prev" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells/mul_2" - op: "Mul" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-9-None-w_i_diag" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-19-0-c_prev" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells/add_1" - op: "Add" - input: "rnn/stacked_rnn_cells/BiasAdd" - input: "rnn/stacked_rnn_cells/mul_2" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells/Sigmoid_1" - op: "Sigmoid" - input: "rnn/stacked_rnn_cells/add_1" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells/Tanh" - op: "Tanh" - input: "rnn/stacked_rnn_cells/BiasAdd_3" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells/mul_3" - 
op: "Mul" - input: "rnn/stacked_rnn_cells/Sigmoid_1" - input: "rnn/stacked_rnn_cells/Tanh" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells/add_2" - op: "Add" - input: "rnn/stacked_rnn_cells/mul_1" - input: "rnn/stacked_rnn_cells/mul_3" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells/mul_4" - op: "Mul" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-11-None-w_o_diag" - input: "rnn/stacked_rnn_cells/add_2" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells/add_3" - op: "Add" - input: "rnn/stacked_rnn_cells/BiasAdd_2" - input: "rnn/stacked_rnn_cells/mul_4" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells/Sigmoid_2" - op: "Sigmoid" - input: "rnn/stacked_rnn_cells/add_3" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells/Tanh_1" - op: "Tanh" - input: "rnn/stacked_rnn_cells/add_2" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells/mul_5" - op: "Mul" - input: "rnn/stacked_rnn_cells/Sigmoid_2" - input: "rnn/stacked_rnn_cells/Tanh_1" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells/OutputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-1-0-c" - op: "Identity" - input: "rnn/stacked_rnn_cells/add_2" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tflite_function_aggregate" - value { - s: "last" - } - } - attr { - key: "_tflite_function_name" - value { - s: "UnidirectionalSequenceLstm" - } - } - attr { - key: "_tflite_function_output_index" - value { - i: 1 - } - } - attr { - key: "_tflite_function_sort_index" - value { - i: 0 - } - } - attr { - key: "_tflite_function_uuid" - value { - s: "47eb6ae2de2411e9a4834201c0a80701" - } - } - attr { - key: "_tflite_ophint_level" - value { - i: 1 - } - } -} -node { - name: "rnn/stacked_rnn_cells/OutputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-2-0-m" - op: "Identity" - input: "rnn/stacked_rnn_cells/mul_5" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tflite_function_aggregate" - value { - s: "stack" - } - } - attr { - key: "_tflite_function_name" - value { - s: "UnidirectionalSequenceLstm" - } - } - attr { - key: "_tflite_function_output_index" - value { - i: 2 - } - } - attr { - key: "_tflite_function_sort_index" - value { - i: 0 - } - } - attr { - key: "_tflite_function_uuid" - value { - s: "47eb6ae2de2411e9a4834201c0a80701" - } - } - attr { - key: "_tflite_ophint_level" - value { - i: 1 - } - } -} -node { - name: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-0-0-input" - op: "Identity" - input: "rnn/stacked_rnn_cells/OutputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-2-0-m" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tflite_function_aggregate" - value { - s: "stack" - } - } - attr { - key: "_tflite_function_input_index" - value { - i: 0 - } - } - attr { - key: "_tflite_function_name" - value { - s: "UnidirectionalSequenceLstm" - } - } - attr { - key: "_tflite_function_sort_index" - value { - i: 0 - } - } - attr { - key: "_tflite_function_uuid" - value { - s: "47eb6ae3de2411e9a4834201c0a80701" - } - } - attr { - key: "_tflite_ophint_level" - value { - i: 1 - } - } -} -node { - name: 
"rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-19-0-c_prev" - op: "Identity" - input: "rnn/TFLiteLSTMCellZeroState_1/zeros" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tflite_function_aggregate" - value { - s: "first" - } - } - attr { - key: "_tflite_function_input_index" - value { - i: 19 - } - } - attr { - key: "_tflite_function_name" - value { - s: "UnidirectionalSequenceLstm" - } - } - attr { - key: "_tflite_function_sort_index" - value { - i: 0 - } - } - attr { - key: "_tflite_function_uuid" - value { - s: "47eb6ae3de2411e9a4834201c0a80701" - } - } - attr { - key: "_tflite_ophint_level" - value { - i: 1 - } - } -} -node { - name: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-18-0-m_prev" - op: "Identity" - input: "rnn/TFLiteLSTMCellZeroState_1/zeros_1" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tflite_function_aggregate" - value { - s: "first" - } - } - attr { - key: "_tflite_function_input_index" - value { - i: 18 - } - } - attr { - key: "_tflite_function_name" - value { - s: "UnidirectionalSequenceLstm" - } - } - attr { - key: "_tflite_function_sort_index" - value { - i: 0 - } - } - attr { - key: "_tflite_function_uuid" - value { - s: "47eb6ae3de2411e9a4834201c0a80701" - } - } - attr { - key: "_tflite_ophint_level" - value { - i: 1 - } - } -} -node { - name: "rnn/stacked_rnn_cells/concat_5/axis" - op: "Const" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 1 - } - } - } -} -node { - name: "rnn/stacked_rnn_cells/concat_5" - op: "ConcatV2" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-0-0-input" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-18-0-m_prev" - input: "rnn/stacked_rnn_cells/concat_5/axis" - attr { - key: "N" - value { - i: 2 - } - } - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "Tidx" - value { - type: DT_INT32 - } - } -} -node { - name: "rnn/stacked_rnn_cells/concat_6/axis" - op: "Const" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 1 - } - } - } -} -node { - name: "rnn/stacked_rnn_cells/concat_6" - op: "ConcatV2" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-1-None-input_to_input_w" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-5-None-cell_to_input_w" - input: "rnn/stacked_rnn_cells/concat_6/axis" - attr { - key: "N" - value { - i: 2 - } - } - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "Tidx" - value { - type: DT_INT32 - } - } -} -node { - name: "rnn/stacked_rnn_cells/MatMul_4" - op: "MatMul" - input: "rnn/stacked_rnn_cells/concat_5" - input: "rnn/stacked_rnn_cells/concat_6" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "transpose_a" - value { - b: false - } - } - attr { - key: "transpose_b" - value { - b: true - } - } -} -node { - name: "rnn/stacked_rnn_cells/BiasAdd_4" - op: "BiasAdd" - input: "rnn/stacked_rnn_cells/MatMul_4" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-12-None-input_bias" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: 
"data_format" - value { - s: "NHWC" - } - } -} -node { - name: "rnn/stacked_rnn_cells/concat_7/axis" - op: "Const" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 1 - } - } - } -} -node { - name: "rnn/stacked_rnn_cells/concat_7" - op: "ConcatV2" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-2-None-input_to_forget_w" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-6-None-cell_to_forget_w" - input: "rnn/stacked_rnn_cells/concat_7/axis" - attr { - key: "N" - value { - i: 2 - } - } - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "Tidx" - value { - type: DT_INT32 - } - } -} -node { - name: "rnn/stacked_rnn_cells/MatMul_5" - op: "MatMul" - input: "rnn/stacked_rnn_cells/concat_5" - input: "rnn/stacked_rnn_cells/concat_7" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "transpose_a" - value { - b: false - } - } - attr { - key: "transpose_b" - value { - b: true - } - } -} -node { - name: "rnn/stacked_rnn_cells/BiasAdd_5" - op: "BiasAdd" - input: "rnn/stacked_rnn_cells/MatMul_5" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-13-None-forget_bias" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "data_format" - value { - s: "NHWC" - } - } -} -node { - name: "rnn/stacked_rnn_cells/concat_8/axis" - op: "Const" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 1 - } - } - } -} -node { - name: "rnn/stacked_rnn_cells/concat_8" - op: "ConcatV2" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-4-None-input_to_output_w" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-8-None-cell_to_output_w" - input: "rnn/stacked_rnn_cells/concat_8/axis" - attr { - key: "N" - value { - i: 2 - } - } - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "Tidx" - value { - type: DT_INT32 - } - } -} -node { - name: "rnn/stacked_rnn_cells/MatMul_6" - op: "MatMul" - input: "rnn/stacked_rnn_cells/concat_5" - input: "rnn/stacked_rnn_cells/concat_8" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "transpose_a" - value { - b: false - } - } - attr { - key: "transpose_b" - value { - b: true - } - } -} -node { - name: "rnn/stacked_rnn_cells/BiasAdd_6" - op: "BiasAdd" - input: "rnn/stacked_rnn_cells/MatMul_6" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-15-None-output_bias" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "data_format" - value { - s: "NHWC" - } - } -} -node { - name: "rnn/stacked_rnn_cells/concat_9/axis" - op: "Const" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 1 - } - } - } -} -node { - name: "rnn/stacked_rnn_cells/concat_9" - op: "ConcatV2" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-3-None-input_to_cell_w" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-7-None-cell_to_cell_w" - input: "rnn/stacked_rnn_cells/concat_9/axis" - attr { - key: "N" - 
value { - i: 2 - } - } - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "Tidx" - value { - type: DT_INT32 - } - } -} -node { - name: "rnn/stacked_rnn_cells/MatMul_7" - op: "MatMul" - input: "rnn/stacked_rnn_cells/concat_5" - input: "rnn/stacked_rnn_cells/concat_9" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "transpose_a" - value { - b: false - } - } - attr { - key: "transpose_b" - value { - b: true - } - } -} -node { - name: "rnn/stacked_rnn_cells/BiasAdd_7" - op: "BiasAdd" - input: "rnn/stacked_rnn_cells/MatMul_7" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-14-None-cell_bias" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "data_format" - value { - s: "NHWC" - } - } -} -node { - name: "rnn/stacked_rnn_cells/Sigmoid_3" - op: "Sigmoid" - input: "rnn/stacked_rnn_cells/BiasAdd_5" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells/mul_6" - op: "Mul" - input: "rnn/stacked_rnn_cells/Sigmoid_3" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-19-0-c_prev" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells/Sigmoid_4" - op: "Sigmoid" - input: "rnn/stacked_rnn_cells/BiasAdd_4" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells/Tanh_2" - op: "Tanh" - input: "rnn/stacked_rnn_cells/BiasAdd_7" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells/mul_7" - op: "Mul" - input: "rnn/stacked_rnn_cells/Sigmoid_4" - input: "rnn/stacked_rnn_cells/Tanh_2" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells/add_4" - op: "Add" - input: "rnn/stacked_rnn_cells/mul_6" - input: "rnn/stacked_rnn_cells/mul_7" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells/Sigmoid_5" - op: "Sigmoid" - input: "rnn/stacked_rnn_cells/BiasAdd_6" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells/Tanh_3" - op: "Tanh" - input: "rnn/stacked_rnn_cells/add_4" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells/mul_8" - op: "Mul" - input: "rnn/stacked_rnn_cells/Sigmoid_5" - input: "rnn/stacked_rnn_cells/Tanh_3" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells/OutputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-1-0-c" - op: "Identity" - input: "rnn/stacked_rnn_cells/add_4" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tflite_function_aggregate" - value { - s: "last" - } - } - attr { - key: "_tflite_function_name" - value { - s: "UnidirectionalSequenceLstm" - } - } - attr { - key: "_tflite_function_output_index" - value { - i: 1 - } - } - attr { - key: "_tflite_function_sort_index" - value { - i: 0 - } - } - attr { - key: "_tflite_function_uuid" - value { - s: "47eb6ae3de2411e9a4834201c0a80701" - } - } - attr { - key: "_tflite_ophint_level" - value { - i: 1 - } - } -} -node { - name: "rnn/stacked_rnn_cells/OutputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-2-0-m" - op: "Identity" - input: "rnn/stacked_rnn_cells/mul_8" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tflite_function_aggregate" - value { - s: "stack" - } - } - attr { - key: "_tflite_function_name" - value { - 
s: "UnidirectionalSequenceLstm" - } - } - attr { - key: "_tflite_function_output_index" - value { - i: 2 - } - } - attr { - key: "_tflite_function_sort_index" - value { - i: 0 - } - } - attr { - key: "_tflite_function_uuid" - value { - s: "47eb6ae3de2411e9a4834201c0a80701" - } - } - attr { - key: "_tflite_ophint_level" - value { - i: 1 - } - } -} -node { - name: "rnn/stacked_rnn_cells_1/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-0-1-input" - op: "Identity" - input: "unstack:1" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tflite_function_aggregate" - value { - s: "stack" - } - } - attr { - key: "_tflite_function_input_index" - value { - i: 0 - } - } - attr { - key: "_tflite_function_name" - value { - s: "UnidirectionalSequenceLstm" - } - } - attr { - key: "_tflite_function_sort_index" - value { - i: 1 - } - } - attr { - key: "_tflite_function_uuid" - value { - s: "47eb6ae2de2411e9a4834201c0a80701" - } - } - attr { - key: "_tflite_ophint_level" - value { - i: 1 - } - } -} -node { - name: "rnn/stacked_rnn_cells_1/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-19-1-c_prev" - op: "Identity" - input: "rnn/stacked_rnn_cells/OutputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-1-0-c" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tflite_function_aggregate" - value { - s: "first" - } - } - attr { - key: "_tflite_function_input_index" - value { - i: 19 - } - } - attr { - key: "_tflite_function_name" - value { - s: "UnidirectionalSequenceLstm" - } - } - attr { - key: "_tflite_function_sort_index" - value { - i: 1 - } - } - attr { - key: "_tflite_function_uuid" - value { - s: "47eb6ae2de2411e9a4834201c0a80701" - } - } - attr { - key: "_tflite_ophint_level" - value { - i: 1 - } - } -} -node { - name: "rnn/stacked_rnn_cells_1/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-18-1-m_prev" - op: "Identity" - input: "rnn/stacked_rnn_cells/OutputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-2-0-m" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tflite_function_aggregate" - value { - s: "first" - } - } - attr { - key: "_tflite_function_input_index" - value { - i: 18 - } - } - attr { - key: "_tflite_function_name" - value { - s: "UnidirectionalSequenceLstm" - } - } - attr { - key: "_tflite_function_sort_index" - value { - i: 1 - } - } - attr { - key: "_tflite_function_uuid" - value { - s: "47eb6ae2de2411e9a4834201c0a80701" - } - } - attr { - key: "_tflite_ophint_level" - value { - i: 1 - } - } -} -node { - name: "rnn/stacked_rnn_cells_1/concat/axis" - op: "Const" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 1 - } - } - } -} -node { - name: "rnn/stacked_rnn_cells_1/concat" - op: "ConcatV2" - input: "rnn/stacked_rnn_cells_1/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-0-1-input" - input: "rnn/stacked_rnn_cells_1/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-18-1-m_prev" - input: "rnn/stacked_rnn_cells_1/concat/axis" - attr { - key: "N" - value { - i: 2 - } - } - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "Tidx" - value { - type: DT_INT32 - } - } -} -node { - name: "rnn/stacked_rnn_cells_1/concat_1/axis" - op: "Const" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - 
tensor_shape { - } - int_val: 1 - } - } - } -} -node { - name: "rnn/stacked_rnn_cells_1/concat_1" - op: "ConcatV2" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-1-None-input_to_input_w" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-5-None-cell_to_input_w" - input: "rnn/stacked_rnn_cells_1/concat_1/axis" - attr { - key: "N" - value { - i: 2 - } - } - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "Tidx" - value { - type: DT_INT32 - } - } -} -node { - name: "rnn/stacked_rnn_cells_1/MatMul" - op: "MatMul" - input: "rnn/stacked_rnn_cells_1/concat" - input: "rnn/stacked_rnn_cells_1/concat_1" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "transpose_a" - value { - b: false - } - } - attr { - key: "transpose_b" - value { - b: true - } - } -} -node { - name: "rnn/stacked_rnn_cells_1/BiasAdd" - op: "BiasAdd" - input: "rnn/stacked_rnn_cells_1/MatMul" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-12-None-input_bias" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "data_format" - value { - s: "NHWC" - } - } -} -node { - name: "rnn/stacked_rnn_cells_1/concat_2/axis" - op: "Const" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 1 - } - } - } -} -node { - name: "rnn/stacked_rnn_cells_1/concat_2" - op: "ConcatV2" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-2-None-input_to_forget_w" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-6-None-cell_to_forget_w" - input: "rnn/stacked_rnn_cells_1/concat_2/axis" - attr { - key: "N" - value { - i: 2 - } - } - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "Tidx" - value { - type: DT_INT32 - } - } -} -node { - name: "rnn/stacked_rnn_cells_1/MatMul_1" - op: "MatMul" - input: "rnn/stacked_rnn_cells_1/concat" - input: "rnn/stacked_rnn_cells_1/concat_2" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "transpose_a" - value { - b: false - } - } - attr { - key: "transpose_b" - value { - b: true - } - } -} -node { - name: "rnn/stacked_rnn_cells_1/BiasAdd_1" - op: "BiasAdd" - input: "rnn/stacked_rnn_cells_1/MatMul_1" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-13-None-forget_bias" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "data_format" - value { - s: "NHWC" - } - } -} -node { - name: "rnn/stacked_rnn_cells_1/concat_3/axis" - op: "Const" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 1 - } - } - } -} -node { - name: "rnn/stacked_rnn_cells_1/concat_3" - op: "ConcatV2" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-4-None-input_to_output_w" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-8-None-cell_to_output_w" - input: "rnn/stacked_rnn_cells_1/concat_3/axis" - attr { - key: "N" - value { - i: 2 - } - } - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "Tidx" - value { - type: DT_INT32 - } - } -} -node { - name: "rnn/stacked_rnn_cells_1/MatMul_2" - op: "MatMul" - input: 
"rnn/stacked_rnn_cells_1/concat" - input: "rnn/stacked_rnn_cells_1/concat_3" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "transpose_a" - value { - b: false - } - } - attr { - key: "transpose_b" - value { - b: true - } - } -} -node { - name: "rnn/stacked_rnn_cells_1/BiasAdd_2" - op: "BiasAdd" - input: "rnn/stacked_rnn_cells_1/MatMul_2" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-15-None-output_bias" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "data_format" - value { - s: "NHWC" - } - } -} -node { - name: "rnn/stacked_rnn_cells_1/concat_4/axis" - op: "Const" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 1 - } - } - } -} -node { - name: "rnn/stacked_rnn_cells_1/concat_4" - op: "ConcatV2" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-3-None-input_to_cell_w" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-7-None-cell_to_cell_w" - input: "rnn/stacked_rnn_cells_1/concat_4/axis" - attr { - key: "N" - value { - i: 2 - } - } - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "Tidx" - value { - type: DT_INT32 - } - } -} -node { - name: "rnn/stacked_rnn_cells_1/MatMul_3" - op: "MatMul" - input: "rnn/stacked_rnn_cells_1/concat" - input: "rnn/stacked_rnn_cells_1/concat_4" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "transpose_a" - value { - b: false - } - } - attr { - key: "transpose_b" - value { - b: true - } - } -} -node { - name: "rnn/stacked_rnn_cells_1/BiasAdd_3" - op: "BiasAdd" - input: "rnn/stacked_rnn_cells_1/MatMul_3" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-14-None-cell_bias" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "data_format" - value { - s: "NHWC" - } - } -} -node { - name: "rnn/stacked_rnn_cells_1/mul" - op: "Mul" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-10-None-w_f_diag" - input: "rnn/stacked_rnn_cells_1/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-19-1-c_prev" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells_1/add" - op: "Add" - input: "rnn/stacked_rnn_cells_1/BiasAdd_1" - input: "rnn/stacked_rnn_cells_1/mul" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells_1/Sigmoid" - op: "Sigmoid" - input: "rnn/stacked_rnn_cells_1/add" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells_1/mul_1" - op: "Mul" - input: "rnn/stacked_rnn_cells_1/Sigmoid" - input: "rnn/stacked_rnn_cells_1/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-19-1-c_prev" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells_1/mul_2" - op: "Mul" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-9-None-w_i_diag" - input: "rnn/stacked_rnn_cells_1/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-19-1-c_prev" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells_1/add_1" - op: "Add" - input: "rnn/stacked_rnn_cells_1/BiasAdd" - input: "rnn/stacked_rnn_cells_1/mul_2" - attr { 
- key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells_1/Sigmoid_1" - op: "Sigmoid" - input: "rnn/stacked_rnn_cells_1/add_1" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells_1/Tanh" - op: "Tanh" - input: "rnn/stacked_rnn_cells_1/BiasAdd_3" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells_1/mul_3" - op: "Mul" - input: "rnn/stacked_rnn_cells_1/Sigmoid_1" - input: "rnn/stacked_rnn_cells_1/Tanh" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells_1/add_2" - op: "Add" - input: "rnn/stacked_rnn_cells_1/mul_1" - input: "rnn/stacked_rnn_cells_1/mul_3" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells_1/mul_4" - op: "Mul" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-11-None-w_o_diag" - input: "rnn/stacked_rnn_cells_1/add_2" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells_1/add_3" - op: "Add" - input: "rnn/stacked_rnn_cells_1/BiasAdd_2" - input: "rnn/stacked_rnn_cells_1/mul_4" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells_1/Sigmoid_2" - op: "Sigmoid" - input: "rnn/stacked_rnn_cells_1/add_3" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells_1/Tanh_1" - op: "Tanh" - input: "rnn/stacked_rnn_cells_1/add_2" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells_1/mul_5" - op: "Mul" - input: "rnn/stacked_rnn_cells_1/Sigmoid_2" - input: "rnn/stacked_rnn_cells_1/Tanh_1" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells_1/OutputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-1-1-c" - op: "Identity" - input: "rnn/stacked_rnn_cells_1/add_2" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tflite_function_aggregate" - value { - s: "last" - } - } - attr { - key: "_tflite_function_name" - value { - s: "UnidirectionalSequenceLstm" - } - } - attr { - key: "_tflite_function_output_index" - value { - i: 1 - } - } - attr { - key: "_tflite_function_sort_index" - value { - i: 1 - } - } - attr { - key: "_tflite_function_uuid" - value { - s: "47eb6ae2de2411e9a4834201c0a80701" - } - } - attr { - key: "_tflite_ophint_level" - value { - i: 1 - } - } -} -node { - name: "rnn/stacked_rnn_cells_1/OutputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-2-1-m" - op: "Identity" - input: "rnn/stacked_rnn_cells_1/mul_5" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tflite_function_aggregate" - value { - s: "stack" - } - } - attr { - key: "_tflite_function_name" - value { - s: "UnidirectionalSequenceLstm" - } - } - attr { - key: "_tflite_function_output_index" - value { - i: 2 - } - } - attr { - key: "_tflite_function_sort_index" - value { - i: 1 - } - } - attr { - key: "_tflite_function_uuid" - value { - s: "47eb6ae2de2411e9a4834201c0a80701" - } - } - attr { - key: "_tflite_ophint_level" - value { - i: 1 - } - } -} -node { - name: "rnn/stacked_rnn_cells_1/InputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-0-1-input" - op: "Identity" - input: "rnn/stacked_rnn_cells_1/OutputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-2-1-m" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: 
"_tflite_function_aggregate" - value { - s: "stack" - } - } - attr { - key: "_tflite_function_input_index" - value { - i: 0 - } - } - attr { - key: "_tflite_function_name" - value { - s: "UnidirectionalSequenceLstm" - } - } - attr { - key: "_tflite_function_sort_index" - value { - i: 1 - } - } - attr { - key: "_tflite_function_uuid" - value { - s: "47eb6ae3de2411e9a4834201c0a80701" - } - } - attr { - key: "_tflite_ophint_level" - value { - i: 1 - } - } -} -node { - name: "rnn/stacked_rnn_cells_1/InputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-19-1-c_prev" - op: "Identity" - input: "rnn/stacked_rnn_cells/OutputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-1-0-c" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tflite_function_aggregate" - value { - s: "first" - } - } - attr { - key: "_tflite_function_input_index" - value { - i: 19 - } - } - attr { - key: "_tflite_function_name" - value { - s: "UnidirectionalSequenceLstm" - } - } - attr { - key: "_tflite_function_sort_index" - value { - i: 1 - } - } - attr { - key: "_tflite_function_uuid" - value { - s: "47eb6ae3de2411e9a4834201c0a80701" - } - } - attr { - key: "_tflite_ophint_level" - value { - i: 1 - } - } -} -node { - name: "rnn/stacked_rnn_cells_1/InputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-18-1-m_prev" - op: "Identity" - input: "rnn/stacked_rnn_cells/OutputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-2-0-m" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tflite_function_aggregate" - value { - s: "first" - } - } - attr { - key: "_tflite_function_input_index" - value { - i: 18 - } - } - attr { - key: "_tflite_function_name" - value { - s: "UnidirectionalSequenceLstm" - } - } - attr { - key: "_tflite_function_sort_index" - value { - i: 1 - } - } - attr { - key: "_tflite_function_uuid" - value { - s: "47eb6ae3de2411e9a4834201c0a80701" - } - } - attr { - key: "_tflite_ophint_level" - value { - i: 1 - } - } -} -node { - name: "rnn/stacked_rnn_cells_1/concat_5/axis" - op: "Const" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 1 - } - } - } -} -node { - name: "rnn/stacked_rnn_cells_1/concat_5" - op: "ConcatV2" - input: "rnn/stacked_rnn_cells_1/InputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-0-1-input" - input: "rnn/stacked_rnn_cells_1/InputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-18-1-m_prev" - input: "rnn/stacked_rnn_cells_1/concat_5/axis" - attr { - key: "N" - value { - i: 2 - } - } - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "Tidx" - value { - type: DT_INT32 - } - } -} -node { - name: "rnn/stacked_rnn_cells_1/concat_6/axis" - op: "Const" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 1 - } - } - } -} -node { - name: "rnn/stacked_rnn_cells_1/concat_6" - op: "ConcatV2" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-1-None-input_to_input_w" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-5-None-cell_to_input_w" - input: "rnn/stacked_rnn_cells_1/concat_6/axis" - attr { - key: "N" - value { - i: 2 - } - } - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "Tidx" - value { - type: DT_INT32 - } - } -} -node 
{ - name: "rnn/stacked_rnn_cells_1/MatMul_4" - op: "MatMul" - input: "rnn/stacked_rnn_cells_1/concat_5" - input: "rnn/stacked_rnn_cells_1/concat_6" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "transpose_a" - value { - b: false - } - } - attr { - key: "transpose_b" - value { - b: true - } - } -} -node { - name: "rnn/stacked_rnn_cells_1/BiasAdd_4" - op: "BiasAdd" - input: "rnn/stacked_rnn_cells_1/MatMul_4" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-12-None-input_bias" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "data_format" - value { - s: "NHWC" - } - } -} -node { - name: "rnn/stacked_rnn_cells_1/concat_7/axis" - op: "Const" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 1 - } - } - } -} -node { - name: "rnn/stacked_rnn_cells_1/concat_7" - op: "ConcatV2" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-2-None-input_to_forget_w" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-6-None-cell_to_forget_w" - input: "rnn/stacked_rnn_cells_1/concat_7/axis" - attr { - key: "N" - value { - i: 2 - } - } - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "Tidx" - value { - type: DT_INT32 - } - } -} -node { - name: "rnn/stacked_rnn_cells_1/MatMul_5" - op: "MatMul" - input: "rnn/stacked_rnn_cells_1/concat_5" - input: "rnn/stacked_rnn_cells_1/concat_7" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "transpose_a" - value { - b: false - } - } - attr { - key: "transpose_b" - value { - b: true - } - } -} -node { - name: "rnn/stacked_rnn_cells_1/BiasAdd_5" - op: "BiasAdd" - input: "rnn/stacked_rnn_cells_1/MatMul_5" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-13-None-forget_bias" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "data_format" - value { - s: "NHWC" - } - } -} -node { - name: "rnn/stacked_rnn_cells_1/concat_8/axis" - op: "Const" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 1 - } - } - } -} -node { - name: "rnn/stacked_rnn_cells_1/concat_8" - op: "ConcatV2" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-4-None-input_to_output_w" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-8-None-cell_to_output_w" - input: "rnn/stacked_rnn_cells_1/concat_8/axis" - attr { - key: "N" - value { - i: 2 - } - } - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "Tidx" - value { - type: DT_INT32 - } - } -} -node { - name: "rnn/stacked_rnn_cells_1/MatMul_6" - op: "MatMul" - input: "rnn/stacked_rnn_cells_1/concat_5" - input: "rnn/stacked_rnn_cells_1/concat_8" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "transpose_a" - value { - b: false - } - } - attr { - key: "transpose_b" - value { - b: true - } - } -} -node { - name: "rnn/stacked_rnn_cells_1/BiasAdd_6" - op: "BiasAdd" - input: "rnn/stacked_rnn_cells_1/MatMul_6" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-15-None-output_bias" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "data_format" 
- value { - s: "NHWC" - } - } -} -node { - name: "rnn/stacked_rnn_cells_1/concat_9/axis" - op: "Const" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 1 - } - } - } -} -node { - name: "rnn/stacked_rnn_cells_1/concat_9" - op: "ConcatV2" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-3-None-input_to_cell_w" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-7-None-cell_to_cell_w" - input: "rnn/stacked_rnn_cells_1/concat_9/axis" - attr { - key: "N" - value { - i: 2 - } - } - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "Tidx" - value { - type: DT_INT32 - } - } -} -node { - name: "rnn/stacked_rnn_cells_1/MatMul_7" - op: "MatMul" - input: "rnn/stacked_rnn_cells_1/concat_5" - input: "rnn/stacked_rnn_cells_1/concat_9" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "transpose_a" - value { - b: false - } - } - attr { - key: "transpose_b" - value { - b: true - } - } -} -node { - name: "rnn/stacked_rnn_cells_1/BiasAdd_7" - op: "BiasAdd" - input: "rnn/stacked_rnn_cells_1/MatMul_7" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-14-None-cell_bias" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "data_format" - value { - s: "NHWC" - } - } -} -node { - name: "rnn/stacked_rnn_cells_1/Sigmoid_3" - op: "Sigmoid" - input: "rnn/stacked_rnn_cells_1/BiasAdd_5" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells_1/mul_6" - op: "Mul" - input: "rnn/stacked_rnn_cells_1/Sigmoid_3" - input: "rnn/stacked_rnn_cells_1/InputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-19-1-c_prev" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells_1/Sigmoid_4" - op: "Sigmoid" - input: "rnn/stacked_rnn_cells_1/BiasAdd_4" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells_1/Tanh_2" - op: "Tanh" - input: "rnn/stacked_rnn_cells_1/BiasAdd_7" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells_1/mul_7" - op: "Mul" - input: "rnn/stacked_rnn_cells_1/Sigmoid_4" - input: "rnn/stacked_rnn_cells_1/Tanh_2" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells_1/add_4" - op: "Add" - input: "rnn/stacked_rnn_cells_1/mul_6" - input: "rnn/stacked_rnn_cells_1/mul_7" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells_1/Sigmoid_5" - op: "Sigmoid" - input: "rnn/stacked_rnn_cells_1/BiasAdd_6" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells_1/Tanh_3" - op: "Tanh" - input: "rnn/stacked_rnn_cells_1/add_4" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells_1/mul_8" - op: "Mul" - input: "rnn/stacked_rnn_cells_1/Sigmoid_5" - input: "rnn/stacked_rnn_cells_1/Tanh_3" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells_1/OutputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-1-1-c" - op: "Identity" - input: "rnn/stacked_rnn_cells_1/add_4" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tflite_function_aggregate" - value { - s: "last" - } - } - attr { - key: 
"_tflite_function_name" - value { - s: "UnidirectionalSequenceLstm" - } - } - attr { - key: "_tflite_function_output_index" - value { - i: 1 - } - } - attr { - key: "_tflite_function_sort_index" - value { - i: 1 - } - } - attr { - key: "_tflite_function_uuid" - value { - s: "47eb6ae3de2411e9a4834201c0a80701" - } - } - attr { - key: "_tflite_ophint_level" - value { - i: 1 - } - } -} -node { - name: "rnn/stacked_rnn_cells_1/OutputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-2-1-m" - op: "Identity" - input: "rnn/stacked_rnn_cells_1/mul_8" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tflite_function_aggregate" - value { - s: "stack" - } - } - attr { - key: "_tflite_function_name" - value { - s: "UnidirectionalSequenceLstm" - } - } - attr { - key: "_tflite_function_output_index" - value { - i: 2 - } - } - attr { - key: "_tflite_function_sort_index" - value { - i: 1 - } - } - attr { - key: "_tflite_function_uuid" - value { - s: "47eb6ae3de2411e9a4834201c0a80701" - } - } - attr { - key: "_tflite_ophint_level" - value { - i: 1 - } - } -} -node { - name: "rnn/stacked_rnn_cells_2/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-0-2-input" - op: "Identity" - input: "unstack:2" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tflite_function_aggregate" - value { - s: "stack" - } - } - attr { - key: "_tflite_function_input_index" - value { - i: 0 - } - } - attr { - key: "_tflite_function_name" - value { - s: "UnidirectionalSequenceLstm" - } - } - attr { - key: "_tflite_function_sort_index" - value { - i: 2 - } - } - attr { - key: "_tflite_function_uuid" - value { - s: "47eb6ae2de2411e9a4834201c0a80701" - } - } - attr { - key: "_tflite_ophint_level" - value { - i: 1 - } - } -} -node { - name: "rnn/stacked_rnn_cells_2/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-19-2-c_prev" - op: "Identity" - input: "rnn/stacked_rnn_cells_1/OutputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-1-1-c" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tflite_function_aggregate" - value { - s: "first" - } - } - attr { - key: "_tflite_function_input_index" - value { - i: 19 - } - } - attr { - key: "_tflite_function_name" - value { - s: "UnidirectionalSequenceLstm" - } - } - attr { - key: "_tflite_function_sort_index" - value { - i: 2 - } - } - attr { - key: "_tflite_function_uuid" - value { - s: "47eb6ae2de2411e9a4834201c0a80701" - } - } - attr { - key: "_tflite_ophint_level" - value { - i: 1 - } - } -} -node { - name: "rnn/stacked_rnn_cells_2/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-18-2-m_prev" - op: "Identity" - input: "rnn/stacked_rnn_cells_1/OutputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-2-1-m" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tflite_function_aggregate" - value { - s: "first" - } - } - attr { - key: "_tflite_function_input_index" - value { - i: 18 - } - } - attr { - key: "_tflite_function_name" - value { - s: "UnidirectionalSequenceLstm" - } - } - attr { - key: "_tflite_function_sort_index" - value { - i: 2 - } - } - attr { - key: "_tflite_function_uuid" - value { - s: "47eb6ae2de2411e9a4834201c0a80701" - } - } - attr { - key: "_tflite_ophint_level" - value { - i: 1 - } - } -} -node { - name: "rnn/stacked_rnn_cells_2/concat/axis" - op: "Const" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - 
tensor_shape { - } - int_val: 1 - } - } - } -} -node { - name: "rnn/stacked_rnn_cells_2/concat" - op: "ConcatV2" - input: "rnn/stacked_rnn_cells_2/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-0-2-input" - input: "rnn/stacked_rnn_cells_2/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-18-2-m_prev" - input: "rnn/stacked_rnn_cells_2/concat/axis" - attr { - key: "N" - value { - i: 2 - } - } - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "Tidx" - value { - type: DT_INT32 - } - } -} -node { - name: "rnn/stacked_rnn_cells_2/concat_1/axis" - op: "Const" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 1 - } - } - } -} -node { - name: "rnn/stacked_rnn_cells_2/concat_1" - op: "ConcatV2" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-1-None-input_to_input_w" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-5-None-cell_to_input_w" - input: "rnn/stacked_rnn_cells_2/concat_1/axis" - attr { - key: "N" - value { - i: 2 - } - } - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "Tidx" - value { - type: DT_INT32 - } - } -} -node { - name: "rnn/stacked_rnn_cells_2/MatMul" - op: "MatMul" - input: "rnn/stacked_rnn_cells_2/concat" - input: "rnn/stacked_rnn_cells_2/concat_1" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "transpose_a" - value { - b: false - } - } - attr { - key: "transpose_b" - value { - b: true - } - } -} -node { - name: "rnn/stacked_rnn_cells_2/BiasAdd" - op: "BiasAdd" - input: "rnn/stacked_rnn_cells_2/MatMul" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-12-None-input_bias" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "data_format" - value { - s: "NHWC" - } - } -} -node { - name: "rnn/stacked_rnn_cells_2/concat_2/axis" - op: "Const" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 1 - } - } - } -} -node { - name: "rnn/stacked_rnn_cells_2/concat_2" - op: "ConcatV2" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-2-None-input_to_forget_w" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-6-None-cell_to_forget_w" - input: "rnn/stacked_rnn_cells_2/concat_2/axis" - attr { - key: "N" - value { - i: 2 - } - } - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "Tidx" - value { - type: DT_INT32 - } - } -} -node { - name: "rnn/stacked_rnn_cells_2/MatMul_1" - op: "MatMul" - input: "rnn/stacked_rnn_cells_2/concat" - input: "rnn/stacked_rnn_cells_2/concat_2" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "transpose_a" - value { - b: false - } - } - attr { - key: "transpose_b" - value { - b: true - } - } -} -node { - name: "rnn/stacked_rnn_cells_2/BiasAdd_1" - op: "BiasAdd" - input: "rnn/stacked_rnn_cells_2/MatMul_1" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-13-None-forget_bias" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "data_format" - value { - s: "NHWC" - } - } -} -node { - name: "rnn/stacked_rnn_cells_2/concat_3/axis" - op: "Const" - attr { - key: "dtype" - value 
{ - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 1 - } - } - } -} -node { - name: "rnn/stacked_rnn_cells_2/concat_3" - op: "ConcatV2" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-4-None-input_to_output_w" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-8-None-cell_to_output_w" - input: "rnn/stacked_rnn_cells_2/concat_3/axis" - attr { - key: "N" - value { - i: 2 - } - } - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "Tidx" - value { - type: DT_INT32 - } - } -} -node { - name: "rnn/stacked_rnn_cells_2/MatMul_2" - op: "MatMul" - input: "rnn/stacked_rnn_cells_2/concat" - input: "rnn/stacked_rnn_cells_2/concat_3" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "transpose_a" - value { - b: false - } - } - attr { - key: "transpose_b" - value { - b: true - } - } -} -node { - name: "rnn/stacked_rnn_cells_2/BiasAdd_2" - op: "BiasAdd" - input: "rnn/stacked_rnn_cells_2/MatMul_2" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-15-None-output_bias" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "data_format" - value { - s: "NHWC" - } - } -} -node { - name: "rnn/stacked_rnn_cells_2/concat_4/axis" - op: "Const" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 1 - } - } - } -} -node { - name: "rnn/stacked_rnn_cells_2/concat_4" - op: "ConcatV2" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-3-None-input_to_cell_w" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-7-None-cell_to_cell_w" - input: "rnn/stacked_rnn_cells_2/concat_4/axis" - attr { - key: "N" - value { - i: 2 - } - } - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "Tidx" - value { - type: DT_INT32 - } - } -} -node { - name: "rnn/stacked_rnn_cells_2/MatMul_3" - op: "MatMul" - input: "rnn/stacked_rnn_cells_2/concat" - input: "rnn/stacked_rnn_cells_2/concat_4" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "transpose_a" - value { - b: false - } - } - attr { - key: "transpose_b" - value { - b: true - } - } -} -node { - name: "rnn/stacked_rnn_cells_2/BiasAdd_3" - op: "BiasAdd" - input: "rnn/stacked_rnn_cells_2/MatMul_3" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-14-None-cell_bias" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "data_format" - value { - s: "NHWC" - } - } -} -node { - name: "rnn/stacked_rnn_cells_2/mul" - op: "Mul" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-10-None-w_f_diag" - input: "rnn/stacked_rnn_cells_2/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-19-2-c_prev" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells_2/add" - op: "Add" - input: "rnn/stacked_rnn_cells_2/BiasAdd_1" - input: "rnn/stacked_rnn_cells_2/mul" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells_2/Sigmoid" - op: "Sigmoid" - input: "rnn/stacked_rnn_cells_2/add" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: 
"rnn/stacked_rnn_cells_2/mul_1" - op: "Mul" - input: "rnn/stacked_rnn_cells_2/Sigmoid" - input: "rnn/stacked_rnn_cells_2/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-19-2-c_prev" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells_2/mul_2" - op: "Mul" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-9-None-w_i_diag" - input: "rnn/stacked_rnn_cells_2/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-19-2-c_prev" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells_2/add_1" - op: "Add" - input: "rnn/stacked_rnn_cells_2/BiasAdd" - input: "rnn/stacked_rnn_cells_2/mul_2" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells_2/Sigmoid_1" - op: "Sigmoid" - input: "rnn/stacked_rnn_cells_2/add_1" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells_2/Tanh" - op: "Tanh" - input: "rnn/stacked_rnn_cells_2/BiasAdd_3" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells_2/mul_3" - op: "Mul" - input: "rnn/stacked_rnn_cells_2/Sigmoid_1" - input: "rnn/stacked_rnn_cells_2/Tanh" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells_2/add_2" - op: "Add" - input: "rnn/stacked_rnn_cells_2/mul_1" - input: "rnn/stacked_rnn_cells_2/mul_3" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells_2/mul_4" - op: "Mul" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-11-None-w_o_diag" - input: "rnn/stacked_rnn_cells_2/add_2" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells_2/add_3" - op: "Add" - input: "rnn/stacked_rnn_cells_2/BiasAdd_2" - input: "rnn/stacked_rnn_cells_2/mul_4" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells_2/Sigmoid_2" - op: "Sigmoid" - input: "rnn/stacked_rnn_cells_2/add_3" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells_2/Tanh_1" - op: "Tanh" - input: "rnn/stacked_rnn_cells_2/add_2" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells_2/mul_5" - op: "Mul" - input: "rnn/stacked_rnn_cells_2/Sigmoid_2" - input: "rnn/stacked_rnn_cells_2/Tanh_1" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells_2/OutputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-2-2-m" - op: "Identity" - input: "rnn/stacked_rnn_cells_2/mul_5" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tflite_function_aggregate" - value { - s: "stack" - } - } - attr { - key: "_tflite_function_name" - value { - s: "UnidirectionalSequenceLstm" - } - } - attr { - key: "_tflite_function_output_index" - value { - i: 2 - } - } - attr { - key: "_tflite_function_sort_index" - value { - i: 2 - } - } - attr { - key: "_tflite_function_uuid" - value { - s: "47eb6ae2de2411e9a4834201c0a80701" - } - } - attr { - key: "_tflite_ophint_level" - value { - i: 1 - } - } -} -node { - name: "rnn/stacked_rnn_cells_2/InputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-0-2-input" - op: "Identity" - input: "rnn/stacked_rnn_cells_2/OutputHint-UnidirectionalSequenceLstm-47eb6ae2de2411e9a4834201c0a80701-2-2-m" - attr { - key: "T" - value 
{ - type: DT_FLOAT - } - } - attr { - key: "_tflite_function_aggregate" - value { - s: "stack" - } - } - attr { - key: "_tflite_function_input_index" - value { - i: 0 - } - } - attr { - key: "_tflite_function_name" - value { - s: "UnidirectionalSequenceLstm" - } - } - attr { - key: "_tflite_function_sort_index" - value { - i: 2 - } - } - attr { - key: "_tflite_function_uuid" - value { - s: "47eb6ae3de2411e9a4834201c0a80701" - } - } - attr { - key: "_tflite_ophint_level" - value { - i: 1 - } - } -} -node { - name: "rnn/stacked_rnn_cells_2/InputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-19-2-c_prev" - op: "Identity" - input: "rnn/stacked_rnn_cells_1/OutputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-1-1-c" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tflite_function_aggregate" - value { - s: "first" - } - } - attr { - key: "_tflite_function_input_index" - value { - i: 19 - } - } - attr { - key: "_tflite_function_name" - value { - s: "UnidirectionalSequenceLstm" - } - } - attr { - key: "_tflite_function_sort_index" - value { - i: 2 - } - } - attr { - key: "_tflite_function_uuid" - value { - s: "47eb6ae3de2411e9a4834201c0a80701" - } - } - attr { - key: "_tflite_ophint_level" - value { - i: 1 - } - } -} -node { - name: "rnn/stacked_rnn_cells_2/InputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-18-2-m_prev" - op: "Identity" - input: "rnn/stacked_rnn_cells_1/OutputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-2-1-m" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tflite_function_aggregate" - value { - s: "first" - } - } - attr { - key: "_tflite_function_input_index" - value { - i: 18 - } - } - attr { - key: "_tflite_function_name" - value { - s: "UnidirectionalSequenceLstm" - } - } - attr { - key: "_tflite_function_sort_index" - value { - i: 2 - } - } - attr { - key: "_tflite_function_uuid" - value { - s: "47eb6ae3de2411e9a4834201c0a80701" - } - } - attr { - key: "_tflite_ophint_level" - value { - i: 1 - } - } -} -node { - name: "rnn/stacked_rnn_cells_2/concat_5/axis" - op: "Const" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 1 - } - } - } -} -node { - name: "rnn/stacked_rnn_cells_2/concat_5" - op: "ConcatV2" - input: "rnn/stacked_rnn_cells_2/InputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-0-2-input" - input: "rnn/stacked_rnn_cells_2/InputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-18-2-m_prev" - input: "rnn/stacked_rnn_cells_2/concat_5/axis" - attr { - key: "N" - value { - i: 2 - } - } - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "Tidx" - value { - type: DT_INT32 - } - } -} -node { - name: "rnn/stacked_rnn_cells_2/concat_6/axis" - op: "Const" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 1 - } - } - } -} -node { - name: "rnn/stacked_rnn_cells_2/concat_6" - op: "ConcatV2" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-1-None-input_to_input_w" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-5-None-cell_to_input_w" - input: "rnn/stacked_rnn_cells_2/concat_6/axis" - attr { - key: "N" - value { - i: 2 - } - } - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: 
"Tidx" - value { - type: DT_INT32 - } - } -} -node { - name: "rnn/stacked_rnn_cells_2/MatMul_4" - op: "MatMul" - input: "rnn/stacked_rnn_cells_2/concat_5" - input: "rnn/stacked_rnn_cells_2/concat_6" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "transpose_a" - value { - b: false - } - } - attr { - key: "transpose_b" - value { - b: true - } - } -} -node { - name: "rnn/stacked_rnn_cells_2/BiasAdd_4" - op: "BiasAdd" - input: "rnn/stacked_rnn_cells_2/MatMul_4" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-12-None-input_bias" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "data_format" - value { - s: "NHWC" - } - } -} -node { - name: "rnn/stacked_rnn_cells_2/concat_7/axis" - op: "Const" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 1 - } - } - } -} -node { - name: "rnn/stacked_rnn_cells_2/concat_7" - op: "ConcatV2" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-2-None-input_to_forget_w" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-6-None-cell_to_forget_w" - input: "rnn/stacked_rnn_cells_2/concat_7/axis" - attr { - key: "N" - value { - i: 2 - } - } - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "Tidx" - value { - type: DT_INT32 - } - } -} -node { - name: "rnn/stacked_rnn_cells_2/MatMul_5" - op: "MatMul" - input: "rnn/stacked_rnn_cells_2/concat_5" - input: "rnn/stacked_rnn_cells_2/concat_7" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "transpose_a" - value { - b: false - } - } - attr { - key: "transpose_b" - value { - b: true - } - } -} -node { - name: "rnn/stacked_rnn_cells_2/BiasAdd_5" - op: "BiasAdd" - input: "rnn/stacked_rnn_cells_2/MatMul_5" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-13-None-forget_bias" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "data_format" - value { - s: "NHWC" - } - } -} -node { - name: "rnn/stacked_rnn_cells_2/concat_8/axis" - op: "Const" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 1 - } - } - } -} -node { - name: "rnn/stacked_rnn_cells_2/concat_8" - op: "ConcatV2" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-4-None-input_to_output_w" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-8-None-cell_to_output_w" - input: "rnn/stacked_rnn_cells_2/concat_8/axis" - attr { - key: "N" - value { - i: 2 - } - } - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "Tidx" - value { - type: DT_INT32 - } - } -} -node { - name: "rnn/stacked_rnn_cells_2/MatMul_6" - op: "MatMul" - input: "rnn/stacked_rnn_cells_2/concat_5" - input: "rnn/stacked_rnn_cells_2/concat_8" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "transpose_a" - value { - b: false - } - } - attr { - key: "transpose_b" - value { - b: true - } - } -} -node { - name: "rnn/stacked_rnn_cells_2/BiasAdd_6" - op: "BiasAdd" - input: "rnn/stacked_rnn_cells_2/MatMul_6" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-15-None-output_bias" - attr { - key: "T" - value { - 
type: DT_FLOAT - } - } - attr { - key: "data_format" - value { - s: "NHWC" - } - } -} -node { - name: "rnn/stacked_rnn_cells_2/concat_9/axis" - op: "Const" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 1 - } - } - } -} -node { - name: "rnn/stacked_rnn_cells_2/concat_9" - op: "ConcatV2" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-3-None-input_to_cell_w" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-7-None-cell_to_cell_w" - input: "rnn/stacked_rnn_cells_2/concat_9/axis" - attr { - key: "N" - value { - i: 2 - } - } - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "Tidx" - value { - type: DT_INT32 - } - } -} -node { - name: "rnn/stacked_rnn_cells_2/MatMul_7" - op: "MatMul" - input: "rnn/stacked_rnn_cells_2/concat_5" - input: "rnn/stacked_rnn_cells_2/concat_9" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "transpose_a" - value { - b: false - } - } - attr { - key: "transpose_b" - value { - b: true - } - } -} -node { - name: "rnn/stacked_rnn_cells_2/BiasAdd_7" - op: "BiasAdd" - input: "rnn/stacked_rnn_cells_2/MatMul_7" - input: "rnn/stacked_rnn_cells/InputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-14-None-cell_bias" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "data_format" - value { - s: "NHWC" - } - } -} -node { - name: "rnn/stacked_rnn_cells_2/Sigmoid_3" - op: "Sigmoid" - input: "rnn/stacked_rnn_cells_2/BiasAdd_5" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells_2/mul_6" - op: "Mul" - input: "rnn/stacked_rnn_cells_2/Sigmoid_3" - input: "rnn/stacked_rnn_cells_2/InputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-19-2-c_prev" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells_2/Sigmoid_4" - op: "Sigmoid" - input: "rnn/stacked_rnn_cells_2/BiasAdd_4" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells_2/Tanh_2" - op: "Tanh" - input: "rnn/stacked_rnn_cells_2/BiasAdd_7" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells_2/mul_7" - op: "Mul" - input: "rnn/stacked_rnn_cells_2/Sigmoid_4" - input: "rnn/stacked_rnn_cells_2/Tanh_2" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells_2/add_4" - op: "Add" - input: "rnn/stacked_rnn_cells_2/mul_6" - input: "rnn/stacked_rnn_cells_2/mul_7" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells_2/Sigmoid_5" - op: "Sigmoid" - input: "rnn/stacked_rnn_cells_2/BiasAdd_6" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells_2/Tanh_3" - op: "Tanh" - input: "rnn/stacked_rnn_cells_2/add_4" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells_2/mul_8" - op: "Mul" - input: "rnn/stacked_rnn_cells_2/Sigmoid_5" - input: "rnn/stacked_rnn_cells_2/Tanh_3" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "rnn/stacked_rnn_cells_2/OutputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-2-2-m" - op: "Identity" - input: "rnn/stacked_rnn_cells_2/mul_8" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tflite_function_aggregate" - value { - 
s: "stack" - } - } - attr { - key: "_tflite_function_name" - value { - s: "UnidirectionalSequenceLstm" - } - } - attr { - key: "_tflite_function_output_index" - value { - i: 2 - } - } - attr { - key: "_tflite_function_sort_index" - value { - i: 2 - } - } - attr { - key: "_tflite_function_uuid" - value { - s: "47eb6ae3de2411e9a4834201c0a80701" - } - } - attr { - key: "_tflite_ophint_level" - value { - i: 1 - } - } -} -node { - name: "OUTPUT" - op: "Identity" - input: "rnn/stacked_rnn_cells_2/OutputHint-UnidirectionalSequenceLstm-47eb6ae3de2411e9a4834201c0a80701-2-2-m" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -library { -} - -# CHECK-LABEL: func @main -# CHECK-SAME: (%[[ARG_0:[a-z0-9]+]]: tensor<1x3x3xf32>) -> tensor<1x3xf32> -# CHECK-SAME: control_outputs = "" -# CHECK-SAME: inputs = "INPUT" -# CHECK-SAME: outputs = "OUTPUT" -# CHECK: [[VAL_1:%.*]] = constant dense<0.000000e+00> : tensor<1x3xf32> -# CHECK: [[VAL_2:%.*]] = constant dense<0.000000e+00> : tensor<3xf32> -# CHECK: [[VAL_3:%.*]] = constant dense<{{\[\[}}-0.856678485, -0.800494194, 0.716800689], [0.536404848, 0.541643381, -0.35657692], [-0.794646739, 0.137629032, 0.690013885]]> : tensor<3x3xf32> -# CHECK: [[VAL_4:%.*]] = constant dense<{{\[\[}}-0.125753641, 0.32271719, 0.488939524], [0.36119318, 0.982266664, -0.448646784], [0.966353893, -0.767024993, 0.446366787]]> : tensor<3x3xf32> -# CHECK: [[VAL_5:%.*]] = constant dense<{{\[\[}}0.891112089, -2.786560e-01, 0.966933965], [-0.789963722, 0.057955265, 0.217499971], [-0.698129416, -0.983400583, -0.834380626]]> : tensor<3x3xf32> -# CHECK: [[VAL_6:%.*]] = constant dense<{{\[\[}}0.782244444, -0.0446639061, 0.848498106], [-0.579102755, -0.407756329, 0.442389727], [0.00566458702, 0.5984025, 0.629857302]]> : tensor<3x3xf32> -# CHECK: [[VAL_7:%.*]] = constant dense<1.000000e+00> : tensor<3xf32> -# CHECK: [[VAL_8:%.*]] = constant dense<{{\[\[}}-0.616786718, 0.892614365, 0.671324968], [-0.842380046, -0.358094931, 0.821366549], [0.790347338, 0.71222949, 0.0690443515]]> : tensor<3x3xf32> -# CHECK: [[VAL_9:%.*]] = constant dense<{{\[\[}}-5.087240e-01, -0.588907719, 0.471896172], [-0.508019447, -0.0157074928, -0.804120779], [-0.978842973, 0.00160336494, -0.978532075]]> : tensor<3x3xf32> -# CHECK: [[VAL_10:%.*]] = constant dense<{{\[\[}}0.18183589, 0.616135359, -0.167827845], [0.734281301, 0.958347797, -0.878054618], [0.369523764, -0.969005823, -0.881014585]]> : tensor<3x3xf32> -# CHECK: [[VAL_11:%.*]] = constant dense<{{\[\[}}-0.936182261, -0.935433864, 0.288229942], [-0.243383884, -0.628288031, -0.477061749], [-0.514976501, -0.903514862, 6.728170e-01]]> : tensor<3x3xf32> -# CHECK: [[VAL_12:%.*]] = constant dense<{{\[}}0.403919935, -0.882057666, -0.894463062]> : tensor<3xf32> -# CHECK: [[VAL_13:%.*]] = constant dense<{{\[}}-0.671292543, 0.411814928, 0.560465336]> : tensor<3xf32> -# CHECK: [[VAL_14:%.*]] = constant dense<{{\[}}0.171322107, -0.153412342, 0.591750383]> : tensor<3xf32> -# CHECK: [[VAL_15:%.*]] = constant dense<{{\[\[}}-0.207589626, -0.756766081, -0.853258133], [-0.269270182, 0.0468223095, -0.353052378], [-0.0702953338, 0.0725159645, -0.817753077]]> : tensor<3x3xf32> -# CHECK: [[VAL_16:%.*]] = constant dense<{{\[\[}}0.230039358, -0.182297707, -0.352231741], [-0.805100203, -0.220300436, -0.669503212], [0.278807402, -0.201502323, -0.627609729]]> : tensor<3x3xf32> -# CHECK: [[VAL_17:%.*]] = constant dense<{{\[\[}}0.513064623, -0.692989588, 0.547988653], [0.0653710365, 0.576977491, 0.966733217], [0.0130724907, 0.247342348, 0.317092657]]> : tensor<3x3xf32> -# CHECK: 
[[VAL_18:%.*]] = constant dense<{{\[\[}}-0.138204336, -0.10879755, -0.135128736], [0.94797182, -8.713360e-01, -0.792336463], [0.0339827538, -0.539326906, 8.906350e-01]]> : tensor<3x3xf32> -# CHECK: [[VAL_19:%.*]] = constant dense<{{\[\[}}0.444335222, -0.133341789, 0.839591503], [0.445418358, -0.571707964, 0.569707394], [0.465010405, -0.990037918, -0.632481337]]> : tensor<3x3xf32> -# CHECK: [[VAL_20:%.*]] = constant dense<{{\[\[}}-0.493826151, -0.391061306, -0.349843264], [-0.0213134289, 0.558384657, -0.51513052], [0.427886248, 0.618100405, -0.187585592]]> : tensor<3x3xf32> -# CHECK: [[VAL_21:%.*]] = constant dense<{{\[\[}}0.886177539, -0.606141329, -0.451275587], [0.325554609, 0.691527605, -0.676239967], [0.219799042, 0.626042128, -0.597596407]]> : tensor<3x3xf32> -# CHECK: [[VAL_22:%.*]] = constant dense<{{\[\[}}-0.400154352, 0.739109992, 0.201825857], [0.678572893, 0.32076478, 0.949867963], [-0.807729483, -5.324750e-01, 0.148033619]]> : tensor<3x3xf32> -# CHECK: [[VAL_23:%.*]] = constant unit -# CHECK: [[UNPACK:%.*]]:3 = "tfl.unpack"(%arg0) {axis = 1 : i32, num = 3 : i32} : (tensor<1x3x3xf32>) -> (tensor<1x3xf32>, tensor<1x3xf32>, tensor<1x3xf32>) -# CHECK: [[PACK:%.*]] = "tfl.pack"([[UNPACK]]#0, [[UNPACK]]#1, [[UNPACK]]#2) {axis = 0 : i32, values_count = 3 : i32} : (tensor<1x3xf32>, tensor<1x3xf32>, tensor<1x3xf32>) -> tensor<3x1x3xf32> -# CHECK: [[VAL_24:%.*]] = constant dense<0.000000e+00> : tensor<1x3xf32> -# CHECK: [[UNIDIRECTIONAL_SEQUENCE_LSTM_1:%.*]] = "tfl.unidirectional_sequence_lstm"([[PACK]], [[VAL_16]], [[VAL_17]], [[VAL_18]], [[VAL_15]], [[VAL_20]], [[VAL_21]], [[VAL_22]], [[VAL_19]], [[VAL_13]], [[VAL_14]], [[VAL_12]], [[VAL_2]], [[VAL_7]], [[VAL_2]], [[VAL_2]], [[VAL_23]], [[VAL_23]], [[VAL_1]], [[VAL_24]], [[VAL_23]], [[VAL_23]], [[VAL_23]], [[VAL_23]]) {fused_activation_function = "TANH", time_major = true} : (tensor<3x1x3xf32>, tensor<3x3xf32>, tensor<3x3xf32>, tensor<3x3xf32>, tensor<3x3xf32>, tensor<3x3xf32>, tensor<3x3xf32>, tensor<3x3xf32>, tensor<3x3xf32>, tensor<3xf32>, tensor<3xf32>, tensor<3xf32>, tensor<3xf32>, tensor<3xf32>, tensor<3xf32>, tensor<3xf32>, none, none, tensor<1x3xf32>, tensor<1x3xf32>, none, none, none, none) -> tensor<3x1x3xf32> -# CHECK: [[VAL_25:%.*]] = constant dense<0.000000e+00> : tensor<1x3xf32> -# CHECK: [[VAL_26:%.*]] = constant dense<0.000000e+00> : tensor<1x3xf32> -# CHECK: [[UNIDIRECTIONAL_SEQUENCE_LSTM_2:%.*]] = "tfl.unidirectional_sequence_lstm"([[UNIDIRECTIONAL_SEQUENCE_LSTM_1]], [[VAL_4]], [[VAL_5]], [[VAL_6]], [[VAL_3]], [[VAL_9]], [[VAL_10]], [[VAL_11]], [[VAL_8]], [[VAL_23]], [[VAL_23]], [[VAL_23]], [[VAL_2]], [[VAL_7]], [[VAL_2]], [[VAL_2]], [[VAL_23]], [[VAL_23]], [[VAL_25]], [[VAL_26]], [[VAL_23]], [[VAL_23]], [[VAL_23]], [[VAL_23]]) {fused_activation_function = "TANH", time_major = true} : (tensor<3x1x3xf32>, tensor<3x3xf32>, tensor<3x3xf32>, tensor<3x3xf32>, tensor<3x3xf32>, tensor<3x3xf32>, tensor<3x3xf32>, tensor<3x3xf32>, tensor<3x3xf32>, none, none, none, tensor<3xf32>, tensor<3xf32>, tensor<3xf32>, tensor<3xf32>, none, none, tensor<1x3xf32>, tensor<1x3xf32>, none, none, none, none) -> tensor<3x1x3xf32> -# CHECK: [[RESULT:%.*]]:3 = "tfl.unpack"([[UNIDIRECTIONAL_SEQUENCE_LSTM_2]]) {axis = 0 : i32, num = 3 : i32} : (tensor<3x1x3xf32>) -> (tensor<1x3xf32>, tensor<1x3xf32>, tensor<1x3xf32>) -# CHECK: return [[RESULT]]#2 : tensor<1x3xf32> diff --git a/tensorflow/compiler/mlir/lite/tests/extract-ophint.mlir b/tensorflow/compiler/mlir/lite/tests/extract-ophint.mlir deleted file mode 100644 index a18ba9cd91a..00000000000 --- 
a/tensorflow/compiler/mlir/lite/tests/extract-ophint.mlir +++ /dev/null @@ -1,201 +0,0 @@ -// RUN: tf-opt -tfl-extract-ophint %s -split-input-file -verify-diagnostics | FileCheck %s - -// CHECK-LABEL: extractSimpleOphint -func @extractSimpleOphint() { -// CHECK: %[[OP_HINT_CALL:[0-9]*]] = call @d4b1eb00b81211e99426dc4a3e957995(%0) : (tensor<1x16x16x1xf32>) -> tensor<1x16x16x1xf32> -// CHECK: %[[OUTPUT:[0-9]*]] = "tf.Identity"(%[[OP_HINT_CALL]]) {T = "tfdtype$DT_FLOAT", _tflite_function_name = "cool_activation", _tflite_function_output_index = 0 : i64, _tflite_function_uuid = "d4b1eb00b81211e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "OutputHint-cool_activation-d4b1eb00b81211e99426dc4a3e957995-0-None-None"} : (tensor<1x16x16x1xf32>) -> tensor<1x16x16x1xf32> - - %0 = "tf.Placeholder"() {dtype = "tfdtype$DT_FLOAT", name = "Placeholder", shape = "tfshape$dim { size: 1 } dim { size: 16 } dim { size: 16 } dim { size: 1 }"} : () -> tensor<1x16x16x1xf32> - %1 = "tf.Identity"(%0) {T = "tfdtype$DT_FLOAT", _tflite_function_input_index = 0 : i64, _tflite_function_name = "cool_activation", _tflite_function_uuid = "d4b1eb00b81211e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "InputHint-cool_activation-d4b1eb00b81211e99426dc4a3e957995-0-None-None"} : (tensor<1x16x16x1xf32>) -> tensor<1x16x16x1xf32> - %2 = "tf.Sigmoid"(%1) {T = "tfdtype$DT_FLOAT", name = "Sigmoid"} : (tensor<1x16x16x1xf32>) -> tensor<1x16x16x1xf32> - %3 = "tf.Mul"(%2, %1) {T = "tfdtype$DT_FLOAT", name = "mul"} : (tensor<1x16x16x1xf32>, tensor<1x16x16x1xf32>) -> tensor<1x16x16x1xf32> - %4 = "tf.Identity"(%3) {T = "tfdtype$DT_FLOAT", _tflite_function_name = "cool_activation", _tflite_function_output_index = 0 : i64, _tflite_function_uuid = "d4b1eb00b81211e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "OutputHint-cool_activation-d4b1eb00b81211e99426dc4a3e957995-0-None-None"} : (tensor<1x16x16x1xf32>) -> tensor<1x16x16x1xf32> - return -} - -// CHECK: func @d4b1eb00b81211e99426dc4a3e957995(tensor<1x16x16x1xf32>) -> tensor<1x16x16x1xf32> -// CHECK: attributes {_tflite_function_input_index = [0 : i32], _tflite_function_name = "cool_activation"} - -// ----- - -// CHECK-LABEL: extractPackedInputOphint -func @extractPackedInputOphint() { -// CHECK: %[[PACK:[0-9]*]] = "tfl.pack"(%0, %1) {axis = 0 : i32, values_count = 2 : i32} : (tensor<1x16x1xf32>, tensor<1x16x1xf32>) -> tensor<2x1x16x1xf32> -// CHECK: %[[OP_HINT_CALL:[0-9]*]] = call @"47393154b9af11e99426dc4a3e957995"(%[[PACK]]) : (tensor<2x1x16x1xf32>) -> tensor<1x16x1xf32> -// CHECK: %[[OUTPUT:[0-9]*]] = "tf.Identity"(%[[OP_HINT_CALL]]) {T = "tfdtype$DT_FLOAT", _tflite_function_name = "cool_activation_stack", _tflite_function_output_index = 0 : i64, _tflite_function_uuid = "47393154b9af11e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "OutputHint-cool_activation_stack-47393154b9af11e99426dc4a3e957995-0-None-None"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> - - %0 = "tf.Placeholder"() {dtype = "tfdtype$DT_FLOAT", name = "Placeholder", shape = "tfshape$dim { size: 1 } dim { size: 16 } dim { size: 1 }"} : () -> tensor<1x16x1xf32> - %1 = "tf.Identity"(%0) {T = "tfdtype$DT_FLOAT", _tflite_function_aggregate = "stack", _tflite_function_input_index = 0 : i64, _tflite_function_name = "cool_activation_stack", _tflite_function_sort_index = 0 : i64, _tflite_function_uuid = "47393154b9af11e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "InputHint-cool_activation_stack-47393154b9af11e99426dc4a3e957995-0-0-None"} : (tensor<1x16x1xf32>) -> 
tensor<1x16x1xf32> - %2 = "tf.Sigmoid"(%1) {T = "tfdtype$DT_FLOAT", name = "Sigmoid"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> - %3 = "tf.Placeholder"() {dtype = "tfdtype$DT_FLOAT", name = "Placeholder_1", shape = "tfshape$dim { size: 1 } dim { size: 16 } dim { size: 1 }"} : () -> tensor<1x16x1xf32> - %4 = "tf.Identity"(%3) {T = "tfdtype$DT_FLOAT", _tflite_function_aggregate = "stack", _tflite_function_input_index = 0 : i64, _tflite_function_name = "cool_activation_stack", _tflite_function_sort_index = 1 : i64, _tflite_function_uuid = "47393154b9af11e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "InputHint-cool_activation_stack-47393154b9af11e99426dc4a3e957995-0-1-None"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> - %5 = "tf.Sigmoid"(%4) {T = "tfdtype$DT_FLOAT", name = "Sigmoid_1"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> - %6 = "tf.Mul"(%2, %5) {T = "tfdtype$DT_FLOAT", name = "mul"} : (tensor<1x16x1xf32>, tensor<1x16x1xf32>) -> tensor<1x16x1xf32> - %7 = "tf.Identity"(%6) {T = "tfdtype$DT_FLOAT", _tflite_function_name = "cool_activation_stack", _tflite_function_output_index = 0 : i64, _tflite_function_uuid = "47393154b9af11e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "OutputHint-cool_activation_stack-47393154b9af11e99426dc4a3e957995-0-None-None"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> - return -} - -// CHECK: func @"47393154b9af11e99426dc4a3e957995"(tensor<2x1x16x1xf32>) -> tensor<1x16x1xf32> -// CHECK: attributes {_tflite_function_input_index = [0 : i32], _tflite_function_name = "cool_activation_stack"} - -// ----- - -// CHECK-LABEL: extractFirstInputOphint -func @extractFirstInputOphint() { -// CHECK: %[[OP_HINT_CALL:[0-9]*]] = call @b703f0f4b9ec11e99426dc4a3e957995(%0) : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> -// CHECK: %[[OUTPUT:[0-9]*]] = "tf.Identity"(%[[OP_HINT_CALL]]) {T = "tfdtype$DT_FLOAT", _tflite_function_name = "cool_activation_first", _tflite_function_output_index = 0 : i64, _tflite_function_uuid = "b703f0f4b9ec11e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "OutputHint-cool_activation_first-b703f0f4b9ec11e99426dc4a3e957995-0-None-None"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> - - %0 = "tf.Placeholder"() {dtype = "tfdtype$DT_FLOAT", name = "Placeholder", shape = "tfshape$dim { size: 1 } dim { size: 16 } dim { size: 1 }"} : () -> tensor<1x16x1xf32> - %1 = "tf.Identity"(%0) {T = "tfdtype$DT_FLOAT", _tflite_function_aggregate = "first", _tflite_function_input_index = 0 : i64, _tflite_function_name = "cool_activation_first", _tflite_function_sort_index = 0 : i64, _tflite_function_uuid = "b703f0f4b9ec11e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "InputHint-cool_activation_first-b703f0f4b9ec11e99426dc4a3e957995-0-0-None"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> - %2 = "tf.Sigmoid"(%1) {T = "tfdtype$DT_FLOAT", name = "Sigmoid"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> - %3 = "tf.Placeholder"() {dtype = "tfdtype$DT_FLOAT", name = "Placeholder_1", shape = "tfshape$dim { size: 1 } dim { size: 16 } dim { size: 1 }"} : () -> tensor<1x16x1xf32> - %4 = "tf.Identity"(%3) {T = "tfdtype$DT_FLOAT", _tflite_function_aggregate = "first", _tflite_function_input_index = 0 : i64, _tflite_function_name = "cool_activation_first", _tflite_function_sort_index = 1 : i64, _tflite_function_uuid = "b703f0f4b9ec11e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "InputHint-cool_activation_first-b703f0f4b9ec11e99426dc4a3e957995-0-1-None"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> - %5 = "tf.Sigmoid"(%4) {T 
= "tfdtype$DT_FLOAT", name = "Sigmoid_1"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> - %6 = "tf.Mul"(%2, %5) {T = "tfdtype$DT_FLOAT", name = "mul"} : (tensor<1x16x1xf32>, tensor<1x16x1xf32>) -> tensor<1x16x1xf32> - %7 = "tf.Identity"(%6) {T = "tfdtype$DT_FLOAT", _tflite_function_name = "cool_activation_first", _tflite_function_output_index = 0 : i64, _tflite_function_uuid = "b703f0f4b9ec11e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "OutputHint-cool_activation_first-b703f0f4b9ec11e99426dc4a3e957995-0-None-None"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> - return -} - -// CHECK: func @b703f0f4b9ec11e99426dc4a3e957995(tensor<1x16x1xf32>) -> tensor<1x16x1xf32> -// CHECK: attributes {_tflite_function_input_index = [0 : i32], _tflite_function_name = "cool_activation_first"} - -// ----- - -// CHECK-LABEL: extractLastInputOphint -func @extractLastInputOphint() { -// CHECK: %[[OP_HINT_CALL:[0-9]*]] = call @e31fcf90b9ed11e99426dc4a3e957995(%1) : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> -// CHECK: %[[OUTPUT:[0-9]*]] = "tf.Identity"(%[[OP_HINT_CALL]]) {T = "tfdtype$DT_FLOAT", _tflite_function_name = "cool_activation_last", _tflite_function_output_index = 0 : i64, _tflite_function_uuid = "e31fcf90b9ed11e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "OutputHint-cool_activation_last-e31fcf90b9ed11e99426dc4a3e957995-0-None-None"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> - - %0 = "tf.Placeholder"() {dtype = "tfdtype$DT_FLOAT", name = "Placeholder", shape = "tfshape$dim { size: 1 } dim { size: 16 } dim { size: 1 }"} : () -> tensor<1x16x1xf32> - %1 = "tf.Identity"(%0) {T = "tfdtype$DT_FLOAT", _tflite_function_aggregate = "last", _tflite_function_input_index = 0 : i64, _tflite_function_name = "cool_activation_last", _tflite_function_sort_index = 0 : i64, _tflite_function_uuid = "e31fcf90b9ed11e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "InputHint-cool_activation_last-e31fcf90b9ed11e99426dc4a3e957995-0-0-None"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> - %2 = "tf.Sigmoid"(%1) {T = "tfdtype$DT_FLOAT", name = "Sigmoid"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> - %3 = "tf.Placeholder"() {dtype = "tfdtype$DT_FLOAT", name = "Placeholder_1", shape = "tfshape$dim { size: 1 } dim { size: 16 } dim { size: 1 }"} : () -> tensor<1x16x1xf32> - %4 = "tf.Identity"(%3) {T = "tfdtype$DT_FLOAT", _tflite_function_aggregate = "last", _tflite_function_input_index = 0 : i64, _tflite_function_name = "cool_activation_last", _tflite_function_sort_index = 1 : i64, _tflite_function_uuid = "e31fcf90b9ed11e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "InputHint-cool_activation_last-e31fcf90b9ed11e99426dc4a3e957995-0-1-None"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> - %5 = "tf.Sigmoid"(%4) {T = "tfdtype$DT_FLOAT", name = "Sigmoid_1"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> - %6 = "tf.Mul"(%2, %5) {T = "tfdtype$DT_FLOAT", name = "mul"} : (tensor<1x16x1xf32>, tensor<1x16x1xf32>) -> tensor<1x16x1xf32> - %7 = "tf.Identity"(%6) {T = "tfdtype$DT_FLOAT", _tflite_function_name = "cool_activation_last", _tflite_function_output_index = 0 : i64, _tflite_function_uuid = "e31fcf90b9ed11e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "OutputHint-cool_activation_last-e31fcf90b9ed11e99426dc4a3e957995-0-None-None"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> - return -} - -// CHECK: func @e31fcf90b9ed11e99426dc4a3e957995(tensor<1x16x1xf32>) -> tensor<1x16x1xf32> -// CHECK: attributes {_tflite_function_input_index = [0 : i32], _tflite_function_name = 
"cool_activation_last"} - -// ----- - -// CHECK-LABEL: extractPackOneInputOphint -func @extractPackOneInputOphint() { -// CHECK: %[[CST:.*]] = constant dense<[1, 1, 16, 1]> : tensor<4xi32> -// CHECK: %[[RESHAPE:[0-9]*]] = "tfl.reshape"(%0, %[[CST]]) : (tensor<1x16x1xf32>, tensor<4xi32>) -> tensor<1x1x16x1xf32> -// CHECK: %[[OP_HINT_CALL:[0-9]*]] = call @"33fab028b9ef11e99426dc4a3e957995"(%[[RESHAPE]]) : (tensor<1x1x16x1xf32>) -> tensor<1x16x1xf32> -// CHECK: %[[OUTPUT:[0-9]*]] = "tf.Identity"(%[[OP_HINT_CALL]]) {T = "tfdtype$DT_FLOAT", _tflite_function_name = "cool_activation_pack_input_one", _tflite_function_output_index = 0 : i64, _tflite_function_uuid = "33fab028b9ef11e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "OutputHint-cool_activation_pack_input_one-33fab028b9ef11e99426dc4a3e957995-0-None-None"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> - - %0 = "tf.Placeholder"() {dtype = "tfdtype$DT_FLOAT", name = "Placeholder", shape = "tfshape$dim { size: 1 } dim { size: 16 } dim { size: 1 }"} : () -> tensor<1x16x1xf32> - %1 = "tf.Identity"(%0) {T = "tfdtype$DT_FLOAT", _tflite_function_aggregate = "stack", _tflite_function_input_index = 0 : i64, _tflite_function_name = "cool_activation_pack_input_one", _tflite_function_sort_index = 0 : i64, _tflite_function_uuid = "33fab028b9ef11e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "InputHint-cool_activation_pack_input_one-33fab028b9ef11e99426dc4a3e957995-0-0-None"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> - %2 = "tf.Sigmoid"(%1) {T = "tfdtype$DT_FLOAT", name = "Sigmoid"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> - %3 = "tf.Identity"(%2) {T = "tfdtype$DT_FLOAT", _tflite_function_name = "cool_activation_pack_input_one", _tflite_function_output_index = 0 : i64, _tflite_function_uuid = "33fab028b9ef11e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "OutputHint-cool_activation_pack_input_one-33fab028b9ef11e99426dc4a3e957995-0-None-None"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> - return -} - -// CHECK: func @"33fab028b9ef11e99426dc4a3e957995"(tensor<1x1x16x1xf32>) -> tensor<1x16x1xf32> -// CHECK: attributes {_tflite_function_input_index = [0 : i32], _tflite_function_name = "cool_activation_pack_input_one"} - -// ----- - -// CHECK-LABEL: extractStackInputOutputOphint -func @extractStackInputOutputOphint() { -// CHECK: %[[PACK:[0-9]*]] = "tfl.pack"(%0, %1) {axis = 0 : i32, values_count = 2 : i32} : (tensor<1x16x1xf32>, tensor<1x16x1xf32>) -> tensor<2x1x16x1xf32> -// CHECK: %[[OP_HINT_CALL:[0-9]*]] = call @b92ed354b9f011e99426dc4a3e957995(%[[PACK]]) : (tensor<2x1x16x1xf32>) -> tensor<2x1x16x1xf32> -// CHECK: %[[UNPACK:[0-9]*]]:2 = "tfl.unpack"(%[[OP_HINT_CALL]]) {axis = 0 : i32, num = 2 : i32} : (tensor<2x1x16x1xf32>) -> (tensor<1x16x1xf32>, tensor<1x16x1xf32>) -// CHECK-DAG: %[[OUTPUT:[0-9]*]] = "tf.Identity"(%[[UNPACK]]#1) {T = "tfdtype$DT_FLOAT", _tflite_function_aggregate = "stack", _tflite_function_name = "cool_activation_stack_input_output", _tflite_function_output_index = 0 : i64, _tflite_function_sort_index = 1 : i64, _tflite_function_uuid = "b92ed354b9f011e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "OutputHint-cool_activation_stack_input_output-b92ed354b9f011e99426dc4a3e957995-0-1-None"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> -// CHECK-DAG: %[[OUTPUT_1:[0-9]*]] = "tf.Identity"(%[[UNPACK]]#0) {T = "tfdtype$DT_FLOAT", _tflite_function_aggregate = "stack", _tflite_function_name = "cool_activation_stack_input_output", _tflite_function_output_index = 0 : i64, 
_tflite_function_sort_index = 0 : i64, _tflite_function_uuid = "b92ed354b9f011e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "OutputHint-cool_activation_stack_input_output-b92ed354b9f011e99426dc4a3e957995-0-0-None"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> - - %0 = "tf.Placeholder"() {dtype = "tfdtype$DT_FLOAT", name = "Placeholder", shape = "tfshape$dim { size: 1 } dim { size: 16 } dim { size: 1 }"} : () -> tensor<1x16x1xf32> - %1 = "tf.Identity"(%0) {T = "tfdtype$DT_FLOAT", _tflite_function_aggregate = "stack", _tflite_function_input_index = 0 : i64, _tflite_function_name = "cool_activation_stack_input_output", _tflite_function_sort_index = 0 : i64, _tflite_function_uuid = "b92ed354b9f011e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "InputHint-cool_activation_stack_input_output-b92ed354b9f011e99426dc4a3e957995-0-0-None"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> - %2 = "tf.Sigmoid"(%1) {T = "tfdtype$DT_FLOAT", name = "Sigmoid"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> - %3 = "tf.Placeholder"() {dtype = "tfdtype$DT_FLOAT", name = "Placeholder_1", shape = "tfshape$dim { size: 1 } dim { size: 16 } dim { size: 1 }"} : () -> tensor<1x16x1xf32> - %4 = "tf.Identity"(%3) {T = "tfdtype$DT_FLOAT", _tflite_function_aggregate = "stack", _tflite_function_input_index = 0 : i64, _tflite_function_name = "cool_activation_stack_input_output", _tflite_function_sort_index = 1 : i64, _tflite_function_uuid = "b92ed354b9f011e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "InputHint-cool_activation_stack_input_output-b92ed354b9f011e99426dc4a3e957995-0-1-None"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> - %5 = "tf.Sigmoid"(%4) {T = "tfdtype$DT_FLOAT", name = "Sigmoid_1"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> - %6 = "tf.Mul"(%2, %5) {T = "tfdtype$DT_FLOAT", name = "mul"} : (tensor<1x16x1xf32>, tensor<1x16x1xf32>) -> tensor<1x16x1xf32> - %7 = "tf.Identity"(%6) {T = "tfdtype$DT_FLOAT", _tflite_function_aggregate = "stack", _tflite_function_name = "cool_activation_stack_input_output", _tflite_function_output_index = 0 : i64, _tflite_function_sort_index = 0 : i64, _tflite_function_uuid = "b92ed354b9f011e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "OutputHint-cool_activation_stack_input_output-b92ed354b9f011e99426dc4a3e957995-0-0-None"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> - %8 = "tf.Sigmoid"(%4) {T = "tfdtype$DT_FLOAT", name = "Sigmoid_2"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> - %9 = "tf.Sigmoid"(%4) {T = "tfdtype$DT_FLOAT", name = "Sigmoid_3"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> - %10 = "tf.Add"(%8, %9) {T = "tfdtype$DT_FLOAT", name = "add"} : (tensor<1x16x1xf32>, tensor<1x16x1xf32>) -> tensor<1x16x1xf32> - %11 = "tf.Identity"(%10) {T = "tfdtype$DT_FLOAT", _tflite_function_aggregate = "stack", _tflite_function_name = "cool_activation_stack_input_output", _tflite_function_output_index = 0 : i64, _tflite_function_sort_index = 1 : i64, _tflite_function_uuid = "b92ed354b9f011e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "OutputHint-cool_activation_stack_input_output-b92ed354b9f011e99426dc4a3e957995-0-1-None"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> - return -} - -// CHECK: func @b92ed354b9f011e99426dc4a3e957995(tensor<2x1x16x1xf32>) -> tensor<2x1x16x1xf32> -// CHECK: attributes {_tflite_function_input_index = [0 : i32], _tflite_function_name = "cool_activation_stack_input_output"} - -// ----- - -// CHECK-LABEL: extractMultipleInputsOutputsOphint -func @extractMultipleInputsOutputsOphint() { -// CHECK: 
%[[MULTI_INPUT_CALL:[0-9]*]]:2 = call @a6ca45beb9f411e99426dc4a3e957995(%0, %1) : (tensor<1x16x1xf32>, tensor<1x16x1xf32>) -> (tensor<1x16x1xf32>, tensor<1x16x1xf32>) - - %0 = "tf.Placeholder"() {dtype = "tfdtype$DT_FLOAT", name = "Placeholder", shape = "tfshape$dim { size: 1 } dim { size: 16 } dim { size: 1 }"} : () -> tensor<1x16x1xf32> - %1 = "tf.Identity"(%0) {T = "tfdtype$DT_FLOAT", _tflite_function_input_index = 0 : i64, _tflite_function_name = "cool_activation_multiple_input_output", _tflite_function_uuid = "a6ca45beb9f411e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "InputHint-cool_activation_multiple_input_output-a6ca45beb9f411e99426dc4a3e957995-0-None-None"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> - %2 = "tf.Sigmoid"(%1) {T = "tfdtype$DT_FLOAT", name = "Sigmoid"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> - %3 = "tf.Placeholder"() {dtype = "tfdtype$DT_FLOAT", name = "Placeholder_1", shape = "tfshape$dim { size: 1 } dim { size: 16 } dim { size: 1 }"} : () -> tensor<1x16x1xf32> - %4 = "tf.Identity"(%3) {T = "tfdtype$DT_FLOAT", _tflite_function_input_index = 1 : i64, _tflite_function_name = "cool_activation_multiple_input_output", _tflite_function_uuid = "a6ca45beb9f411e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "InputHint-cool_activation_multiple_input_output-a6ca45beb9f411e99426dc4a3e957995-1-None-None"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> - %5 = "tf.Sigmoid"(%4) {T = "tfdtype$DT_FLOAT", name = "Sigmoid_1"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> - %6 = "tf.Mul"(%2, %5) {T = "tfdtype$DT_FLOAT", name = "mul"} : (tensor<1x16x1xf32>, tensor<1x16x1xf32>) -> tensor<1x16x1xf32> - %7 = "tf.Identity"(%6) {T = "tfdtype$DT_FLOAT", _tflite_function_name = "cool_activation_multiple_input_output", _tflite_function_output_index = 0 : i64, _tflite_function_uuid = "a6ca45beb9f411e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "OutputHint-cool_activation_multiple_input_output-a6ca45beb9f411e99426dc4a3e957995-0-None-None"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> - %8 = "tf.Sigmoid"(%4) {T = "tfdtype$DT_FLOAT", name = "Sigmoid_2"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> - %9 = "tf.Sigmoid"(%4) {T = "tfdtype$DT_FLOAT", name = "Sigmoid_3"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> - %10 = "tf.Add"(%8, %9) {T = "tfdtype$DT_FLOAT", name = "add"} : (tensor<1x16x1xf32>, tensor<1x16x1xf32>) -> tensor<1x16x1xf32> - %11 = "tf.Identity"(%10) {T = "tfdtype$DT_FLOAT", _tflite_function_name = "cool_activation_multiple_input_output", _tflite_function_output_index = 1 : i64, _tflite_function_uuid = "a6ca45beb9f411e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "OutputHint-cool_activation_multiple_input_output-a6ca45beb9f411e99426dc4a3e957995-1-None-None"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> - return -} - -// CHECK: func @a6ca45beb9f411e99426dc4a3e957995(tensor<1x16x1xf32>, tensor<1x16x1xf32>) -> (tensor<1x16x1xf32>, tensor<1x16x1xf32>) -// CHECK: attributes {_tflite_function_input_index = [0 : i32, 1 : i32], _tflite_function_name = "cool_activation_multiple_input_output"} - -// ----- - -// CHECK-LABEL: inputsAfterOutputs -func @inputsAfterOutputs() { -// CHECK: %[[PLACE_HOLDER:[0-9]*]] = "tf.Placeholder"() {device = "", dtype = "tfdtype$DT_FLOAT", name = "Placeholder_1", shape = "tfshape$dim { size: 2 } dim { size: 2 }"} : () -> tensor<2x2xf32> -// CHECK: %[[INPUT_PROCESS:[0-9]*]] = "tf.Sigmoid"(%[[PLACE_HOLDER]]) {T = "tfdtype$DT_FLOAT", device = "", name = "Sigmoid"} : (tensor<2x2xf32>) -> tensor<2x2xf32> -// CHECK: 
%[[OP_HINT_CALL:[0-9]*]]:2 = call @d6266124d2dd11e9b52cdc4a3e957995(%0, %1, %[[INPUT_PROCESS]]) : (tensor<2x2xf32>, tensor, tensor<2x2xf32>) -> (tensor<2x2xf32>, tensor<2x2xf32>) - - %0 = "tf.Const"() {device = "", dtype = "tfdtype$DT_FLOAT", name = "Const", value = dense<0.000000e+00> : tensor} : () -> tensor - %1 = "tf.Identity"(%0) {T = "tfdtype$DT_FLOAT", _tflite_function_input_index = 1 : i64, _tflite_function_name = "CustomOp", _tflite_function_uuid = "d6266124d2dd11e9b52cdc4a3e957995", _tflite_ophint_level = 1 : i64, device = "", name = "InputHint-CustomOp-d6266124d2dd11e9b52cdc4a3e957995-1-None-None"} : (tensor) -> tensor - %2 = "tf.Placeholder"() {device = "", dtype = "tfdtype$DT_FLOAT", name = "Placeholder", shape = "tfshape$dim { size: 2 } dim { size: 2 }"} : () -> tensor<2x2xf32> - %3 = "tf.Identity"(%2) {T = "tfdtype$DT_FLOAT", _tflite_function_input_index = 0 : i64, _tflite_function_name = "CustomOp", _tflite_function_uuid = "d6266124d2dd11e9b52cdc4a3e957995", _tflite_ophint_level = 1 : i64, device = "", name = "InputHint-CustomOp-d6266124d2dd11e9b52cdc4a3e957995-0-None-None"} : (tensor<2x2xf32>) -> tensor<2x2xf32> - %4 = "tf.Add"(%3, %1) {T = "tfdtype$DT_FLOAT", device = "", name = "Add"} : (tensor<2x2xf32>, tensor) -> tensor<2x2xf32> - %5 = "tf.Identity"(%4) {T = "tfdtype$DT_FLOAT", _tflite_function_name = "CustomOp", _tflite_function_output_index = 0 : i64, _tflite_function_uuid = "d6266124d2dd11e9b52cdc4a3e957995", _tflite_ophint_level = 1 : i64, device = "", name = "OutputHint-CustomOp-d6266124d2dd11e9b52cdc4a3e957995-0-None-None"} : (tensor<2x2xf32>) -> tensor<2x2xf32> - %6 = "tf.Placeholder"() {device = "", dtype = "tfdtype$DT_FLOAT", name = "Placeholder_1", shape = "tfshape$dim { size: 2 } dim { size: 2 }"} : () -> tensor<2x2xf32> - %7 = "tf.Sigmoid"(%6) {T = "tfdtype$DT_FLOAT", device = "", name = "Sigmoid"} : (tensor<2x2xf32>) -> tensor<2x2xf32> - %8 = "tf.Identity"(%7) {T = "tfdtype$DT_FLOAT", _tflite_function_input_index = 2 : i64, _tflite_function_name = "CustomOp", _tflite_function_uuid = "d6266124d2dd11e9b52cdc4a3e957995", _tflite_ophint_level = 1 : i64, device = "", name = "InputHint-CustomOp-d6266124d2dd11e9b52cdc4a3e957995-2-None-None"} : (tensor<2x2xf32>) -> tensor<2x2xf32> - %9 = "tf.Add"(%5, %8) {T = "tfdtype$DT_FLOAT", device = "", name = "Add_1"} : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> - %10 = "tf.Identity"(%9) {T = "tfdtype$DT_FLOAT", _tflite_function_name = "CustomOp", _tflite_function_output_index = 1 : i64, _tflite_function_uuid = "d6266124d2dd11e9b52cdc4a3e957995", _tflite_ophint_level = 1 : i64, device = "", name = "OutputHint-CustomOp-d6266124d2dd11e9b52cdc4a3e957995-1-None-None"} : (tensor<2x2xf32>) -> tensor<2x2xf32> - return -} - -// CHECK: func @d6266124d2dd11e9b52cdc4a3e957995(tensor<2x2xf32>, tensor, tensor<2x2xf32>) -> (tensor<2x2xf32>, tensor<2x2xf32>) -// CHECK: attributes {_tflite_function_input_index = [0 : i32, 1 : i32, 2 : i32], _tflite_function_name = "CustomOp"} - -// ----- - -module { -func @extractOphintSame() { - %0 = "tf.Placeholder"() {dtype = "tfdtype$DT_FLOAT", name = "Placeholder", shape = "tfshape$dim { size: 1 } dim { size: 16 } dim { size: 16 } dim { size: 1 }"} : () -> tensor<1x16x16x1xf32> - %1 = call @AnotherFunc(%0) : (tensor<1x16x16x1xf32>) -> tensor<1x16x16x1xf32> - %2 = "tf.Sigmoid"(%1) {T = "tfdtype$DT_FLOAT", name = "Sigmoid"} : (tensor<1x16x16x1xf32>) -> tensor<1x16x16x1xf32> - %3 = "tf.Mul"(%2, %1) {T = "tfdtype$DT_FLOAT", name = "mul"} : (tensor<1x16x16x1xf32>, tensor<1x16x16x1xf32>) -> 
tensor<1x16x16x1xf32> - %4 = "tf.Identity"(%3) {T = "tfdtype$DT_FLOAT", _tflite_function_name = "cool_activation", _tflite_function_output_index = 0 : i64, _tflite_function_uuid = "d4b1eb00b81211e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "OutputHint-cool_activation-d4b1eb00b81211e99426dc4a3e957995-0-None-None"} : (tensor<1x16x16x1xf32>) -> tensor<1x16x16x1xf32> - return - -// CHECK: [[VAL_0:%.*]] = "tf.Placeholder"() {dtype = "tfdtype$DT_FLOAT", name = "Placeholder", shape = "tfshape$dim { size: 1 } dim { size: 16 } dim { size: 16 } dim { size: 1 }"} : () -> tensor<1x16x16x1xf32> -// CHECK: [[VAL_1:%.*]] = call @AnotherFunc([[VAL_0]]) : (tensor<1x16x16x1xf32>) -> tensor<1x16x16x1xf32> -// CHECK: [[VAL_2:%.*]] = "tf.Sigmoid"([[VAL_1]]) {T = "tfdtype$DT_FLOAT", name = "Sigmoid"} : (tensor<1x16x16x1xf32>) -> tensor<1x16x16x1xf32> -// CHECK: [[VAL_3:%.*]] = "tf.Mul"([[VAL_2]], [[VAL_1]]) {T = "tfdtype$DT_FLOAT", name = "mul"} : (tensor<1x16x16x1xf32>, tensor<1x16x16x1xf32>) -> tensor<1x16x16x1xf32> -// CHECK: [[VAL_4:%.*]] = "tf.Identity"([[VAL_3]]) {T = "tfdtype$DT_FLOAT", _tflite_function_name = "cool_activation", _tflite_function_output_index = 0 : i64, _tflite_function_uuid = "d4b1eb00b81211e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "OutputHint-cool_activation-d4b1eb00b81211e99426dc4a3e957995-0-None-None"} : (tensor<1x16x16x1xf32>) -> tensor<1x16x16x1xf32> -} - -func @AnotherFunc(%arg0: tensor<1x16x16x1xf32>) -> tensor<1x16x16x1xf32> { - %0 = "tf.Identity"(%arg0) {T = "tfdtype$DT_FLOAT", _tflite_function_input_index = 0 : i64, _tflite_function_name = "cool_activation", _tflite_function_uuid = "d4b1eb00b81211e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "InputHint-cool_activation-d4b1eb00b81211e99426dc4a3e957995-0-None-None"} : (tensor<1x16x16x1xf32>) -> tensor<1x16x16x1xf32> - return %0 : tensor<1x16x16x1xf32> -} -} diff --git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/BUILD b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/BUILD index da3fe02562b..0b28d434c7c 100644 --- a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/BUILD +++ b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/BUILD @@ -23,10 +23,10 @@ filegroup( data = [ ":importer_test_legacy_reshape", ":importer_test_min_max", + ":test_schema.fbs", "//tensorflow/compiler/mlir/lite:flatbuffer_to_string", "//tensorflow/compiler/mlir/lite:flatbuffer_translate", "//tensorflow/compiler/mlir/lite:json_to_flatbuffer", - "//tensorflow/lite/schema:schema.fbs", "@llvm-project//llvm:FileCheck", ], ) diff --git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/custom_op.mlir b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/custom_op.mlir new file mode 100644 index 00000000000..47a65ec2fea --- /dev/null +++ b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/custom_op.mlir @@ -0,0 +1,8 @@ +// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_translate --tflite-flatbuffer-to-mlir - -o - | FileCheck --dump-input-on-failure %s + +func @main(%arg0: tensor<32x4x4x128xf32>, %arg1: tensor<1x32x42x128xf32>, %arg2: tensor<4xi32>) -> tensor<1x64x84x32xf32> { + %0 = "tfl.custom"(%arg0, %arg1, %arg2) {custom_code = "Convolution2DTransposeBias", custom_option = opaque<"tfl", "0x010000000200000002000000"> : tensor<12xi8>} : (tensor<32x4x4x128xf32>, tensor<1x32x42x128xf32>, tensor<4xi32>) -> tensor<1x64x84x32xf32> + return %0 : tensor<1x64x84x32xf32> +} +// CHECK-LABEL: main +// CHECK: "tfl.custom"(%arg0, %arg1, %arg2) {custom_code = 
"Convolution2DTransposeBias", custom_option = opaque<"tfl", "0x010000000200000002000000"> : tensor<12xi8>} : (tensor<32x4x4x128xf32>, tensor<1x32x42x128xf32>, tensor<4xi32>) -> tensor<1x64x84x32xf32> diff --git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/dynamic_shape.mlir b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/dynamic_shape.mlir new file mode 100644 index 00000000000..76e277eddcf --- /dev/null +++ b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/dynamic_shape.mlir @@ -0,0 +1,9 @@ +// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_translate --tflite-flatbuffer-to-mlir - -o - | FileCheck --dump-input-on-failure %s + +// CHECK: func @main(%arg0: tensor) -> tensor +func @main(%arg0: tensor) -> tensor { + %cst = constant dense<1.0> : tensor<4xf32> + %cst_3 = constant dense<2.0> : tensor<4x3x3x3xf32> + %0 = "tfl.conv_2d"(%arg0, %cst_3, %cst) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "RELU6", padding = "VALID", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor, tensor<4x3x3x3xf32>, tensor<4xf32>) -> tensor + return %0 : tensor +} diff --git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/import_json.json b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/import_json.json index d6d3b142931..f2d275f7ee1 100644 --- a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/import_json.json +++ b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/import_json.json @@ -1,4 +1,4 @@ -// RUN: json_to_flatbuffer %p/../../../../../lite/schema/schema.fbs %s | flatbuffer_translate --tflite-flatbuffer-to-mlir -o - | FileCheck --dump-input-on-failure %s +// RUN: json_to_flatbuffer %p/test_schema.fbs %s | flatbuffer_translate --tflite-flatbuffer-to-mlir -o - | FileCheck --dump-input-on-failure %s // CHECK: %cst = constant unit // CHECK: %[[RES0:.*]] = "tfl.conv_2d"(%arg0, %arg1, %cst) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 0 : i32, stride_w = 0 : i32} : (tensor<256x32x32x3xf32>, tensor<16x3x3x3xf32>, none) -> tensor<256x32x32x16xf32> diff --git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/optional_input.json b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/optional_input.json index b239656d68d..d6bf73c6c8f 100644 --- a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/optional_input.json +++ b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/optional_input.json @@ -1,4 +1,4 @@ -// RUN: json_to_flatbuffer %p/../../../../../lite/schema/schema.fbs %s | flatbuffer_translate --tflite-flatbuffer-to-mlir -o - | FileCheck --dump-input-on-failure %s +// RUN: json_to_flatbuffer %p/test_schema.fbs %s | flatbuffer_translate --tflite-flatbuffer-to-mlir -o - | FileCheck --dump-input-on-failure %s // This test is to test that if the flatbuffer omits the last optional input `bias` of tfl.conv_2d op, the flatbuffer_importer will automatically adds `none` value to tfl.conv_2d. diff --git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/test_schema.fbs b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/test_schema.fbs new file mode 100644 index 00000000000..034844a2916 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/test_schema.fbs @@ -0,0 +1,1092 @@ +// Copyright 2020 The TensorFlow Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Revision History +// Version 0: Initial version. +// Version 1: Add subgraphs to schema. +// Version 2: Rename operators to conform to NN API. +// Version 3: Move buffer data from Model.Subgraph.Tensors to Model.Buffers. + +namespace tflite; + +// This corresponds to the version. +file_identifier "TFL3"; +// File extension of any written files. +file_extension "tflite"; + +// IMPORTANT: All new members of tables, enums and unions must be added at the +// end to ensure backwards compatibility. + +// The type of data stored in a tensor. +enum TensorType : byte { + FLOAT32 = 0, + FLOAT16 = 1, + INT32 = 2, + UINT8 = 3, + INT64 = 4, + STRING = 5, + BOOL = 6, + INT16 = 7, + COMPLEX64 = 8, + INT8 = 9, + FLOAT64 = 10, +} + +// Custom quantization parameters for experimenting with new quantization +// techniques. +table CustomQuantization { + custom:[ubyte] (force_align: 16); +} + +// Represents a specific quantization technique's parameters. +union QuantizationDetails { + CustomQuantization, +} + +// Parameters for converting a quantized tensor back to float. +table QuantizationParameters { + // These four parameters are the asymmetric linear quantization parameters. + // Given a quantized value q, the corresponding float value f should be: + // f = scale * (q - zero_point) + // For other quantization types, the QuantizationDetails below is used. + min:[float]; // For importing back into tensorflow. + max:[float]; // For importing back into tensorflow. + scale:[float]; // For dequantizing the tensor's values. + zero_point:[long]; + + // If this is not none, the other quantization parameters (i.e. min, max, + // scale, zero_point fields above) are ignored and the value of the + // QuantizationDetails union should be used. + details:QuantizationDetails; + + // Specifies the dimension of the Tensor's shape that the scales and + // zero_points correspond to. For example, a tensor t, with dims=[4, 3, 2, 1] + // with quantization params: + // scale=[1.0, 2.0, 3.0], zero_point=[1, 2, 3], quantization_dimension=1 + // will be quantized across the second dimension of t. + // t[:, 0, :, :] will have scale[0]=1.0, zero_point[0]=1 + // t[:, 1, :, :] will have scale[1]=2.0, zero_point[0]=2 + // t[:, 2, :, :] will have scale[2]=3.0, zero_point[0]=3 + quantized_dimension:int; +} + +// Sparse tensors. +// We use a modification of the TACO format. +// Reference: http://tensor-compiler.org/kjolstad-oopsla17-tensor-compiler.pdf +// +// To encode a conceptual n-dimensional dense tensor with dims (d0, ..., dn-1), +// potentially with a k-dimensional block (0 <= k <= n) with dims +// (dn, ..., dn+k-1), the format needs to specify: +// 1. In what order to traverse these dimensions. For example, to store a 2-D +// matrix in row major order, the traversal order would be (d0, d1), +// whereas to store it in column major order, the traversal order would be +// (d1, d0). If the 2-D matrix has a 2-D inner block, the traversal order +// could be (d0, d1, d2, d3). +// 2. How each block dimension in (dn, ..., dn+k-1) maps to the original +// tensor dimension in (d0, ..., dn-1). +// 3. 
In the traversal order defined above, the format (dense vs. sparse) and +// index metadata for each dimension. For a dense dimension, this is just +// the size of that dimension. For a sparse dimension, it's the same as +// the compressed index defined in the Compressed Sparse Row (CSR) format. +// (http://scipy-lectures.org/advanced/scipy_sparse/csr_matrix.html) + +// The storage type for a dimension. Currently we support: +// 1. DENSE: each coordinate in this dimension is stored implicitly. +// 2. SPARSE_CSR: only the coordinates with non-zero elements are stored. The +// compression technique is the same as what CSR uses. +// More types like a sparse dimension with a different compression technique +// could be added to the list in the future. +enum DimensionType : byte { + DENSE = 0, + SPARSE_CSR = 1, +} + +table Int32Vector { + values:[int]; +} + +table Uint16Vector { + values:[ushort] (force_align: 4); +} + +table Uint8Vector { + values:[ubyte] (force_align: 4); +} + +// Variable-typed buffer to store the index metadata for a sparse dimension. +// The widest type is Int32 instead of UInt32 because a tensor's shape is an int32 +// vector. We don't want the per-dimensional index to overflow that range. +union SparseIndexVector { + Int32Vector, + Uint16Vector, + Uint8Vector +} + +table DimensionMetadata { + // Whether a dimension is dense or sparse. + format:DimensionType; + // Index metadata used for a dimension. + // - If format is DimensionType.DENSE then we use the dense_size field to + // store the size of that dimension. Each index in that dimension is + // stored implicitly. + // - If format is DimensionType.SPARSE_CSR then we use array_segments and + // array_indices to encode that dimension. array_segments represents how + // to segment the indices array; each segment corresponds to one element + // in the previous dimension. array_indices represents the index of the + // non-zero elements within this dimension (as those in the CSR matrix + // format, where the first array is row pointers and the second array is + // column indices). + dense_size:int; + array_segments:SparseIndexVector; + array_indices:SparseIndexVector; +} + +// Parameters to encode a sparse TfLite tensor. +table SparsityParameters { + // The traversal order of the dimensions defined in the `shape` field of the + // conceptual dense tensor. For an n-dimensional tensor with dims (d0, d1, + // ..., dn-1), + // - if not block sparse, the traversal_order is just a permutation of (d0, + // ..., dn-1). For example, a 2-D matrix stored in row-major order would + // have traversal_order = (d0, d1). + // - if block sparse with a k-dimensional block (0 <= k <= n), the + // traversal_order has n + k elements. The first n elements are still a + // permutation of (d0, ..., dn-1). The last k elements are a permutation + // of (dn, ..., dn+k-1), defining how to traverse a block internally. For + // example, a 2-D matrix with 2-D blocks, both stored in row-major order + // would have traversal_order = (d0, d1, d2, d3). + traversal_order:[int]; + // For an n-dimensional tensor with a k-dimensional block (0 <= k <= n), + // stores how a block dimension in (dn, ..., dn+k-1) maps to the original + // tensor dimension in (d0, ..., dn). + // It's stored in the order of (dn, ..., dn+k-1). + // If not block-sparse, this field is NULL. + block_map:[int]; + // In the traversal order defined above, the metadata needed for + // each dimension to locate the non-zero values in the original dense tensor. 
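As a worked illustration of the DENSE + SPARSE_CSR scheme described by DimensionMetadata and SparsityParameters above (a minimal sketch only, not TFLite library code): for a 2x3 matrix with traversal_order = (d0, d1), d0 kept DENSE (dense_size = 2) and d1 stored as SPARSE_CSR, array_segments plays the role of the CSR row pointers and array_indices holds the d1 coordinate of each non-zero value.

# Minimal sketch, assuming the 2x3 example above; not TFLite library code.
dense = [
    [1, 0, 2],
    [0, 0, 3],
]

segments = [0]   # array_segments: CSR-style "row pointer" per d0 slice
indices = []     # array_indices: d1 coordinate of each non-zero element
values = []      # non-zero values, visited in traversal order (d0, then d1)

for row in dense:
    for col, v in enumerate(row):
        if v != 0:
            indices.append(col)
            values.append(v)
    segments.append(len(indices))

print(segments)  # [0, 2, 3]
print(indices)   # [0, 2, 2]
print(values)    # [1, 2, 3]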
+ // The size of the dim_metadata array = the size of the traversal_order array + // = n + k. + dim_metadata:[DimensionMetadata]; +} + +table Tensor { + // The tensor shape. The meaning of each entry is operator-specific but + // builtin ops use: [batch size, height, width, number of channels] (That's + // Tensorflow's NHWC). + shape:[int]; + type:TensorType; + // An index that refers to the buffers table at the root of the model. Or, + // if there is no data buffer associated (i.e. intermediate results), then + // this is 0 (which refers to an always existent empty buffer). + // + // The data_buffer itself is an opaque container, with the assumption that the + // target device is little-endian. In addition, all builtin operators assume + // the memory is ordered such that if `shape` is [4, 3, 2], then index + // [i, j, k] maps to data_buffer[i*3*2 + j*2 + k]. + buffer:uint; + name:string; // For debugging and importing back into tensorflow. + quantization:QuantizationParameters; // Optional. + + is_variable:bool = false; + + // Parameters to encode a sparse tensor. See the example in + // tensorflow/lite/testdata/sparse_tensor.json. + sparsity:SparsityParameters; // Optional. + + // Encodes `shape` with unknown dimensions. Unknown dimensions are + // represented with -1. + shape_signature:[int]; // Optional. +} + +// A list of builtin operators. Builtin operators are slightly faster than custom +// ones, but not by much. Moreover, while custom operators accept an opaque +// object containing configuration parameters, builtins have a predetermined +// set of acceptable options. +// LINT.IfChange +enum BuiltinOperator : byte { + ADD = 0, + AVERAGE_POOL_2D = 1, + CONCATENATION = 2, + CONV_2D = 3, + DEPTHWISE_CONV_2D = 4, + DEPTH_TO_SPACE = 5, + DEQUANTIZE = 6, + EMBEDDING_LOOKUP = 7, + FLOOR = 8, + FULLY_CONNECTED = 9, + HASHTABLE_LOOKUP = 10, + L2_NORMALIZATION = 11, + L2_POOL_2D = 12, + LOCAL_RESPONSE_NORMALIZATION = 13, + LOGISTIC = 14, + LSH_PROJECTION = 15, + LSTM = 16, + MAX_POOL_2D = 17, + MUL = 18, + RELU = 19, + // NOTE(aselle): RELU_N1_TO_1 used to be called RELU1, but it was renamed + // since different model developers use RELU1 in different ways. Never + // create another op called RELU1. + RELU_N1_TO_1 = 20, + RELU6 = 21, + RESHAPE = 22, + RESIZE_BILINEAR = 23, + RNN = 24, + SOFTMAX = 25, + SPACE_TO_DEPTH = 26, + SVDF = 27, + TANH = 28, + // TODO(aselle): Consider rename to CONCATENATE_EMBEDDINGS + CONCAT_EMBEDDINGS = 29, + SKIP_GRAM = 30, + CALL = 31, + CUSTOM = 32, + EMBEDDING_LOOKUP_SPARSE = 33, + PAD = 34, + UNIDIRECTIONAL_SEQUENCE_RNN = 35, + GATHER = 36, + BATCH_TO_SPACE_ND = 37, + SPACE_TO_BATCH_ND = 38, + TRANSPOSE = 39, + MEAN = 40, + SUB = 41, + DIV = 42, + SQUEEZE = 43, + UNIDIRECTIONAL_SEQUENCE_LSTM = 44, + STRIDED_SLICE = 45, + BIDIRECTIONAL_SEQUENCE_RNN = 46, + EXP = 47, + TOPK_V2 = 48, + SPLIT = 49, + LOG_SOFTMAX = 50, + // DELEGATE is a special op type for the operations which are delegated to + // other backends. 
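To make the Tensor layout comment above and the dequantization rule quoted earlier in this schema, f = scale * (q - zero_point), concrete, here is a small NumPy sketch. It is illustrative only, with made-up scale and zero_point values; real readers go through the generated FlatBuffer accessors rather than indexing buffers by hand.

import numpy as np

shape = [4, 3, 2]
data_buffer = np.arange(24, dtype=np.int8)   # stand-in for the Buffer contents

def flat_index(i, j, k):
    # Row-major rule from the Tensor comment: [i, j, k] -> i*3*2 + j*2 + k.
    return i * shape[1] * shape[2] + j * shape[2] + k

q = int(data_buffer[flat_index(2, 1, 0)])    # quantized value at [2, 1, 0]

scale, zero_point = 0.5, 3                   # assumed values, for illustration
f = scale * (q - zero_point)                 # f = scale * (q - zero_point)
print(q, f)                                  # 14 5.5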
+ // WARNING: Experimental interface, subject to change + DELEGATE = 51, + BIDIRECTIONAL_SEQUENCE_LSTM = 52, + CAST = 53, + PRELU = 54, + MAXIMUM = 55, + ARG_MAX = 56, + MINIMUM = 57, + LESS = 58, + NEG = 59, + PADV2 = 60, + GREATER = 61, + GREATER_EQUAL = 62, + LESS_EQUAL = 63, + SELECT = 64, + SLICE = 65, + SIN = 66, + TRANSPOSE_CONV = 67, + SPARSE_TO_DENSE = 68, + TILE = 69, + EXPAND_DIMS = 70, + EQUAL = 71, + NOT_EQUAL = 72, + LOG = 73, + SUM = 74, + SQRT = 75, + RSQRT = 76, + SHAPE = 77, + POW = 78, + ARG_MIN = 79, + FAKE_QUANT = 80, + REDUCE_PROD = 81, + REDUCE_MAX = 82, + PACK = 83, + LOGICAL_OR = 84, + ONE_HOT = 85, + LOGICAL_AND = 86, + LOGICAL_NOT = 87, + UNPACK = 88, + REDUCE_MIN = 89, + FLOOR_DIV = 90, + REDUCE_ANY = 91, + SQUARE = 92, + ZEROS_LIKE = 93, + FILL = 94, + FLOOR_MOD = 95, + RANGE = 96, + RESIZE_NEAREST_NEIGHBOR = 97, + LEAKY_RELU = 98, + SQUARED_DIFFERENCE = 99, + MIRROR_PAD = 100, + ABS = 101, + SPLIT_V = 102, + UNIQUE = 103, + CEIL = 104, + REVERSE_V2 = 105, + ADD_N = 106, + GATHER_ND = 107, + COS = 108, + WHERE = 109, + RANK = 110, + ELU = 111, + REVERSE_SEQUENCE = 112, + MATRIX_DIAG = 113, + QUANTIZE = 114, + MATRIX_SET_DIAG = 115, + ROUND = 116, + HARD_SWISH = 117, + IF = 118, + WHILE = 119, + NON_MAX_SUPPRESSION_V4 = 120, + NON_MAX_SUPPRESSION_V5 = 121, + SCATTER_ND = 122, + SELECT_V2 = 123, + DENSIFY = 124, + SEGMENT_SUM = 125, + BATCH_MATMUL = 126 +} + +// Options for the builtin operators. +union BuiltinOptions { + Conv2DOptions, + DepthwiseConv2DOptions, + ConcatEmbeddingsOptions, + LSHProjectionOptions, + Pool2DOptions, + SVDFOptions, + RNNOptions, + FullyConnectedOptions, + SoftmaxOptions, + ConcatenationOptions, + AddOptions, + L2NormOptions, + LocalResponseNormalizationOptions, + LSTMOptions, + ResizeBilinearOptions, + CallOptions, + ReshapeOptions, + SkipGramOptions, + SpaceToDepthOptions, + EmbeddingLookupSparseOptions, + MulOptions, + PadOptions, + GatherOptions, + BatchToSpaceNDOptions, + SpaceToBatchNDOptions, + TransposeOptions, + ReducerOptions, + SubOptions, + DivOptions, + SqueezeOptions, + SequenceRNNOptions, + StridedSliceOptions, + ExpOptions, + TopKV2Options, + SplitOptions, + LogSoftmaxOptions, + CastOptions, + DequantizeOptions, + MaximumMinimumOptions, + ArgMaxOptions, + LessOptions, + NegOptions, + PadV2Options, + GreaterOptions, + GreaterEqualOptions, + LessEqualOptions, + SelectOptions, + SliceOptions, + TransposeConvOptions, + SparseToDenseOptions, + TileOptions, + ExpandDimsOptions, + EqualOptions, + NotEqualOptions, + ShapeOptions, + PowOptions, + ArgMinOptions, + FakeQuantOptions, + PackOptions, + LogicalOrOptions, + OneHotOptions, + LogicalAndOptions, + LogicalNotOptions, + UnpackOptions, + FloorDivOptions, + SquareOptions, + ZerosLikeOptions, + FillOptions, + BidirectionalSequenceLSTMOptions, + BidirectionalSequenceRNNOptions, + UnidirectionalSequenceLSTMOptions, + FloorModOptions, + RangeOptions, + ResizeNearestNeighborOptions, + LeakyReluOptions, + SquaredDifferenceOptions, + MirrorPadOptions, + AbsOptions, + SplitVOptions, + UniqueOptions, + ReverseV2Options, + AddNOptions, + GatherNdOptions, + CosOptions, + WhereOptions, + RankOptions, + ReverseSequenceOptions, + MatrixDiagOptions, + QuantizeOptions, + MatrixSetDiagOptions, + HardSwishOptions, + IfOptions, + WhileOptions, + DepthToSpaceOptions, + NonMaxSuppressionV4Options, + NonMaxSuppressionV5Options, + ScatterNdOptions, + SelectV2Options, + DensifyOptions, + SegmentSumOptions, + BatchMatMulOptions +} + +enum Padding : byte { SAME, VALID } + +enum ActivationFunctionType 
: byte { + NONE = 0, + RELU = 1, + RELU_N1_TO_1 = 2, + RELU6 = 3, + TANH = 4, + SIGN_BIT = 5, +} + +table Conv2DOptions { + padding:Padding; + stride_w:int; + stride_h:int; + fused_activation_function:ActivationFunctionType; + dilation_w_factor:int = 1; + dilation_h_factor:int = 1; +} + +table Pool2DOptions { + padding:Padding; + stride_w:int; + stride_h:int; + filter_width:int; + filter_height:int; + fused_activation_function:ActivationFunctionType; +} + +table DepthwiseConv2DOptions { + // Parameters for DepthwiseConv version 1 or above. + padding:Padding; + stride_w:int; + stride_h:int; + // `depth_multiplier` is redundant. It's used by CPU kernels in + // TensorFlow 2.0 or below, but ignored in versions above. + // See comments in lite/c/builtin_op_data.h for more details. + depth_multiplier:int; + fused_activation_function:ActivationFunctionType; + // Parameters for DepthwiseConv version 2 or above. + dilation_w_factor:int = 1; + dilation_h_factor:int = 1; +} + +table ConcatEmbeddingsOptions { + num_channels:int; + num_columns_per_channel:[int]; + embedding_dim_per_channel:[int]; // This could be inferred from parameters. +} + +enum LSHProjectionType: byte { + UNKNOWN = 0, + SPARSE = 1, + DENSE = 2, +} + +table LSHProjectionOptions { + type: LSHProjectionType; +} + +table SVDFOptions { + rank:int; + fused_activation_function:ActivationFunctionType; + // For weights-only quantization, use asymmetric quantization for non + // constant inputs at evaluation time. + asymmetric_quantize_inputs:bool; +} + +// An implementation of TensorFlow RNNCell. +table RNNOptions { + fused_activation_function:ActivationFunctionType; + asymmetric_quantize_inputs:bool; +} + +// An implementation of TensorFlow dynamic_rnn with RNNCell. +table SequenceRNNOptions { + time_major:bool; + fused_activation_function:ActivationFunctionType; + asymmetric_quantize_inputs:bool; +} + +// An implementation of TensorFlow bidrectional_dynamic_rnn with RNNCell. +table BidirectionalSequenceRNNOptions { + time_major:bool; + fused_activation_function:ActivationFunctionType; + merge_outputs: bool; + asymmetric_quantize_inputs:bool; +} + +enum FullyConnectedOptionsWeightsFormat: byte { + DEFAULT = 0, + SHUFFLED4x16INT8 = 1, +} + +// An implementation of TensorFlow fully_connected (a.k.a Dense) layer. +table FullyConnectedOptions { + // Parameters for FullyConnected version 1 or above. + fused_activation_function:ActivationFunctionType; + + // Parameters for FullyConnected version 2 or above. + weights_format:FullyConnectedOptionsWeightsFormat = DEFAULT; + + // Parameters for FullyConnected version 5 or above. + // If set to true, then the number of dimension is preserved. Furthermore, + // all but the last dimension of the input and output shapes will be equal. + keep_num_dims: bool; + + // Parameters for FullyConnected version 7 or above. + // If set to true, then weights-only op will use asymmetric quantization for + // inputs. + asymmetric_quantize_inputs: bool; +} + +table SoftmaxOptions { + beta: float; +} + +// An implementation of TensorFlow concat. 
+table ConcatenationOptions { + axis:int; + fused_activation_function:ActivationFunctionType; +} + +table AddOptions { + fused_activation_function:ActivationFunctionType; +} + +table MulOptions { + fused_activation_function:ActivationFunctionType; +} + +table L2NormOptions { + fused_activation_function:ActivationFunctionType; +} + +table LocalResponseNormalizationOptions { + radius:int; + bias:float; + alpha:float; + beta:float; +} + +enum LSTMKernelType : byte { + // Full LSTM kernel which supports peephole and projection. + FULL = 0, + // Basic LSTM kernels. Equivalent to TensorFlow BasicLSTMCell. + BASIC = 1, +} + +// An implementation of TensorFlow LSTMCell and CoupledInputForgetGateLSTMCell +table LSTMOptions { + // Parameters for LSTM version 1 or above. + fused_activation_function:ActivationFunctionType; + cell_clip: float; // Optional, 0.0 means no clipping + proj_clip: float; // Optional, 0.0 means no clipping + + // Parameters for LSTM version 2 or above. + // Basic kernel is only supported in version 2 or above. + kernel_type: LSTMKernelType = FULL; + + // Parameters for LSTM version 4 or above. + asymmetric_quantize_inputs: bool; +} + +// An implementation of TensorFlow dynamic_rnn with LSTMCell. +table UnidirectionalSequenceLSTMOptions { + fused_activation_function:ActivationFunctionType; + cell_clip: float; // Optional, 0.0 means no clipping + proj_clip: float; // Optional, 0.0 means no clipping + + // If true then first dimension is sequence, otherwise batch. + time_major:bool; + + // Parameter for Unidirectional Sequence LSTM version 4. + asymmetric_quantize_inputs:bool; +} + +table BidirectionalSequenceLSTMOptions { + // Parameters supported by version 1: + fused_activation_function:ActivationFunctionType; + cell_clip: float; // Optional, 0.0 means no clipping + proj_clip: float; // Optional, 0.0 means no clipping + + // If true, store the outputs of both directions into the first output. + merge_outputs: bool; + + // Parameters supported by version 2: + // If true then first dimension is sequence, otherwise batch. + // Version 1 implementations assumed time_major to be true, so this default + // value should never change. + time_major: bool = true; + + // Parameters for version 3 or above. + asymmetric_quantize_inputs:bool; +} + +table ResizeBilinearOptions { + new_height: int (deprecated); + new_width: int (deprecated); + align_corners: bool; + half_pixel_centers: bool; +} + +table ResizeNearestNeighborOptions { + align_corners: bool; +} + +// A call operation options +table CallOptions { + // The subgraph index that needs to be called. 
+ subgraph:uint; +} + +table PadOptions { +} + +table PadV2Options { +} + +table ReshapeOptions { + new_shape:[int]; +} + +table SpaceToBatchNDOptions { +} + +table BatchToSpaceNDOptions { +} + +table SkipGramOptions { + ngram_size: int; + max_skip_size: int; + include_all_ngrams: bool; +} + +table SpaceToDepthOptions { + block_size: int; +} + +table DepthToSpaceOptions { + block_size: int; +} + +table SubOptions { + fused_activation_function:ActivationFunctionType; +} + +table DivOptions { + fused_activation_function:ActivationFunctionType; +} + +table TopKV2Options { +} + +enum CombinerType : byte { + SUM = 0, + MEAN = 1, + SQRTN = 2, +} + +table EmbeddingLookupSparseOptions { + combiner:CombinerType; +} + +table GatherOptions { + axis: int; +} + +table TransposeOptions { +} + +table ExpOptions { +} + +table CosOptions { +} + +table ReducerOptions { + keep_dims: bool; +} + +table SqueezeOptions { + squeeze_dims:[int]; +} + +table SplitOptions { + num_splits: int; +} + +table SplitVOptions { + num_splits: int; +} + +table StridedSliceOptions { + begin_mask: int; + end_mask: int; + ellipsis_mask: int; + new_axis_mask: int; + shrink_axis_mask: int; +} + +table LogSoftmaxOptions { +} + +table CastOptions { + in_data_type: TensorType; + out_data_type: TensorType; +} + +table DequantizeOptions { +} + +table MaximumMinimumOptions { +} + +table TileOptions { +} + +table ArgMaxOptions { + output_type : TensorType; +} + +table ArgMinOptions { + output_type : TensorType; +} + +table GreaterOptions { +} + +table GreaterEqualOptions { +} + +table LessOptions { +} + +table LessEqualOptions { +} + +table NegOptions { +} + +table SelectOptions { +} + +table SliceOptions { +} + +table TransposeConvOptions { + padding:Padding; + stride_w:int; + stride_h:int; +} + +table ExpandDimsOptions { +} + +table SparseToDenseOptions { + validate_indices:bool; +} + +table EqualOptions { +} + +table NotEqualOptions { +} + +table ShapeOptions { + // Optional output type of the operation (int32 or int64). Defaults to int32. + out_type : TensorType; +} + +table RankOptions { +} + +table PowOptions { +} + +table FakeQuantOptions { + // Parameters supported by version 1: + min:float; + max:float; + num_bits:int; + + // Parameters supported by version 2: + narrow_range:bool; +} + +table PackOptions { + values_count:int; + axis:int; +} + +table LogicalOrOptions { +} + +table OneHotOptions { + axis:int; +} + +table AbsOptions { +} + + +table HardSwishOptions { +} + +table LogicalAndOptions { +} + +table LogicalNotOptions { +} + +table UnpackOptions { + num:int; + axis:int; +} + +table FloorDivOptions { +} + +table SquareOptions { +} + +table ZerosLikeOptions { +} + +table FillOptions { +} + +table FloorModOptions { +} + +table RangeOptions { +} + +table LeakyReluOptions { + alpha:float; +} + +table SquaredDifferenceOptions { +} + +enum MirrorPadMode : byte { + // Doesn't include borders. + REFLECT = 0, + // Includes borders. 
+ SYMMETRIC = 1, +} + +table MirrorPadOptions { + mode:MirrorPadMode; +} + +table UniqueOptions { + idx_out_type:TensorType = INT32; +} + +table ReverseV2Options { +} + +table AddNOptions { +} + +table GatherNdOptions { +} + +table WhereOptions { +} + +table ReverseSequenceOptions { + seq_dim:int; + batch_dim:int = 0; +} + +table MatrixDiagOptions { +} + +table QuantizeOptions { +} + +table MatrixSetDiagOptions { +} + +table IfOptions { + then_subgraph_index:int; + else_subgraph_index:int; +} + +table WhileOptions { + cond_subgraph_index:int; + body_subgraph_index:int; +} + +table NonMaxSuppressionV4Options { +} + +table NonMaxSuppressionV5Options { +} + +table ScatterNdOptions { +} + +table SelectV2Options { +} + +table DensifyOptions { +} + +table SegmentSumOptions { +} + +table BatchMatMulOptions { + adjoint_lhs:bool; + adjoint_rhs:bool; +} + +// An OperatorCode can be an enum value (BuiltinOperator) if the operator is a +// builtin, or a string if the operator is custom. +table OperatorCode { + builtin_code:BuiltinOperator; + custom_code:string; + + // The version of the operator. The version needs to be bumped whenever new + // parameters are introduced into an op. + version:int = 1; +} + +enum CustomOptionsFormat : byte { + FLEXBUFFERS = 0, +} + +// An operator takes tensors as inputs and outputs. The type of operation being +// performed is determined by an index into the list of valid OperatorCodes, +// while the specifics of each operation are configured using builtin_options +// or custom_options. +table Operator { + // Index into the operator_codes array. Using an integer here avoids + // complicated map lookups. + opcode_index:uint; + + // Optional inputs are indicated by -1. + inputs:[int]; + outputs:[int]; + + builtin_options:BuiltinOptions; + custom_options:[ubyte]; + custom_options_format:CustomOptionsFormat; + + // A list of booleans indicating the input tensors which are being mutated by + // this operator (e.g. used by RNN and LSTM). + // For example, if the "inputs" array refers to 5 tensors and the second and + // fifth are mutable variables, then this list will contain + // [false, true, false, false, true]. + // + // If the list is empty, no variable is mutated in this operator. + // The list either has the same length as `inputs`, or is empty. + mutating_variable_inputs:[bool]; + + // A list of indices to the subgraph's "tensors" that are internal to an Op. + // Internal tensors are those that do not flow in or out of the operation, + // but instead are part of internal computation. As such, the operation's + // implementation may manage its memory more efficiently. They are needed + // however (i.e. not just an implementation detail) since they are part of the + // computation, which may require relevant metadata such as quantization + // parameters. + intermediates:[int]; +} + +// The root type, defining a subgraph, which typically represents an entire +// model. +table SubGraph { + // A list of all tensors used in this subgraph. + tensors:[Tensor]; + + // Indices of the tensors that are inputs into this subgraph. Note this is + // the list of non-static tensors that feed into the subgraph for inference. + inputs:[int]; + + // Indices of the tensors that are outputs out of this subgraph. Note this is + // the list of output tensors that are considered the product of the + // subgraph's inference. + outputs:[int]; + + // All operators, in execution order. + operators:[Operator]; + + // Name of this subgraph (used for debugging). 
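The Operator table above notes that optional inputs are indicated by -1; the optional_input.json importer test and the new transpose_conv_optional.mlir test elsewhere in this change both exercise the omitted-bias case. A hypothetical reader-side helper (names are illustrative, not part of the TFLite API) might resolve an operator's inputs like this:

# Hypothetical helper, not TFLite API: map an operator's input indices to the
# subgraph's tensors, treating -1 as an omitted optional input.
def resolve_inputs(op_inputs, tensors):
    return [None if idx == -1 else tensors[idx] for idx in op_inputs]

tensors = ["output_shape", "weights", "activation"]
print(resolve_inputs([0, 1, 2, -1], tensors))
# ['output_shape', 'weights', 'activation', None]  (trailing optional bias omitted)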
+ name:string; +} + +// Table of raw data buffers (used for constant tensors). Referenced by tensors +// by index. The generous alignment accommodates mmap-friendly data structures. +table Buffer { + data:[ubyte] (force_align: 16); +} + +table Metadata { + // A human readable string to uniquely identify a Metadata. + name:string; + // An index to the buffers table. + buffer:uint; +} + +table Model { + // Version of the schema. + version:uint; + + // A list of all operator codes used in this model. This is + // kept in order because operators carry an index into this + // vector. + operator_codes:[OperatorCode]; + + // All the subgraphs of the model. The 0th is assumed to be the main + // model. + subgraphs:[SubGraph]; + + // A description of the model. + description:string; + + // Buffers of the model. + // Note the 0th entry of this array must be an empty buffer (sentinel). + // This is a convention so that tensors without a buffer can provide 0 as + // their buffer. + buffers:[Buffer]; + + // Metadata about the model. Indirects into the existings buffers list. + // Deprecated, prefer to use metadata field. + metadata_buffer:[int]; + + // Metadata about the model. + metadata:[Metadata]; +} + +root_type Model; diff --git a/tensorflow/compiler/mlir/lite/tests/legalize-ophint-func-op.mlir b/tensorflow/compiler/mlir/lite/tests/legalize-ophint-func-op.mlir deleted file mode 100644 index 97bb6f2bfde..00000000000 --- a/tensorflow/compiler/mlir/lite/tests/legalize-ophint-func-op.mlir +++ /dev/null @@ -1,68 +0,0 @@ -// RUN: tf-opt -tfl-legalize-ophint-func-op %s -split-input-file | FileCheck %s - -module { - // CHECK-LABEL: func @testConvertUnidirectionalSequenceRNN - // CHECK-SAME: (%[[ARG_0:[a-z0-9]*]]: tensor<1x3xf32>, %[[ARG_1:[a-z0-9]*]]: tensor<1x3xf32>) - func @testConvertUnidirectionalSequenceRNN(%arg0: tensor<1x3xf32>, %arg1: tensor<1x3xf32>) -> tensor<1x4xf32> { - // CHECK: %[[CST:.*]] = constant dense<0.000000e+00> : tensor<1x4xf32> - // CHECK: %[[CST_0:.*]] = constant dense<0.000000e+00> : tensor<4xf32> - // CHECK: %[[CST_1:.*]] = constant dense<0.000000e+00> : tensor<4x3xf32> - // CHECK: %[[CST_2:.*]] = constant dense<0.000000e+00> : tensor<4x4xf32> - // CHECK: %[[PACKED_INPUT:[a-z0-9]*]] = "tfl.pack"(%[[ARG_0]], %[[ARG_1]]) {axis = 0 : i32, values_count = 2 : i32} : (tensor<1x3xf32>, tensor<1x3xf32>) -> tensor<2x1x3xf32> - // CHECK: %[[FUSED_OUTPUT:[a-z0-9]*]] = "tfl.unidirectional_sequence_rnn"(%[[PACKED_INPUT]], %[[CST_1]], %[[CST_2]], %[[CST_0]], %[[CST]]) {fused_activation_function = "TANH", time_major = true} : (tensor<2x1x3xf32>, tensor<4x3xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<1x4xf32>) -> tensor<2x1x4xf32> - // CHECK: %[[UNPACK:[0-9]*]]:2 = "tfl.unpack"(%[[FUSED_OUTPUT]]) {axis = 0 : i32, num = 2 : i32} : (tensor<2x1x4xf32>) -> (tensor<1x4xf32>, tensor<1x4xf32>) - - %cst = constant dense<0.000000e+00> : tensor<1x4xf32> - %cst0 = constant dense<0.000000e+00> : tensor<4xf32> - %cst1 = constant dense<0.000000e+00> : tensor<4x3xf32> - %cst2 = constant dense<0.000000e+00> : tensor<4x4xf32> - %2 = "tfl.pack"(%arg0, %arg1) {axis = 0 : i32, values_count = 2 : i32} : (tensor<1x3xf32>, tensor<1x3xf32>) -> tensor<2x1x3xf32> - %3 = call @a9211722c23011e9875cdc4a3e957995(%2, %cst1, %cst2, %cst0, %cst) : (tensor<2x1x3xf32>, tensor<4x3xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<1x4xf32>) -> tensor<2x1x4xf32> - %4:2 = "tfl.unpack"(%3) {axis = 0 : i32, num = 2 : i32} : (tensor<2x1x4xf32>) -> (tensor<1x4xf32>, tensor<1x4xf32>) - return %4#0 : tensor<1x4xf32> - } - func 
@a9211722c23011e9875cdc4a3e957995(tensor<2x1x3xf32>, tensor<4x3xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<1x4xf32>) -> tensor<2x1x4xf32> - attributes {_tflite_function_name = "UnidirectionalSequenceRnn"} -} - -// ----- - -module { - // CHECK-LABEL: func @testConvertUnidirectionalSequenceLSTM - // CHECK-SAME: (%[[ARG_0:[a-z0-9]*]]: tensor<1x3xf32>, %[[ARG_1:[a-z0-9]*]]: tensor<1x3xf32>) - func @testConvertUnidirectionalSequenceLSTM(%arg0: tensor<1x3xf32>, %arg1: tensor<1x3xf32>) -> tensor<1x4xf32> { - // CHECK: %[[CST:.*]] = constant dense<0.000000e+00> : tensor<4x4xf32> - // CHECK: %[[CST_0:.*]] = constant dense<0.000000e+00> : tensor<4x4xf32> - // CHECK: %[[CST_1:.*]] = constant dense<0.000000e+00> : tensor<4x4xf32> - // CHECK: %[[CST_2:.*]] = constant dense<0.000000e+00> : tensor<4x4xf32> - // CHECK: %[[CST_3:.*]] = constant dense<1.000000e+00> : tensor<4xf32> - // CHECK: %[[CST_4:.*]] = constant dense<0.000000e+00> : tensor<4x3xf32> - // CHECK: %[[CST_5:.*]] = constant dense<0.000000e+00> : tensor<4x3xf32> - // CHECK: %[[CST_6:.*]] = constant dense<0.000000e+00> : tensor<4x3xf32> - // CHECK: %[[CST_7:.*]] = constant dense<0.000000e+00> : tensor<4x3xf32> - // CHECK: %[[CST_8:.*]] = constant dense<0.000000e+00> : tensor<4xf32> - // CHECK: %[[CST_9:.*]] = constant dense<0.000000e+00> : tensor<1x4xf32> - // CHECK: %[[PACKED_INPUT:[a-z0-9]*]] = "tfl.pack"(%[[ARG_0]], %[[ARG_1]]) {axis = 0 : i32, values_count = 2 : i32} : (tensor<1x3xf32>, tensor<1x3xf32>) -> tensor<2x1x3xf32> - // CHECK: %[[CST_10:.*]] = constant unit - // CHECK: %[[FUSED_OUTPUT:[a-z0-9]*]] = "tfl.unidirectional_sequence_lstm"(%[[PACKED_INPUT]], %[[CST_6]], %[[CST_5]], %[[CST_4]], %[[CST_7]], %[[CST_1]], %[[CST_0]], %[[CST]], %[[CST_2]], %[[CST_10]], %[[CST_10]], %[[CST_10]], %[[CST_8]], %[[CST_3]], %[[CST_8]], %[[CST_8]], %[[CST_10]], %[[CST_10]], %[[CST_9]], %[[CST_9]], %[[CST_10]], %[[CST_10]], %[[CST_10]], %[[CST_10]]) {fused_activation_function = "TANH", time_major = true} : (tensor<2x1x3xf32>, tensor<4x3xf32>, tensor<4x3xf32>, tensor<4x3xf32>, tensor<4x3xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, none, none, none, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, none, none, tensor<1x4xf32>, tensor<1x4xf32>, none, none, none, none) -> tensor<2x1x4xf32> - // CHECK: %[[UNPACK:[0-9]*]]:2 = "tfl.unpack"(%[[FUSED_OUTPUT]]) {axis = 0 : i32, num = 2 : i32} : (tensor<2x1x4xf32>) -> (tensor<1x4xf32>, tensor<1x4xf32>) - - %cst = constant dense<0.000000e+00> : tensor<4x4xf32> - %cst_0 = constant dense<0.000000e+00> : tensor<4x4xf32> - %cst_1 = constant dense<0.000000e+00> : tensor<4x4xf32> - %cst_2 = constant dense<0.000000e+00> : tensor<4x4xf32> - %cst_3 = constant dense<1.000000e+00> : tensor<4xf32> - %cst_4 = constant dense<0.000000e+00> : tensor<4x3xf32> - %cst_5 = constant dense<0.000000e+00> : tensor<4x3xf32> - %cst_6 = constant dense<0.000000e+00> : tensor<4x3xf32> - %cst_7 = constant dense<0.000000e+00> : tensor<4x3xf32> - %cst_8 = constant dense<0.000000e+00> : tensor<4xf32> - %cst_9 = constant dense<0.000000e+00> : tensor<1x4xf32> - %0 = "tfl.pack"(%arg0, %arg1) {axis = 0 : i32, values_count = 2 : i32} : (tensor<1x3xf32>, tensor<1x3xf32>) -> tensor<2x1x3xf32> - %1:2 = call @a7addbdad08811e9b52cdc4a3e957995(%0, %cst_6, %cst_5, %cst_4, %cst_7, %cst_1, %cst_0, %cst, %cst_2, %cst_8, %cst_3, %cst_8, %cst_8, %cst_9, %cst_9) : (tensor<2x1x3xf32>, tensor<4x3xf32>, tensor<4x3xf32>, tensor<4x3xf32>, tensor<4x3xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, 
tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<1x4xf32>, tensor<1x4xf32>) -> (tensor<1x4xf32>, tensor<2x1x4xf32>) - %2:2 = "tfl.unpack"(%1#1) {axis = 0 : i32, num = 2 : i32} : (tensor<2x1x4xf32>) -> (tensor<1x4xf32>, tensor<1x4xf32>) - return %2#1 : tensor<1x4xf32> - } - func @a7addbdad08811e9b52cdc4a3e957995(tensor<2x1x3xf32>, tensor<4x3xf32>, tensor<4x3xf32>, tensor<4x3xf32>, tensor<4x3xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<1x4xf32>, tensor<1x4xf32>) -> (tensor<1x4xf32>, tensor<2x1x4xf32>) - attributes {_tflite_function_input_index = [0 : i32, 1 : i32, 2 : i32, 3 : i32, 4 : i32, 5 : i32, 6 : i32, 7 : i32, 8 : i32, 12 : i32, 13 : i32, 14 : i32, 15 : i32, 18 : i32, 19 : i32], _tflite_function_name = "UnidirectionalSequenceLstm"} -} diff --git a/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir b/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir index d6f2a83984f..15c73d2db2c 100644 --- a/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir +++ b/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir @@ -414,6 +414,26 @@ func @gatherNdHigherRankIndices(%arg0 : tensor<4x3x2xf32>, %arg1 : tensor<2x2xi3 // CHECK: "tfl.gather_nd"(%arg0, %arg1) : (tensor<4x3x2xf32>, tensor<2x2xi32>) -> tensor<2x2xf32> } +func @scatterNdVectorIndices(%arg0: tensor<5x1xi32>, %arg1: tensor<5x3x2xf32>) -> tensor<10x3x2xf32> { + %cst = "tf.Const"() { value = dense<[10, 3, 2]> : tensor<3xi32> } : () -> tensor<3xi32> + %1 = "tf.ScatterNd"(%arg0, %arg1, %cst) : (tensor<5x1xi32>, tensor<5x3x2xf32>, tensor<3xi32>) -> tensor<10x3x2xf32> + return %1 : tensor<10x3x2xf32> + +// CHECK-LABEL:scatterNdVectorIndices +// CHECK: %[[CST:.*]] = constant dense<[10, 3, 2]> : tensor<3xi32> +// CHECK: %[[RES:.*]] = "tfl.scatter_nd"(%arg0, %arg1, %[[CST]]) : (tensor<5x1xi32>, tensor<5x3x2xf32>, tensor<3xi32>) -> tensor<10x3x2xf32> +// CHECK: return %[[RES]] +} + +func @scatterNdHigherRankIndices(%arg0: tensor<4x2x2xi32>, %arg1: tensor<4x2x3xf32>, %arg2: tensor<3xi32>) -> tensor<10x2x3xf32> { + %0 = "tf.ScatterNd"(%arg0, %arg1, %arg2) : (tensor<4x2x2xi32>, tensor<4x2x3xf32>, tensor<3xi32>) -> tensor<10x2x3xf32> + return %0 : tensor<10x2x3xf32> + +// CHECK-LABEL:scatterNdHigherRankIndices +// CHECK: %[[RES:.*]] = "tfl.scatter_nd"(%arg0, %arg1, %arg2) : (tensor<4x2x2xi32>, tensor<4x2x3xf32>, tensor<3xi32>) -> tensor<10x2x3xf32> +// CHECK: return %[[RES]] +} + func @gatherV2VectorIndices(%arg0 : tensor<1x2x20xf32>, %arg1 : tensor<3x5xi32>) -> tensor<1x3x5x20xf32> { %0 = "tf.Const"() { value = dense<[1]> : tensor<1xi32> } : () -> tensor<1xi32> %1 = "tf.GatherV2"(%arg0, %arg1, %0) : (tensor<1x2x20xf32>, tensor<3x5xi32>, tensor<1xi32>) -> tensor<1x3x5x20xf32> @@ -1028,6 +1048,15 @@ func @concatv2With3Tensors(%arg0: tensor<2x1xi32>, %arg1: tensor<2x1xi32>, %arg2 // CHECK: "tfl.concatenation"(%arg0, %arg1, %arg2) {axis = -1 : i32, fused_activation_function = "NONE"} : (tensor<2x1xi32>, tensor<2x1xi32>, tensor<2x1xi32>) -> tensor<2x3xi32> } +func @concatv2I64Axis(%arg0: tensor<2x1xi32>, %arg1: tensor<2x1xi32>, %arg2: tensor<2x1xi32>) -> tensor<2x3xi32> { + %0 = "tf.Const"() { value = dense<-1> : tensor } : () -> tensor + %1 = "tf.ConcatV2"(%arg0, %arg1, %arg2, %0) : (tensor<2x1xi32>, tensor<2x1xi32>, tensor<2x1xi32>, tensor) -> tensor<2x3xi32> + return %1 : tensor<2x3xi32> + +// CHECK-LABEL: concatv2I64Axis +// CHECK: "tfl.concatenation"(%arg0, %arg1, %arg2) {axis = -1 : i32, fused_activation_function = "NONE"} : 
(tensor<2x1xi32>, tensor<2x1xi32>, tensor<2x1xi32>) -> tensor<2x3xi32> +} + func @resize_with_bilinear(%arg0: tensor<1x100x100x3xf32>, %arg1: tensor<4xi32>) -> tensor { %0 = "tf.ResizeBilinear"(%arg0, %arg1) {align_corners = true} : (tensor<1x100x100x3xf32>, tensor<4xi32>) -> tensor return %0 : tensor @@ -1193,15 +1222,14 @@ func @resize_nearest_neighbor(%arg0: tensor<1x100x100x3xf32>, %arg1: tensor<4xi3 %0 = "tf.ResizeNearestNeighbor"(%arg0, %arg1) {align_corners = true} : (tensor<1x100x100x3xf32>, tensor<4xi32>) -> tensor return %0 : tensor // CHECK-LABEL: resize_nearest_neighbor - // CHECK: "tfl.resize_nearest_neighbor"(%arg0, %arg1) {align_corners = true} : (tensor<1x100x100x3xf32>, tensor<4xi32>) -> tensor + // CHECK: "tfl.resize_nearest_neighbor"(%arg0, %arg1) {align_corners = true, half_pixel_centers = false} : (tensor<1x100x100x3xf32>, tensor<4xi32>) -> tensor } -// Note: half_pixel_centers isn't supported by TFLite, so it's not legalized. func @resize_nearest_neighbor_with_half_pixel_centers(%arg0: tensor<1x100x100x3xf32>, %arg1: tensor<4xi32>) -> tensor { - %0 = "tf.ResizeNearestNeighbor"(%arg0, %arg1) {align_corners = true, half_pixel_centers = true} : (tensor<1x100x100x3xf32>, tensor<4xi32>) -> tensor + %0 = "tf.ResizeNearestNeighbor"(%arg0, %arg1) {align_corners = false, half_pixel_centers = true} : (tensor<1x100x100x3xf32>, tensor<4xi32>) -> tensor return %0 : tensor // CHECK-LABEL: resize_nearest_neighbor_with_half_pixel_centers - // CHECK: "tf.ResizeNearestNeighbor"(%arg0, %arg1) {align_corners = true, half_pixel_centers = true} + // CHECK: "tfl.resize_nearest_neighbor"(%arg0, %arg1) {align_corners = false, half_pixel_centers = true} : (tensor<1x100x100x3xf32>, tensor<4xi32>) -> tensor } func @sparse_to_dense_with_scalar_sparse_indices(%arg0: tensor, %arg1: tensor<3xi32>, %arg2: tensor, %arg3: tensor) -> tensor { @@ -1296,10 +1324,12 @@ func @conv2d_backprop_input(%arg0: tensor<4xi32>, %arg1: tensor<3x3x1x32xf32>, % // CHECK-LABEL: conv2d_backprop_input // CHECK: %[[CST:.*]] = constant dense<[2, 0, 1, 3]> : tensor<4xi32> // CHECK: %[[ARG0:.*]] = "tfl.transpose"(%arg1, %[[CST]]) : (tensor<3x3x1x32xf32>, tensor<4xi32>) -> tensor<1x3x3x32xf32> - // CHECK: %[[ARG1:.*]] = "tfl.transpose_conv"(%arg0, %[[ARG0]], %arg2) {padding = "SAME", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<4xi32>, tensor<1x3x3x32xf32>, tensor<15x14x14x32xf32>) -> tensor<15x28x28x1xf32> + // CHECK: %[[CST_0:.*]] = constant unit + // CHECK: %[[ARG1:.*]] = "tfl.transpose_conv"(%arg0, %[[ARG0]], %arg2, %[[CST_0]]) {padding = "SAME", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<4xi32>, tensor<1x3x3x32xf32>, tensor<15x14x14x32xf32>, none) -> tensor<15x28x28x1xf32> // CHECK: %[[CST_1:.*]] = constant dense<[2, 0, 1, 3]> : tensor<4xi32> // CHECK: %[[ARG2:.*]] = "tfl.transpose"(%arg1, %[[CST_1]]) : (tensor<3x3x1x32xf32>, tensor<4xi32>) -> tensor<1x3x3x32xf32> - // CHECK: %[[ARG3:.*]] = "tfl.transpose_conv"(%arg0, %[[ARG2]], %arg2) {padding = "VALID", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<4xi32>, tensor<1x3x3x32xf32>, tensor<15x14x14x32xf32>) -> tensor<15x28x28x1xf32> + // CHECK: %[[CST_2:.*]] = constant unit + // CHECK: %[[ARG3:.*]] = "tfl.transpose_conv"(%arg0, %[[ARG2]], %arg2, %[[CST_2]]) {padding = "VALID", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<4xi32>, tensor<1x3x3x32xf32>, tensor<15x14x14x32xf32>, none) -> tensor<15x28x28x1xf32> // CHECK: %[[RESULT:.*]] = tfl.add %[[ARG1]], %[[ARG3]] {fused_activation_function = "NONE"} : tensor<15x28x28x1xf32> // CHECK: return %[[RESULT]] : 
tensor<15x28x28x1xf32> } @@ -1475,3 +1505,27 @@ func @broadcast_to_i32(%input: tensor<3xi32>, %shape: tensor<2xi32>) -> tensor<3 // CHECK: [[MUL:%.*]] = "tfl.mul"(%arg0, [[FILL]]) {fused_activation_function = "NONE"} : (tensor<3xi32>, tensor<3x3xi32>) -> tensor<3x3xi32> // CHECK: return [[MUL]] : tensor<3x3xi32> } + +func @matmul_batch(%arg0: tensor<10x15xf32>, %arg1: tensor<15x17xf32>) -> tensor<10x17xf32> { + %0 = "tf.BatchMatMul"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", device = "/device:CPU:0", name = "MatMul", adj_x = false, adj_y = false} : +(tensor<10x15xf32>, tensor<15x17xf32>) -> tensor<10x17xf32> + return %0 : tensor<10x17xf32> +// CHECK-LABEL: matmul_batch +// CHECK: "tfl.batch_matmul"(%arg0, %arg1) {adj_x = false, adj_y = false} : (tensor<10x15xf32>, tensor<15x17xf32>) -> tensor<10x17xf32> +} + +func @matmul_batchv2(%arg0: tensor<2x10x15xf32>, %arg1: tensor<15x17xf32>) -> tensor<2x10x17xf32> { + %0 = "tf.BatchMatMulV2"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", device = "/device:CPU:0", name = "MatMul", adj_x = false, adj_y = false} : +(tensor<2x10x15xf32>, tensor<15x17xf32>) -> tensor<2x10x17xf32> + return %0 : tensor<2x10x17xf32> +// CHECK-LABEL: matmul_batchv2 +// CHECK: "tfl.batch_matmul"(%arg0, %arg1) {adj_x = false, adj_y = false} : (tensor<2x10x15xf32>, tensor<15x17xf32>) -> tensor<2x10x17xf32> +} + +func @matmul_batchv2_unknown_dim(%arg0: tensor, %arg1: tensor<15x17xf32>) -> tensor { + %0 = "tf.BatchMatMulV2"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", device = "/device:CPU:0", name = "MatMul", adj_x = false, adj_y = false} : +(tensor, tensor<15x17xf32>) -> tensor + return %0 : tensor +// CHECK-LABEL: matmul_batchv2_unknown_dim +// CHECK: "tfl.batch_matmul"(%arg0, %arg1) {adj_x = false, adj_y = false} : (tensor, tensor<15x17xf32>) -> tensor +} diff --git a/tensorflow/compiler/mlir/lite/tests/lower-static-tensor-list.mlir b/tensorflow/compiler/mlir/lite/tests/lower-static-tensor-list.mlir index 221745b471c..9b1eeab3d7c 100644 --- a/tensorflow/compiler/mlir/lite/tests/lower-static-tensor-list.mlir +++ b/tensorflow/compiler/mlir/lite/tests/lower-static-tensor-list.mlir @@ -292,7 +292,7 @@ func @tensorlistResize(%arg0: tensor<3x10xf32>, %arg1: tensor<1xi32>, %arg2: ten // CHECK: [[SIZE_DIFF:%.*]] = "tf.Sub"([[SIZE]], [[INPUT_SIZE]]) : (tensor, tensor) -> tensor // CHECK: [[DIFF_RES:%.*]] = "tf.Greater"([[SIZE_DIFF]], [[ZERO]]) : (tensor, tensor) -> tensor // CHECK: [[SHAPE_1:%.*]] = "tf.Shape"([[INPUT]]) : (tensor<3x10xf32>) -> tensor -// CHECK: [[RESULT:%.*]] = "tf.If"([[DIFF_RES]], [[INPUT]], [[SHAPE_1]], [[SIZE_DIFF]], [[SIZE]]) {else_branch = @cond_false, is_stateless = true, output_shapes = ["{}"], then_branch = @cond_true} : (tensor, tensor<3x10xf32>, tensor, tensor, tensor) -> tensor +// CHECK: [[RESULT:%.*]] = "tf.If"([[DIFF_RES]], [[INPUT]], [[SHAPE_1]], [[SIZE_DIFF]], [[SIZE]]) {else_branch = @cond_false, is_stateless = true, output_shapes = [], then_branch = @cond_true} : (tensor, tensor<3x10xf32>, tensor, tensor, tensor) -> tensor // CHECK: return [[RESULT]] : tensor } diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/convolution_2d_transpose_bias.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/convolution_2d_transpose_bias.mlir deleted file mode 100644 index 9d134a3fcad..00000000000 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/convolution_2d_transpose_bias.mlir +++ /dev/null @@ -1,82 +0,0 @@ -// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -emit-custom-ops -o - | flatbuffer_to_string - | FileCheck --dump-input-on-failure 
%s -// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_translate --tflite-flatbuffer-to-mlir -o - | FileCheck --check-prefix=MLIR %s - - -func @main(%arg0: tensor<32x4x4x128xf32>, %arg1: tensor<1x32x42x128xf32>, %arg2: tensor<4xi32>) -> tensor<1x64x84x32xf32> { - -// CHECK: { -// CHECK-NEXT: version: 3, -// CHECK-NEXT: operator_codes: [ { -// CHECK-NEXT: builtin_code: CUSTOM, -// CHECK-NEXT: custom_code: "Convolution2DTransposeBias" -// CHECK-NEXT: } ], -// CHECK-NEXT: subgraphs: [ { -// CHECK-NEXT: tensors: [ { -// CHECK-NEXT: shape: [ 32, 4, 4, 128 ], -// CHECK-NEXT: buffer: 1, -// CHECK-NEXT: name: "arg0", -// CHECK-NEXT: quantization: { -// CHECK-EMPTY: -// CHECK-NEXT: } -// CHECK-NEXT: }, { -// CHECK-NEXT: shape: [ 1, 32, 42, 128 ], -// CHECK-NEXT: buffer: 2, -// CHECK-NEXT: name: "arg1", -// CHECK-NEXT: quantization: { -// CHECK-EMPTY: -// CHECK-NEXT: } -// CHECK-NEXT: }, { -// CHECK-NEXT: shape: [ 4 ], -// CHECK-NEXT: type: INT32, -// CHECK-NEXT: buffer: 3, -// CHECK-NEXT: name: "arg2", -// CHECK-NEXT: quantization: { -// CHECK-EMPTY: -// CHECK-NEXT: } -// CHECK-NEXT: }, { -// CHECK-NEXT: shape: [ 1, 64, 84, 32 ], -// CHECK-NEXT: buffer: 4, -// CHECK-NEXT: name: "tfl.convolution_2d_transpose_bias", -// CHECK-NEXT: quantization: { -// CHECK-EMPTY: -// CHECK-NEXT: } -// CHECK-NEXT: } ], -// CHECK-NEXT: inputs: [ 0, 1, 2 ], -// CHECK-NEXT: outputs: [ 3 ], -// CHECK-NEXT: operators: [ { -// CHECK-NEXT: inputs: [ 0, 1, 2 ], -// CHECK-NEXT: outputs: [ 3 ], -// CHECK-NEXT: custom_options: [ 1, 0, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0 ] -// CHECK-NEXT: } ], -// CHECK-NEXT: name: "main" -// CHECK-NEXT: } ], -// CHECK-NEXT: description: "MLIR Converted.", -// CHECK-NEXT: buffers: [ { -// CHECK-EMPTY: -// CHECK-NEXT: }, { -// CHECK-EMPTY: -// CHECK-NEXT: }, { -// CHECK-EMPTY: -// CHECK-NEXT: }, { -// CHECK-EMPTY: -// CHECK-NEXT: }, { -// CHECK-EMPTY: -// CHECK-NEXT: }, { -// CHECK-NEXT: data: [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] -// CHECK-NEXT: } ], -// CHECK-NEXT: metadata: [ { -// CHECK-NEXT: name: "min_runtime_version", -// CHECK-NEXT: buffer: 5 -// CHECK-NEXT: } ] -// CHECK-NEXT:} - -// MLIR-LABEL: func @main(%arg0: tensor<32x4x4x128xf32>, %arg1: tensor<1x32x42x128xf32>, %arg2: tensor<4xi32>) -// MLIR-SAME: -> tensor<1x64x84x32xf32> -// MLIR: %0 = "tfl.convolution_2d_transpose_bias"(%arg0, %arg1, %arg2) -// MLIR-SAME: {padding = "SAME", stride_h = 1 : i32, stride_w = 2 : i32} -// MLIR-SAME: (tensor<32x4x4x128xf32>, tensor<1x32x42x128xf32>, tensor<4xi32>) -> tensor<1x64x84x32xf32> -// MLIR-NEXT: return %0 : tensor<1x64x84x32xf32> - - %0 = "tfl.convolution_2d_transpose_bias"(%arg0, %arg1, %arg2) {padding = "SAME", stride_h = 1 : i32, stride_w = 2 : i32} : (tensor<32x4x4x128xf32>, tensor<1x32x42x128xf32>, tensor<4xi32>) -> tensor<1x64x84x32xf32> - return %0 : tensor<1x64x84x32xf32> -} diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/custom_op_with_tflite_op.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/custom_op_with_tflite_op.mlir index 1b46fa3d0e5..320f869ac4c 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/custom_op_with_tflite_op.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/custom_op_with_tflite_op.mlir @@ -65,7 +65,7 @@ func @main(tensor<4xf32>) -> tensor<4xf32> { // CHECK-NEXT: opcode_index: 1, // CHECK-NEXT: inputs: [ 2, 1 ], // CHECK-NEXT: outputs: [ 3 ], -// CHECK-NEXT: custom_options: [ 105, 110, 116, 95, 97, 116, 116, 114, 0, 102, 117, 115, 101, 100, 95, 97, 99, 116, 105, 118, 97, 116, 
105, 111, 110, 95, 102, 117, 110, 99, 116, 105, 111, 110, 0, 4, 82, 69, 76, 85, 0, 2, 33, 43, 2, 1, 2, 11, 2, 20, 4, 4, 36, 1 ] +// CHECK-NEXT: custom_options: [ 102, 117, 115, 101, 100, 95, 97, 99, 116, 105, 118, 97, 116, 105, 111, 110, 95, 102, 117, 110, 99, 116, 105, 111, 110, 0, 4, 82, 69, 76, 85, 0, 105, 110, 116, 95, 97, 116, 116, 114, 0, 2, 42, 11, 2, 1, 2, 20, 2, 20, 4, 4, 36, 1 ] // CHECK-NEXT: }, { // CHECK-NEXT: opcode_index: 2, // CHECK-NEXT: inputs: [ 3 ], diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/max_pooling_with_arg_max_2d.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/max_pooling_with_arg_max_2d.mlir deleted file mode 100644 index fc7ef307bae..00000000000 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/max_pooling_with_arg_max_2d.mlir +++ /dev/null @@ -1,71 +0,0 @@ -// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -emit-custom-ops -o - | flatbuffer_to_string - | FileCheck --dump-input-on-failure %s -// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_translate --tflite-flatbuffer-to-mlir -o - | FileCheck --check-prefix=MLIR %s - -func @main(%arg0: tensor<1x64x64x32xf32>) -> (tensor<1x32x32x32xf32>, tensor<1x32x32x32xf32>) { - -// CHECK: { -// CHECK-NEXT: version: 3, -// CHECK-NEXT: operator_codes: [ { -// CHECK-NEXT: builtin_code: CUSTOM, -// CHECK-NEXT: custom_code: "MaxPoolingWithArgmax2D" -// CHECK-NEXT: } ], -// CHECK-NEXT: subgraphs: [ { -// CHECK-NEXT: tensors: [ { -// CHECK-NEXT: shape: [ 1, 64, 64, 32 ], -// CHECK-NEXT: buffer: 1, -// CHECK-NEXT: name: "arg0", -// CHECK-NEXT: quantization: { -// CHECK-EMPTY: -// CHECK-NEXT: } -// CHECK-NEXT: }, { -// CHECK-NEXT: shape: [ 1, 32, 32, 32 ], -// CHECK-NEXT: buffer: 2, -// CHECK-NEXT: name: "tfl.max_pooling_with_argmax_2d", -// CHECK-NEXT: quantization: { -// CHECK-EMPTY: -// CHECK-NEXT: } -// CHECK-NEXT: }, { -// CHECK-NEXT: shape: [ 1, 32, 32, 32 ], -// CHECK-NEXT: buffer: 3, -// CHECK-NEXT: name: "tfl.max_pooling_with_argmax_2d:1", -// CHECK-NEXT: quantization: { -// CHECK-EMPTY: -// CHECK-NEXT: } -// CHECK-NEXT: } ], -// CHECK-NEXT: inputs: [ 0 ], -// CHECK-NEXT: outputs: [ 1, 2 ], -// CHECK-NEXT: operators: [ { -// CHECK-NEXT: inputs: [ 0 ], -// CHECK-NEXT: outputs: [ 1, 2 ], -// CHECK-NEXT: custom_options: [ 1, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] -// CHECK-NEXT: } ], -// CHECK-NEXT: name: "main" -// CHECK-NEXT: } ], -// CHECK-NEXT: description: "MLIR Converted.", -// CHECK-NEXT: buffers: [ { -// CHECK-EMPTY: -// CHECK-NEXT: }, { -// CHECK-EMPTY: -// CHECK-NEXT: }, { -// CHECK-EMPTY: -// CHECK-NEXT: }, { -// CHECK-EMPTY: -// CHECK-NEXT: }, { -// CHECK-NEXT: data: [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] -// CHECK-NEXT: } ], -// CHECK-NEXT: metadata: [ { -// CHECK-NEXT: name: "min_runtime_version", -// CHECK-NEXT: buffer: 4 -// CHECK-NEXT: } ] -// CHECK-NEXT:} - -// MLIR-LABEL: func @main(%arg0: tensor<1x64x64x32xf32>) -// MLIR-SAME: -> (tensor<1x32x32x32xf32>, tensor<1x32x32x32xf32>) -// MLIR: %value, %indices = "tfl.max_pooling_with_argmax_2d"(%arg0) -// MLIR-SAME: {filter_h = 4 : i32, filter_w = 2 : i32, padding = "SAME", stride_h = 2 : i32, stride_w = 1 : i32} -// MLIR-SAME: (tensor<1x64x64x32xf32>) -> (tensor<1x32x32x32xf32>, tensor<1x32x32x32xf32>) -// MLIR-NEXT: return %value, %indices : tensor<1x32x32x32xf32>, tensor<1x32x32x32xf32> - - %0, %1 = "tfl.max_pooling_with_argmax_2d"(%arg0) {filter_h = 4 : i32, filter_w = 2 : i32, padding = 
"SAME", stride_h = 2 : i32, stride_w = 1 : i32} : (tensor<1x64x64x32xf32>) -> (tensor<1x32x32x32xf32>, tensor<1x32x32x32xf32>) - return %0, %1 : tensor<1x32x32x32xf32>, tensor<1x32x32x32xf32> -} diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/max_unpool_2d.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/max_unpool_2d.mlir deleted file mode 100644 index 0dc6f7ea165..00000000000 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/max_unpool_2d.mlir +++ /dev/null @@ -1,71 +0,0 @@ -// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -emit-custom-ops -o - | flatbuffer_to_string - | FileCheck --dump-input-on-failure %s -// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_translate --tflite-flatbuffer-to-mlir -o - | FileCheck --check-prefix=MLIR %s - -func @main(%arg0: tensor<1x8x8x128xf32>, %arg1: tensor<1x8x8x128xf32>) -> tensor<1x8x8x128xf32> { - -// CHECK: { -// CHECK-NEXT: version: 3, -// CHECK-NEXT: operator_codes: [ { -// CHECK-NEXT: builtin_code: CUSTOM, -// CHECK-NEXT: custom_code: "MaxUnpooling2D" -// CHECK-NEXT: } ], -// CHECK-NEXT: subgraphs: [ { -// CHECK-NEXT: tensors: [ { -// CHECK-NEXT: shape: [ 1, 8, 8, 128 ], -// CHECK-NEXT: buffer: 1, -// CHECK-NEXT: name: "arg0", -// CHECK-NEXT: quantization: { -// CHECK-EMPTY: -// CHECK-NEXT: } -// CHECK-NEXT: }, { -// CHECK-NEXT: shape: [ 1, 8, 8, 128 ], -// CHECK-NEXT: buffer: 2, -// CHECK-NEXT: name: "arg1", -// CHECK-NEXT: quantization: { -// CHECK-EMPTY: -// CHECK-NEXT: } -// CHECK-NEXT: }, { -// CHECK-NEXT: shape: [ 1, 8, 8, 128 ], -// CHECK-NEXT: buffer: 3, -// CHECK-NEXT: name: "tfl.max_unpooling_2d", -// CHECK-NEXT: quantization: { -// CHECK-EMPTY: -// CHECK-NEXT: } -// CHECK-NEXT: } ], -// CHECK-NEXT: inputs: [ 0, 1 ], -// CHECK-NEXT: outputs: [ 2 ], -// CHECK-NEXT: operators: [ { -// CHECK-NEXT: inputs: [ 0, 1 ], -// CHECK-NEXT: outputs: [ 2 ], -// CHECK-NEXT: custom_options: [ 1, 0, 0, 0, 2, 0, 0, 0, 4, 0, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] -// CHECK-NEXT: } ], -// CHECK-NEXT: name: "main" -// CHECK-NEXT: } ], -// CHECK-NEXT: description: "MLIR Converted.", -// CHECK-NEXT: buffers: [ { -// CHECK-EMPTY: -// CHECK-NEXT: }, { -// CHECK-EMPTY: -// CHECK-NEXT: }, { -// CHECK-EMPTY: -// CHECK-NEXT: }, { -// CHECK-EMPTY: -// CHECK-NEXT: }, { -// CHECK-NEXT: data: [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] -// CHECK-NEXT: } ], -// CHECK-NEXT: metadata: [ { -// CHECK-NEXT: name: "min_runtime_version", -// CHECK-NEXT: buffer: 4 -// CHECK-NEXT: } ] -// CHECK-NEXT:} - -// MLIR-LABEL: func @main(%arg0: tensor<1x8x8x128xf32>, %arg1: tensor<1x8x8x128xf32>) -// MLIR-SAME: -> tensor<1x8x8x128xf32> -// MLIR: %0 = "tfl.max_unpooling_2d"(%arg0, %arg1) -// MLIR-SAME: {filter_h = 1 : i32, filter_w = 2 : i32, padding = "SAME", stride_h = 4 : i32, stride_w = 2 : i32} -// MLIR-SAME: (tensor<1x8x8x128xf32>, tensor<1x8x8x128xf32>) -> tensor<1x8x8x128xf32> -// MLIR-NEXT: return %0 : tensor<1x8x8x128xf32> - - %0 = "tfl.max_unpooling_2d"(%arg0, %arg1) {filter_h = 1 : i32, filter_w = 2 : i32, padding = "SAME", stride_h = 4 : i32, stride_w = 2 : i32} : (tensor<1x8x8x128xf32>, tensor<1x8x8x128xf32>) -> (tensor<1x8x8x128xf32>) - return %0 : tensor<1x8x8x128xf32> -} diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/transpose_conv_optional.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/transpose_conv_optional.mlir new file mode 100644 index 00000000000..621d10d9000 --- /dev/null +++ 
b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/transpose_conv_optional.mlir @@ -0,0 +1,77 @@ +// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_to_string - | FileCheck --dump-input-on-failure %s + +func @main(%arg0: tensor<4xi32>, %arg1: tensor<32x4x4x128xf32>, %arg2: tensor<1x32x42x128xf32>) -> tensor<1x64x84x32xf32> { +// CHECK: { +// CHECK-NEXT: version: 3, +// CHECK-NEXT: operator_codes: [ { +// CHECK-NEXT: builtin_code: TRANSPOSE_CONV, +// CHECK-NEXT: version: 1 +// CHECK-NEXT: } ], +// CHECK-NEXT: subgraphs: [ { +// CHECK-NEXT: tensors: [ { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: type: INT32, +// CHECK-NEXT: buffer: 1, +// CHECK-NEXT: name: "arg0", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 32, 4, 4, 128 ], +// CHECK-NEXT: buffer: 2, +// CHECK-NEXT: name: "arg1", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 1, 32, 42, 128 ], +// CHECK-NEXT: buffer: 3, +// CHECK-NEXT: name: "arg2", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 1, 64, 84, 32 ], +// CHECK-NEXT: buffer: 4, +// CHECK-NEXT: name: "tfl.transpose_conv", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: } ], +// CHECK-NEXT: inputs: [ 0, 1, 2 ], +// CHECK-NEXT: outputs: [ 3 ], +// CHECK-NEXT: operators: [ { +// CHECK-NEXT: inputs: [ 0, 1, 2 ], +// CHECK-NEXT: outputs: [ 3 ], +// CHECK-NEXT: builtin_options_type: TransposeConvOptions, +// CHECK-NEXT: builtin_options: { +// CHECK-NEXT: stride_w: 2, +// CHECK-NEXT: stride_h: 2 +// CHECK-NEXT: } +// CHECK-NEXT: } ], +// CHECK-NEXT: name: "main" +// CHECK-NEXT: } ], +// CHECK-NEXT: description: "MLIR Converted.", +// CHECK-NEXT: buffers: [ { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-NEXT: data: [ 49, 46, 57, 46, 48, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] +// CHECK-NEXT: } ], +// CHECK-NEXT: metadata: [ { +// CHECK-NEXT: name: "min_runtime_version", +// CHECK-NEXT: buffer: 5 +// CHECK-NEXT: } ] +// CHECK-NEXT:} + + %cst = constant unit + %0 = "tfl.transpose_conv"(%arg0, %arg1, %arg2, %cst) {padding = "SAME", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<4xi32>, tensor<32x4x4x128xf32>, tensor<1x32x42x128xf32>, none) -> tensor<1x64x84x32xf32> + return %0 : tensor<1x64x84x32xf32> +} diff --git a/tensorflow/compiler/mlir/lite/tests/ops.mlir b/tensorflow/compiler/mlir/lite/tests/ops.mlir index a85c7f2c8ff..f42e06350e5 100644 --- a/tensorflow/compiler/mlir/lite/tests/ops.mlir +++ b/tensorflow/compiler/mlir/lite/tests/ops.mlir @@ -192,7 +192,7 @@ func @testSquare(tensor) -> tensor { func @testQuantizedResizeNearestNeighbor(tensor>, tensor) -> tensor> { ^bb0(%arg0: tensor>, %arg1: tensor): - %0 = "tfl.resize_nearest_neighbor"(%arg0, %arg1) { align_corners = false } : (tensor>, tensor) -> tensor> + %0 = "tfl.resize_nearest_neighbor"(%arg0, %arg1) { align_corners = false, half_pixel_centers = false } : (tensor>, tensor) -> tensor> return %0 : tensor> } @@ -225,10 +225,10 @@ func @testZerosLike(tensor) -> tensor { } // CHECK-LABEL: testDequantize -func @testDequantize(tensor) -> tensor { -^bb0(%arg0: tensor): - // CHECK: "tfl.dequantize"(%arg0) : (tensor) -> tensor - %0 = "tfl.dequantize"(%arg0): (tensor) -> tensor +func @testDequantize(tensor>) -> tensor { 
+^bb0(%arg0: tensor>): + // CHECK: "tfl.dequantize"(%arg0) : (tensor>) -> tensor + %0 = "tfl.dequantize"(%arg0): (tensor>) -> tensor return %0 : tensor } @@ -277,6 +277,34 @@ func @testMul(tensor, tensor) -> tensor { return %0#0 : tensor } +// CHECK-LABEL: testMulNonQuantizedOperandsandQuantizedResult +func @testMulNonQuantizedOperandsandQuantizedResult(tensor, tensor) -> tensor> { +^bb0(%arg0: tensor, %arg1: tensor): + // CHECK: "tfl.mul"(%arg0, %arg1) {fused_activation_function = "RELU6"} + %0 = "tfl.mul"(%arg0, %arg1) {fused_activation_function = "RELU6"}: (tensor, tensor) -> tensor> + return %0#0 : tensor> +} + +// ----- + +func @testMulInvalidOperands(tensor, tensor) -> tensor { +^bb0(%arg0: tensor, %arg1: tensor): + // expected-error @+1 {{failed to verify that operands have same element type}} + %0 = "tfl.mul"(%arg0, %arg1) {fused_activation_function = "RELU6"}: (tensor, tensor) -> tensor + return %0#0 : tensor +} + +// ----- + +func @testMulInvalidQuantizedOperands(tensor<* x !quant.any>, tensor<* x !quant.any>) -> tensor<* x !quant.any> { +^bb0(%arg0: tensor<* x !quant.any>, %arg1: tensor<* x !quant.any>): + // expected-error @+1 {{failed to verify that operands have same element type}} + %0 = "tfl.mul"(%arg0, %arg1) {fused_activation_function = "RELU6"}: (tensor<* x !quant.any>, tensor<* x !quant.any>) -> tensor<* x !quant.any> + return %0#0 : tensor<* x !quant.any> +} + +// ----- + // CHECK-LABEL: testDiv func @testDiv(tensor, tensor) -> tensor { ^bb0(%arg0: tensor, %arg1: tensor): @@ -517,14 +545,16 @@ func @testMaxPool2DWrongOperandStorageType(tensor<1x7x7x16x!quant.uniform) -> (tensor<1x32x32x32xf32>, tensor<1x32x32x32xf32>) { - %0, %1 = "tfl.max_pooling_with_argmax_2d"(%arg0) {filter_h = 2 : i32, filter_w = 2 : i32, padding = "SAME", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<1x64x64x32xf32>) -> (tensor<1x32x32x32xf32>, tensor<1x32x32x32xf32>) + // custom op for "tfl.max_pooling_with_argmax_2d"(%arg0) {filter_h = 2 : i32, filter_w = 2 : i32, padding = "SAME", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<1x64x64x32xf32>) -> (tensor<1x32x32x32xf32>, tensor<1x32x32x32xf32>) + %0, %1 = "tfl.custom"(%arg0) {custom_option = opaque<"tfl", "0x01000000020000000200000002000000020000000000000000000000000000000000000000000000"> : tensor<40xi8>, custom_code = "MaxPoolingWithArgmax2D"} : (tensor<1x64x64x32xf32>) -> (tensor<1x32x32x32xf32>, tensor<1x32x32x32xf32>) return %0, %1 : tensor<1x32x32x32xf32>, tensor<1x32x32x32xf32> } // ----- func @testMaxUnpooling2D(%arg0: tensor<1x8x8x128xf32>, %arg1: tensor<1x8x8x128xf32>) -> tensor<1x8x8x128xf32> { - %0 = "tfl.max_unpooling_2d"(%arg0, %arg1) {filter_h = 2 : i32, filter_w = 2 : i32, padding = "SAME", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<1x8x8x128xf32>, tensor<1x8x8x128xf32>) -> (tensor<1x8x8x128xf32>) + // custom op for "tfl.max_unpooling_2d"(%arg0, %arg1) {filter_h = 2 : i32, filter_w = 2 : i32, padding = "SAME", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<1x8x8x128xf32>, tensor<1x8x8x128xf32>) -> (tensor<1x8x8x128xf32>) + %0 = "tfl.custom"(%arg0, %arg1) {custom_option = opaque<"tfl", "0x01000000020000000200000002000000020000000000000000000000000000000000000000000000"> : tensor<40xi8>, custom_code = "MaxUnpooling2D"} : (tensor<1x8x8x128xf32>, tensor<1x8x8x128xf32>) -> (tensor<1x8x8x128xf32>) return %0 : tensor<1x8x8x128xf32> } @@ -543,7 +573,7 @@ func @testLogistic(tensor<1x2x3x4x5xf32>) -> tensor<1x2x3x4x5xf32> { // test invalid Logistic input func @testLogisticWithWrongInputType(tensor) -> tensor { 
^bb0(%arg0: tensor): - // expected-error @+1 {{tfl.logistic' op operand #0 must be tensor of 32-bit float or QI8 type or QUI8 type or QI16 type or QUI16 type values}} + // expected-error @+1 {{'tfl.logistic' op operand #0 must be tensor of 32-bit float or QI8 type or QUI8 type or QI16 type or TFLite quint8 type values, but got 'tensor'}} %0 = "tfl.logistic"(%arg0): (tensor) -> tensor return %0#0 : tensor } @@ -609,9 +639,9 @@ func @testLstmIntermediates(%arg0: tensor<1x528x!quant.uniform, %arg1: none, %arg2: tensor, %arg3: tensor, %arg4: tensor, %arg5: tensor, %arg6: tensor, %arg7: tensor, %arg8: tensor, %arg9: tensor, %arg10: tensor, %arg11: tensor, %arg12: tensor, %arg13: tensor, %arg14: tensor, %arg15: tensor, %arg16: tensor, %arg17: tensor, %arg18: tensor, %arg19: tensor, %arg20: tensor, %arg21: tensor, %arg22: tensor, %arg23: tensor, %arg24: tensor, %arg25: tensor, %arg26: tensor, %arg27: tensor, %arg28: tensor, %arg29: tensor, %arg30: tensor, %arg31: tensor, %arg32: tensor, %arg33: tensor, %arg34: tensor, %arg35: tensor, %arg36: tensor, %arg37: tensor, %arg38: tensor, %arg39: tensor, %arg40: tensor, %arg41: tensor, %arg42: tensor, %arg43: tensor, %arg44: tensor, %arg45: tensor, %arg46: tensor, %arg47: tensor) -> tensor { - // CHECK: "tfl.bidirectional_sequence_lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23, %arg24, %arg25, %arg26, %arg27, %arg28, %arg29, %arg30, %arg31, %arg32, %arg33, %arg34, %arg35, %arg36, %arg37, %arg38, %arg39, %arg40, %arg41, %arg42, %arg43, %arg44, %arg45, %arg46, %arg47) {cell_clip = 1.000000e+00 : f32, fused_activation_function = "NONE", merge_outputs = true, time_major = false} : (tensor, none, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> (tensor, tensor) - %0:2 = "tfl.bidirectional_sequence_lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23, %arg24, %arg25, %arg26, %arg27, %arg28, %arg29, %arg30, %arg31, %arg32, %arg33, %arg34, %arg35, %arg36, %arg37, %arg38, %arg39, %arg40, %arg41, %arg42, %arg43, %arg44, %arg45, %arg46, %arg47) {cell_clip = 1.000000e+00 : f32, fused_activation_function = "NONE", merge_outputs = true, time_major = false} : (tensor, none, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> (tensor, tensor) +func @testBidirectionalSequenceLstm(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor, %arg4: tensor, %arg5: tensor, %arg6: tensor, %arg7: tensor, %arg8: tensor, %arg9: tensor, %arg10: tensor, %arg11: tensor, %arg12: tensor, %arg13: tensor, %arg14: tensor, %arg15: tensor, %arg16: tensor, %arg17: tensor, %arg18: tensor, %arg19: tensor, %arg20: tensor, %arg21: tensor, %arg22: tensor, %arg23: tensor, %arg24: tensor, %arg25: 
tensor, %arg26: tensor, %arg27: tensor, %arg28: tensor, %arg29: tensor, %arg30: tensor, %arg31: tensor, %arg32: tensor, %arg33: tensor, %arg34: tensor, %arg35: tensor, %arg36: tensor, %arg37: tensor, %arg38: tensor, %arg39: tensor, %arg40: tensor, %arg41: tensor, %arg42: tensor, %arg43: tensor, %arg44: tensor, %arg45: tensor, %arg46: tensor, %arg47: tensor) -> tensor { + // CHECK: "tfl.bidirectional_sequence_lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23, %arg24, %arg25, %arg26, %arg27, %arg28, %arg29, %arg30, %arg31, %arg32, %arg33, %arg34, %arg35, %arg36, %arg37, %arg38, %arg39, %arg40, %arg41, %arg42, %arg43, %arg44, %arg45, %arg46, %arg47) {cell_clip = 1.000000e+00 : f32, fused_activation_function = "NONE", merge_outputs = true, time_major = false} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> (tensor, tensor) + %0:2 = "tfl.bidirectional_sequence_lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23, %arg24, %arg25, %arg26, %arg27, %arg28, %arg29, %arg30, %arg31, %arg32, %arg33, %arg34, %arg35, %arg36, %arg37, %arg38, %arg39, %arg40, %arg41, %arg42, %arg43, %arg44, %arg45, %arg46, %arg47) {cell_clip = 1.000000e+00 : f32, fused_activation_function = "NONE", merge_outputs = true, time_major = false} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> (tensor, tensor) return %0#0 : tensor } @@ -1222,10 +1252,10 @@ func @testOneHot(%arg0: tensor<3xi32>, %arg1: tensor, %arg2: tensor, % // ----- -func @testOneHotWithInvalidOutputType(%arg0: tensor<3xi32>, %arg1: tensor, %arg2: tensor, %arg3: tensor) -> tensor<*xi8> { - // expected-error @+1 {{'tfl.one_hot' op result #0 must be tensor of 32-bit float or 32-bit signless integer or 64-bit signless integer or 1-bit signless integer values}} - %0 = "tfl.one_hot"(%arg0, %arg1, %arg2, %arg3) {axis = -1 : i32} : (tensor<3xi32>, tensor, tensor, tensor) -> tensor<*xi8> - return %0 : tensor<*xi8> +func @testOneHotWithInvalidOutputType(%arg0: tensor<3xi32>, %arg1: tensor, %arg2: tensor, %arg3: tensor) -> tensor<*xi16> { + // expected-error @+1 {{'tfl.one_hot' op result #0 must be tensor of 32-bit float or 32-bit signless integer or 64-bit signless integer or 1-bit signless integer or 8-bit signless integer or 8-bit unsigned integer values, but got 'tensor<*xi16>'}} + %0 = "tfl.one_hot"(%arg0, %arg1, %arg2, %arg3) {axis = -1 : i32} : (tensor<3xi32>, tensor, tensor, tensor) -> tensor<*xi16> + return %0 : tensor<*xi16> } // ----- @@ -1444,22 +1474,23 @@ func @testRelu6WithQuantizedTypes(%arg0 : tensor<10x!quant.uniform> // ----- -func @testEmbeddingLookup(%arg0 : tensor, %arg1 : tensor) -> tensor { - %0 = 
"tfl.embedding_lookup"(%arg0, %arg1) : (tensor,tensor) -> tensor +func @testEmbeddingLookup(%arg0 : tensor, %arg1 : tensor) -> tensor { + %0 = "tfl.embedding_lookup"(%arg0, %arg1) : (tensor,tensor) -> tensor return %0 : tensor } // ----- -func @testEmbeddingLookupValueAndResultElementTypeTraitFailed(%arg0 : tensor, %arg1 : tensor) -> tensor { +func @testEmbeddingLookupValueAndResultElementTypeTraitFailed(%arg0 : tensor, %arg1 : tensor) -> tensor { // expected-error @+1 {{'tfl.embedding_lookup' op failed to verify that value and output must have same element type}} - %0 = "tfl.embedding_lookup"(%arg0, %arg1) : (tensor,tensor) -> tensor + %0 = "tfl.embedding_lookup"(%arg0, %arg1) : (tensor,tensor) -> tensor return %0 : tensor } // ----- -func @testQuantizedLocalResponseNormalization(%arg0 : tensor<1x56x56x192x!quant.uniform>) -> tensor<1x56x56x192x!quant.uniform> { +func @testWrongQuantizedLocalResponseNormalization(%arg0 : tensor<1x56x56x192x!quant.uniform>) -> tensor<1x56x56x192x!quant.uniform> { + // expected-error @+1 {{'tfl.local_response_normalization' op operand #0 must be tensor of 32-bit float values, but got 'tensor<1x56x56x192x!quant.uniform>'}} %0 = "tfl.local_response_normalization"(%arg0) {alpha = 9.99999974E-5 : f32, beta = 5.000000e-01 : f32, bias = 2.000000e+00 : f32, radius = 5 : i32} : (tensor<1x56x56x192x!quant.uniform>) -> tensor<1x56x56x192x!quant.uniform> return %0 : tensor<1x56x56x192x!quant.uniform> } @@ -1493,32 +1524,32 @@ func @testDepthToSpaceInvalidOutputType(%arg0: tensor<1x1x1x4xf32>) -> tensor<1x // ----- -func @testPReluWrongOutputRank(%arg0: tensor<10x10x10x10xf32>, %arg1: tensor<1x1x10xf32>) -> tensor<10x10x10xf32> { - // expected-error @+1 {{'input' and 'output' should have the same rank}} - %0 = "tfl.prelu"(%arg0, %arg1) : (tensor<10x10x10x10xf32>, tensor<1x1x10xf32>) -> tensor<10x10x10xf32> - return %0 : tensor<10x10x10xf32> +func @testPReluWrongOutputRank(%arg0: tensor<10x10x10x10xf32>, %arg1: tensor<10x10x10x10xf32>) -> tensor<10x10xf32> { + // expected-error @+1 {{'tfl.prelu' op result type '10x10' not broadcast compatible with broadcasted operands's shapes '10x10x10x10'}} + %0 = "tfl.prelu"(%arg0, %arg1) : (tensor<10x10x10x10xf32>, tensor<10x10x10x10xf32>) -> tensor<10x10xf32> + return %0 : tensor<10x10xf32> } // ----- func @testPReluWrongOutputShape(%arg0: tensor<1x2x3x4xf32>, %arg1: tensor<2x3x4xf32>) -> tensor<1x2x3x5xf32> { - // expected-error @+1 {{'input' and 'output' should have the same shape}} + // expected-error @+1 {{'tfl.prelu' op result type '1x2x3x5' not broadcast compatible with broadcasted operands's shapes '1x2x3x4'}} %0 = "tfl.prelu"(%arg0, %arg1) : (tensor<1x2x3x4xf32>, tensor<2x3x4xf32>) -> tensor<1x2x3x5xf32> return %0 : tensor<1x2x3x5xf32> } // ----- -func @testPReluWrongAlphaRank(%arg0: tensor<7x3x2x14xf32>, %arg1: tensor<2x7x3x2x14xf32>) -> tensor<7x3x2x14xf32> { +func @testPReluWrongAlphaRank(%arg0: tensor<7x3x2x14xf32>, %arg1: tensor<7x3x2x14xf32>) -> tensor<7x3x2x14xf32> { // expected-error @+1 {{'alpha' should have one less rank than 'input'.}} - %0 = "tfl.prelu"(%arg0, %arg1) : (tensor<7x3x2x14xf32>, tensor<2x7x3x2x14xf32>) -> tensor<7x3x2x14xf32> + %0 = "tfl.prelu"(%arg0, %arg1) : (tensor<7x3x2x14xf32>, tensor<7x3x2x14xf32>) -> tensor<7x3x2x14xf32> return %0 : tensor<7x3x2x14xf32> } // ----- func @testPReluInvalidBroadcast(%arg0: tensor<15x14x2x14xf32>, %arg1: tensor<1x1x3xf32>) -> tensor<15x14x2x14xf32> { - // expected-error @+1 {{'alpha' is not broadcastable at dimension 2.}} + // expected-error @+1 {{'tfl.prelu' op 
operands don't have broadcast-compatible shapes}} %0 = "tfl.prelu"(%arg0, %arg1) : (tensor<15x14x2x14xf32>, tensor<1x1x3xf32>) -> tensor<15x14x2x14xf32> return %0 : tensor<15x14x2x14xf32> } @@ -2032,22 +2063,34 @@ func @testFullyConnectedWithBadOutputShape(%arg0: tensor<1x37xf32>, %arg1: tenso // ----- func @testTransposeConv(%arg0: tensor<4xi32>, %arg1: tensor<32x4x4x128xf32>, %arg2: tensor<1x32x42x128xf32>) -> tensor<1x64x84x32xf32> { - %0 = "tfl.transpose_conv"(%arg0, %arg1, %arg2) {padding = "SAME", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<4xi32>, tensor<32x4x4x128xf32>, tensor<1x32x42x128xf32>) -> tensor<1x64x84x32xf32> + %cst = constant unit + %0 = "tfl.transpose_conv"(%arg0, %arg1, %arg2, %cst) {padding = "SAME", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<4xi32>, tensor<32x4x4x128xf32>, tensor<1x32x42x128xf32>, none) -> tensor<1x64x84x32xf32> return %0 : tensor<1x64x84x32xf32> } // ----- func @testConvolution2DTransposeBias(%arg0: tensor<32x4x4x128xf32>, %arg1: tensor<1x32x42x128xf32>, %arg2: tensor<4xi32>) -> tensor<1x64x84x32xf32> { - %0 = "tfl.convolution_2d_transpose_bias"(%arg0, %arg1, %arg2) {padding = "SAME", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<32x4x4x128xf32>, tensor<1x32x42x128xf32>, tensor<4xi32>) -> tensor<1x64x84x32xf32> + // custom op for "tfl.convolution_2d_transpose_bias"(%arg0, %arg1, %arg2) {padding = "SAME", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<32x4x4x128xf32>, tensor<1x32x42x128xf32>, tensor<4xi32>) -> tensor<1x64x84x32xf32> + %0 = "tfl.custom"(%arg0, %arg1, %arg2) {custom_option = opaque<"tfl", "0x010000000200000002000000"> : tensor<12xi8>, custom_code = "Convolution2DTransposeBias"} : (tensor<32x4x4x128xf32>, tensor<1x32x42x128xf32>, tensor<4xi32>) -> tensor<1x64x84x32xf32> + return %0 : tensor<1x64x84x32xf32> +} + +// ----- + +func @testConvolution2DTransposeNoBias(%arg0: tensor<32x4x4x128xf32>, %arg1: tensor<1x32x42x128xf32>) -> tensor<1x64x84x32xf32> { + %cst = constant unit + // custom op for "tfl.convolution_2d_transpose_bias"(%arg0, %arg1, %cst) {padding = "SAME", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<32x4x4x128xf32>, tensor<1x32x42x128xf32>, none) -> tensor<1x64x84x32xf32> + %0 = "tfl.custom"(%arg0, %arg1, %cst) {custom_option = opaque<"tfl", "0x010000000200000002000000"> : tensor<12xi8>, custom_code = "Convolution2DTransposeBias"} : (tensor<32x4x4x128xf32>, tensor<1x32x42x128xf32>, none) -> tensor<1x64x84x32xf32> return %0 : tensor<1x64x84x32xf32> } // ----- func @testTransposeConvBadOutputRank(%arg0: tensor<4xi32>, %arg1: tensor<32x4x4x128xf32>, %arg2: tensor<1x32x42x128xf32>) -> tensor<64x84x32xf32> { + %cst = constant unit // expected-error @+1 {{expect output type has rank = 4, got output type tensor<64x84x32xf32>}} - %0 = "tfl.transpose_conv"(%arg0, %arg1, %arg2) {padding = "SAME", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<4xi32>, tensor<32x4x4x128xf32>, tensor<1x32x42x128xf32>) -> tensor<64x84x32xf32> + %0 = "tfl.transpose_conv"(%arg0, %arg1, %arg2, %cst) {padding = "SAME", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<4xi32>, tensor<32x4x4x128xf32>, tensor<1x32x42x128xf32>, none) -> tensor<64x84x32xf32> return %0 : tensor<64x84x32xf32> } @@ -2055,8 +2098,9 @@ func @testTransposeConvBadOutputRank(%arg0: tensor<4xi32>, %arg1: tensor<32x4x4x func @testTransposeConvBadOutputShape(%arg1: tensor<32x4x4x128xf32>, %arg2: tensor<1x32x42x128xf32>) -> tensor<1x64x84x31xf32> { %cst = constant dense<[1, 64, 84, 32]> : tensor<4xi32> + %cst_1 = constant unit // expected-error @+1 {{expect output 
type tensor<1x64x84x32xf32>, got tensor<1x64x84x31xf32>}} - %0 = "tfl.transpose_conv"(%cst, %arg1, %arg2) {padding = "SAME", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<4xi32>, tensor<32x4x4x128xf32>, tensor<1x32x42x128xf32>) -> tensor<1x64x84x31xf32> + %0 = "tfl.transpose_conv"(%cst, %arg1, %arg2, %cst_1) {padding = "SAME", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<4xi32>, tensor<32x4x4x128xf32>, tensor<1x32x42x128xf32>, none) -> tensor<1x64x84x31xf32> return %0 : tensor<1x64x84x31xf32> } diff --git a/tensorflow/compiler/mlir/lite/tests/optimize.mlir b/tensorflow/compiler/mlir/lite/tests/optimize.mlir index d1ead351005..2815afd14b9 100644 --- a/tensorflow/compiler/mlir/lite/tests/optimize.mlir +++ b/tensorflow/compiler/mlir/lite/tests/optimize.mlir @@ -439,6 +439,31 @@ func @NotReorderReshapeAddIfNotTailingDim(%arg0: tensor<40x40x1xf32>) -> tensor< // CHECK: return %[[rs2]] } +// CHECK-LABEL: @ReorderElementwiseValueOpAndMoveOp +func @ReorderElementwiseValueOpAndMoveOp(%arg0: tensor<40x40x1xf32>) -> tensor<40x40xf32> { + %shape = constant dense<[40, 40]> : tensor<2xi32> + %1 = "tfl.reshape"(%arg0, %shape) : (tensor<40x40x1xf32>, tensor<2xi32>) -> tensor<40x40xf32> + %2 = "tfl.relu"(%1) : (tensor<40x40xf32>) -> tensor<40x40xf32> + return %2 : tensor<40x40xf32> + + // CHECK: %[[rs1:.*]] = "tfl.relu"(%arg0 + // CHECK: %[[rs2:.*]] = "tfl.reshape"(%[[rs1]] + // CHECK: return %[[rs2]] +} + +// CHECK-LABEL: @NotReorderElementwiseValueOpAndMoveOp +func @NotReorderElementwiseValueOpAndMoveOp(%arg0: tensor<40x40x1xf32>) -> (tensor<40x40xf32>, tensor<40x40xf32>) { + %shape = constant dense<[40, 40]> : tensor<2xi32> + %1 = "tfl.reshape"(%arg0, %shape) : (tensor<40x40x1xf32>, tensor<2xi32>) -> tensor<40x40xf32> + %2 = "tfl.relu"(%1) : (tensor<40x40xf32>) -> tensor<40x40xf32> + return %1, %2 : tensor<40x40xf32>, tensor<40x40xf32> + + // CHECK: %[[rs1:.*]] = "tfl.reshape"(%arg0 + // CHECK: %[[rs2:.*]] = "tfl.relu"(%[[rs1]] + // CHECK: return %[[rs1]], %[[rs2]] +} + + // CHECK-LABEL: @FuseFullyConnectedRelu func @FuseFullyConnectedRelu(%arg0: tensor<1x256xf32>, %arg1: tensor<128x256xf32>, %arg2: tensor<128xf32>) -> tensor<1x128xf32> { %0 = "tfl.fully_connected" (%arg0, %arg1, %arg2) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<1x256xf32>, tensor<128x256xf32>, tensor<128xf32>) -> tensor<1x128xf32> @@ -450,6 +475,28 @@ func @FuseFullyConnectedRelu(%arg0: tensor<1x256xf32>, %arg1: tensor<128x256xf32 // CHECK: return %[[RES]] } +// CHECK-LABEL: @FuseFullyConnectedRelu6 +func @FuseFullyConnectedRelu6(%arg0: tensor<1x256xf32>, %arg1: tensor<128x256xf32>, %arg2: tensor<128xf32>) -> tensor<1x128xf32> { + %0 = "tfl.fully_connected" (%arg0, %arg1, %arg2) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<1x256xf32>, tensor<128x256xf32>, tensor<128xf32>) -> tensor<1x128xf32> + %1 = "tfl.relu6"(%0) : (tensor<1x128xf32>) -> tensor<1x128xf32> + return %1 : tensor<1x128xf32> + + // CHECK: %[[RES:[0-9].*]] = "tfl.fully_connected" + // CHECK-SAME: fused_activation_function = "RELU6" + // CHECK: return %[[RES]] +} + +// CHECK-LABEL: @FuseFullyConnectedRelu1 +func @FuseFullyConnectedRelu1(%arg0: tensor<1x256xf32>, %arg1: tensor<128x256xf32>, %arg2: tensor<128xf32>) -> tensor<1x128xf32> { + %0 = "tfl.fully_connected" (%arg0, %arg1, %arg2) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<1x256xf32>, tensor<128x256xf32>, tensor<128xf32>) -> tensor<1x128xf32> + %1 = 
"tfl.relu_n1_to_1"(%0) : (tensor<1x128xf32>) -> tensor<1x128xf32> + return %1 : tensor<1x128xf32> + + // CHECK: %[[RES:[0-9].*]] = "tfl.fully_connected" + // CHECK-SAME: fused_activation_function = "RELU_N1_TO_1" + // CHECK: return %[[RES]] +} + // CHECK-LABEL: @HardSwishPattern func @HardSwishPattern(%arg0: tensor<1xf32>) -> tensor<1xf32> { %three = constant dense<3.> : tensor @@ -911,3 +958,16 @@ func @FusingdivRelu(%arg0: tensor<1xf32>, %arg1: tensor<1xf32>) -> tensor<1xf32> // Fusing: %[[div2:[0-9].*]] = tfl.div %[[relu]], %[[div1]] {fused_activation_function = "RELU6"} : tensor<1xf32> // Fusing: return } + +func @ReorderAddWithConstant(%arg0: tensor<2x2xf32>) -> tensor<2x2xf32> { + %cst = constant dense<1.0> : tensor<2x2xf32> + %cst_1 = constant dense<2.0> : tensor<2x2xf32> + %0 = "tfl.add"(%arg0, %cst) {fused_activation_function = "NONE"} : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> + %1 = "tfl.add"(%0, %cst_1) {fused_activation_function = "NONE"} : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> + return %1 : tensor<2x2xf32> + + // CHECK-LABEL: ReorderAddWithConstant + // CHECK: %[[CONST:.*]] = constant dense<3.000000e+00> : tensor<2x2xf32> + // CHECK: %[[RESULT:.*]] = tfl.add %arg0, %[[CONST]] {fused_activation_function = "NONE"} : tensor<2x2xf32> +} + diff --git a/tensorflow/compiler/mlir/lite/tests/post-quantize.mlir b/tensorflow/compiler/mlir/lite/tests/post-quantize.mlir index 5377c4fdb98..6573a2f1c36 100644 --- a/tensorflow/compiler/mlir/lite/tests/post-quantize.mlir +++ b/tensorflow/compiler/mlir/lite/tests/post-quantize.mlir @@ -19,6 +19,16 @@ func @RemoveUnused(%arg0: tensor<4xf32>, %arg1: tensor) -> (tensor<2xf32>,t // CHECK-NEXT: return %[[split]]#0, %[[split]]#1 } +// CHECK-LABEL: RemoveTrival +func @RemoveTrival(%arg0: tensor<384x512x!quant.uniform>, %arg1: tensor<128x512x!quant.uniform:f32, 1.0>>, %arg2: none) -> tensor<384x128x!quant.uniform> { + %1 = "tfl.fully_connected"(%arg0, %arg1, %arg2) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<384x512x!quant.uniform>, tensor<128x512x!quant.uniform:f32, 1.0>>, none) -> tensor<384x128x!quant.uniform> + %2 = "tfl.quantize"(%1) {qtype = tensor<384x128x!quant.uniform>} : (tensor<384x128x!quant.uniform>) -> tensor<384x128x!quant.uniform> + return %2 : tensor<384x128x!quant.uniform> + +// CHECK-NEXT: %[[fc:.*]] = "tfl.fully_connected"{{.*}} -> tensor<384x128x!quant.uniform> +// CHECK-NEXT: return %[[fc]] +} + func @main(%arg0: tensor<1x224x224x3xf32>) -> tensor<1x1001xf32> { %cst = constant dense<[1, 1001]> : tensor<2xi32> %0 = "tfl.quantize"(%arg0) {qtype = tensor<1x224x224x3x!quant.uniform>} : (tensor<1x224x224x3xf32>) -> tensor<1x224x224x3x!quant.uniform> diff --git a/tensorflow/compiler/mlir/lite/tests/prepare-tf.mlir b/tensorflow/compiler/mlir/lite/tests/prepare-tf.mlir index 5e456b1a7e5..3af0b25a8e3 100644 --- a/tensorflow/compiler/mlir/lite/tests/prepare-tf.mlir +++ b/tensorflow/compiler/mlir/lite/tests/prepare-tf.mlir @@ -289,8 +289,8 @@ func @QDQFollowedByRank(%arg0: tensor<1x2xf32>) -> (tensor) { %2 = "tf.Rank"(%1): (tensor<1x2xf32>) -> tensor return %2 : tensor -// CHECK: %[[R:.*]] = "tf.Rank"(%arg0) -// CHECK-NEXT: return %[[R]] : tensor +// CHECK: %[[R:.*]] = constant dense<2> +// CHECK: return %cst : tensor } // CHECK-LABEL: fakeQuantWithConv2D @@ -418,14 +418,10 @@ func @matmulNoTransposeAOrB(%arg0: tensor<1x1280xf32>, %arg1: tensor<1280x1000xf return %166 : tensor<1x1000xf32> // CHECK-LABEL: matmulNoTransposeAOrB - // CHECK: %cst = constant 
dense<0> : tensor - // CHECK: %cst_0 = constant dense<-1> : tensor - // CHECK: %cst_1 = constant dense<1> : tensor - // CHECK: %0 = "tf.Rank"(%arg1) : (tensor<1280x1000xf32>) -> tensor - // CHECK: %1 = "tf.Range"(%0, %cst, %cst_0) : (tensor, tensor, tensor) -> tensor - // CHECK: %2 = "tf.Sub"(%1, %cst_1) : (tensor, tensor) -> tensor - // CHECK: %3 = "tf.Transpose"(%arg1, %2) : (tensor<1280x1000xf32>, tensor) -> tensor<*xf32> - // CHECK: %4 = "tf.MatMul"(%arg0, %3) {transpose_a = false, transpose_b = true} : (tensor<1x1280xf32>, tensor<*xf32>) -> tensor<1x1000xf32> + // CHECK: %0 = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi32>} : () -> tensor + // CHECK: %1 = "tf.Transpose"(%arg1, %0) : (tensor<1280x1000xf32>, tensor) -> tensor<*xf32> + // CHECK: %2 = "tf.MatMul"(%arg0, %1) {transpose_a = false, transpose_b = true} : (tensor<1x1280xf32>, tensor<*xf32>) -> tensor<1x1000xf32> + // CHECK: return %2 : tensor<1x1000xf32> } func @matmulNoTransposeB(%arg0: tensor<1x1280xf32>, %arg1: tensor<1280x1000xf32>) -> tensor<1x1000xf32> { @@ -433,18 +429,12 @@ func @matmulNoTransposeB(%arg0: tensor<1x1280xf32>, %arg1: tensor<1280x1000xf32> return %166 : tensor<1x1000xf32> // CHECK-LABEL: matmulNoTransposeB - // CHECK: %cst = constant dense<0> : tensor - // CHECK: %cst_0 = constant dense<-1> : tensor - // CHECK: %cst_1 = constant dense<1> : tensor - // CHECK: %0 = "tf.Rank"(%arg0) : (tensor<1x1280xf32>) -> tensor - // CHECK: %1 = "tf.Range"(%0, %cst, %cst_0) : (tensor, tensor, tensor) -> tensor - // CHECK: %2 = "tf.Sub"(%1, %cst_1) : (tensor, tensor) -> tensor - // CHECK: %3 = "tf.Transpose"(%arg0, %2) : (tensor<1x1280xf32>, tensor) -> tensor<*xf32> - // CHECK: %4 = "tf.Rank"(%arg1) : (tensor<1280x1000xf32>) -> tensor - // CHECK: %5 = "tf.Range"(%4, %cst, %cst_0) : (tensor, tensor, tensor) -> tensor - // CHECK: %6 = "tf.Sub"(%5, %cst_1) : (tensor, tensor) -> tensor - // CHECK: %7 = "tf.Transpose"(%arg1, %6) : (tensor<1280x1000xf32>, tensor) -> tensor<*xf32> - // CHECK: %8 = "tf.MatMul"(%3, %7) {transpose_a = false, transpose_b = true} : (tensor<*xf32>, tensor<*xf32>) -> tensor<1x1000xf32> + // CHECK: %0 = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi32>} : () -> tensor + // CHECK: %1 = "tf.Transpose"(%arg0, %0) : (tensor<1x1280xf32>, tensor) -> tensor<*xf32> + // CHECK: %2 = "tf.Transpose"(%arg1, %0) : (tensor<1280x1000xf32>, tensor) -> tensor<*xf32> + // CHECK: %3 = "tf.MatMul"(%1, %2) {transpose_a = false, transpose_b = true} : (tensor<*xf32>, tensor<*xf32>) -> tensor<1x1000xf32> + // CHECK: return %3 : tensor<1x1000xf32> + } func @snapshot(%arg0: tensor<3xi32>) -> tensor<3xi32> { diff --git a/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc b/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc index 57f15719cfd..d3f1a430642 100644 --- a/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc +++ b/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc @@ -48,7 +48,8 @@ void AddQuantizationPasses(const mlir::TFL::QuantizationSpecs& quant_specs, quant_specs.default_ranges.second.hasValue()) { pass_manager->addPass(mlir::TFL::CreateDefaultQuantParamsPass( quant_specs.default_ranges.first.getValueOr(0.0), - quant_specs.default_ranges.second.getValueOr(0.0))); + quant_specs.default_ranges.second.getValueOr(0.0), + quant_specs.IsSignedInferenceType())); pass_manager->addPass(mlir::TFL::CreateQuantizePass()); pass_manager->addPass( mlir::TFL::CreatePostQuantizePass(emit_quant_adaptor_ops)); @@ -73,6 +74,11 @@ void AddTFToTFLConversionPasses(const mlir::TFL::PassConfig& pass_config, 
pass_manager->addPass(mlir::TFControlFlow::CreateRaiseTFControlFlowPass()); } + if (pass_config.shape_inference) { + pass_manager->addPass(mlir::TF::CreateTFShapeInferencePass()); + } + // Keep this pass after the shape inference pass, which couldn't do shape + // inference for non-tf ops. if (!pass_config.quant_specs.serialized_quant_stats.empty()) { pass_manager->addPass( mlir::quant::CreateImportQuantStatsPassForTFControlDialect( @@ -80,26 +86,10 @@ void AddTFToTFLConversionPasses(const mlir::TFL::PassConfig& pass_config, } // The conversion pipeline has to follow the following orders: - // 1) Try to convert ophint nodes if present first like ophint lstm. - // 2) Saved model related optimization like decompose resource ops - // 3) Convert composite functions like lstm/rnns, along with proper function + // 1) Saved model related optimization like decompose resource ops + // 2) Convert composite functions like lstm/rnns, along with proper function // inlining & dce. - // 4) Lower static tensor list pass. - - // The ophint extractions happen before lots of other passes: - // The assumption of ophint-extraction is each ophinted region is a black-box - // and nodes within this black-box is NOT connected to the nodes OUTSIDE the - // black-box. - // Some passes may merge nodes together (such as const nodes), however, this - // will break the ophint-extraction assumption. (The nodes within the black - // box is not isolated anymore). - // So ophint extraction and legalization needs to happen before - // the canonicalization pass. - if (pass_config.emit_builtin_tflite_ops) { - pass_manager->addPass(mlir::TFL::CreateExtractOphintPass()); - // Convert composite op pass will happen after ophint extraction pass. - pass_manager->addPass(mlir::TFL::CreateLegalizeOphintFuncOpPass()); - } + // 3) Lower static tensor list pass. // This decomposes resource ops like ResourceGather into read-variable op // followed by gather. This is used when the saved model import path is used diff --git a/tensorflow/compiler/mlir/lite/tf_tfl_translate.cc b/tensorflow/compiler/mlir/lite/tf_tfl_translate.cc index ab9baefacaf..fce1333a491 100644 --- a/tensorflow/compiler/mlir/lite/tf_tfl_translate.cc +++ b/tensorflow/compiler/mlir/lite/tf_tfl_translate.cc @@ -130,6 +130,8 @@ int main(int argc, char **argv) { // interface. That also means we need to relay the value set in one option to // all its aliases. mlir::registerAsmPrinterCLOptions(); + mlir::registerMLIRContextCLOptions(); + mlir::registerPassManagerCLOptions(); llvm::cl::ParseCommandLineOptions( argc, argv, "TF GraphDef to TFLite FlatBuffer converter\n"); @@ -158,6 +160,11 @@ int main(int argc, char **argv) { absl::StrSplit(saved_model_exported_names, ',', absl::SkipEmpty()); absl::Span exported_names(exported_names_vector); + if (exported_names.size() != 1) { + llvm::errs() << "There should be only one exported name"; + return kTrFailure; + } + module = tensorflow::ImportSavedModel(input_file_name, saved_model_version, tags, exported_names, &context); } else { @@ -173,6 +180,7 @@ int main(int argc, char **argv) { if (!module.ok()) return kTrFailure; mlir::PassManager pm(&context); + applyPassManagerCLOptions(pm); // Set the quantization specifications from the command line flags. 
mlir::TFL::QuantizationSpecs quant_specs; diff --git a/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc b/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc index 0c82a71f952..62f64ab63b4 100644 --- a/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc +++ b/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc @@ -92,13 +92,15 @@ StatusOr LoadFromGraphdefOrMlirSource( file->getBuffer(), debug_info_file, input_arrays, input_dtypes, input_shapes, output_arrays, /*control_output_arrays=*/"", prune_unused_nodes, /*convert_legacy_fed_inputs=*/true, - /*graph_as_function=*/false, /*upgrade_legacy=*/true, context); + /*graph_as_function=*/false, /*upgrade_legacy=*/true, + /*enable_shape_inference=*/false, context); } return tensorflow::GraphdefToMlirTranslateFunction( file->getBuffer(), debug_info_file, input_arrays, input_dtypes, input_shapes, output_arrays, /*control_output_arrays=*/"", prune_unused_nodes, /*convert_legacy_fed_inputs=*/true, - /*graph_as_function=*/false, /*upgrade_legacy=*/true, context); + /*graph_as_function=*/false, /*upgrade_legacy=*/true, + /*enable_shape_inference=*/false, context); } Status ConvertTFExecutorToTFLOrFlatbuffer( @@ -172,7 +174,7 @@ StatusOr ImportSavedModel( return module; } else if (saved_model_version == 1) { auto module = tensorflow::SavedModelSignatureDefsToMlirImport( - input_filename, tags, context); + input_filename, tags, exported_names, context); if (!module) return tensorflow::errors::InvalidArgument("fail to open input file"); diff --git a/tensorflow/compiler/mlir/lite/transforms/default_quant_params.cc b/tensorflow/compiler/mlir/lite/transforms/default_quant_params.cc index 0319e8555fa..c23ae9fcfab 100644 --- a/tensorflow/compiler/mlir/lite/transforms/default_quant_params.cc +++ b/tensorflow/compiler/mlir/lite/transforms/default_quant_params.cc @@ -20,7 +20,6 @@ limitations under the License. #include "mlir/IR/PatternMatch.h" #include "mlir/IR/StandardTypes.h" #include "mlir/Pass/Pass.h" -#include "mlir/Support/Functional.h" #include "mlir/Support/LLVM.h" #include "absl/memory/memory.h" #include "llvm/ADT/STLExtras.h" @@ -47,8 +46,11 @@ namespace { class DefaultQuantParamsPass : public PassWrapper { public: - explicit DefaultQuantParamsPass(double default_min, double default_max) - : default_min_(default_min), default_max_(default_max) {} + explicit DefaultQuantParamsPass(double default_min, double default_max, + bool is_signed) + : default_min_(default_min), + default_max_(default_max), + is_signed_(is_signed) {} void runOnFunction() override; @@ -83,6 +85,7 @@ class DefaultQuantParamsPass double default_min_; double default_max_; + bool is_signed_; quant::QuantParams default_quant_params_; }; } // namespace @@ -215,15 +218,16 @@ quant::QuantParams DefaultQuantParamsPass::GetDefaultQuantParams( default_quant_params_ = quant::fakeQuantAttrsToType( builder.getUnknownLoc(), /*numBits=*/8, default_min_, default_max_, /*narrowRange=*/false, - builder.getF32Type()); + builder.getF32Type(), is_signed_); } return default_quant_params_; } // Creates an instance of the default quant parameters pass. 
std::unique_ptr> CreateDefaultQuantParamsPass( - double default_min, double default_max) { - return absl::make_unique(default_min, default_max); + double default_min, double default_max, bool is_signed) { + return absl::make_unique(default_min, default_max, + is_signed); } // Registers this pass with default values, only for test @@ -231,7 +235,8 @@ static PassRegistration pass( "tfl-default-quant", "Apply quantization with default quantization parameter", [] { return CreateDefaultQuantParamsPass(/*default_min=*/-1.0, - /*default_max=*/1.0); + /*default_max=*/1.0, + /*is_signed=*/false); }); } // namespace TFL diff --git a/tensorflow/compiler/mlir/lite/transforms/dense_to_sparse.cc b/tensorflow/compiler/mlir/lite/transforms/dense_to_sparse.cc index 4c3a95dc2a4..9b526f40277 100644 --- a/tensorflow/compiler/mlir/lite/transforms/dense_to_sparse.cc +++ b/tensorflow/compiler/mlir/lite/transforms/dense_to_sparse.cc @@ -16,10 +16,13 @@ limitations under the License. // This transformation pass convert dense tensor to sparse format. #include "absl/memory/memory.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/lite/tools/optimize/sparsity/format_converter.h" //===----------------------------------------------------------------------===// // The DenseToSparse Pass. @@ -28,7 +31,226 @@ namespace mlir { namespace TFL { namespace { +// If sparsity level is below this threadshold, keep the tensor in dense format. +const float kMinSparsityLevel = 0.3; +// Heuristic to check if a block configuration is correct. 
+const float kBlockOverRandomSparsityRatio = 0.9; +void PopulateEncodingParams(const std::vector& block_size, + std::vector* traversal_order, + std::vector* format, + std::vector* b_map, std::vector* b_size) { + *traversal_order = {0, 1}; + *format = {kTfLiteDimDense, kTfLiteDimSparseCSR}; + *b_map = {}; + *b_size = {}; + int block_rank = 0; + for (int i = 0; i < 2; i++) { + if (block_size[i] != 1) { + traversal_order->push_back(block_rank + 2); + format->push_back(kTfLiteDimDense); + block_rank++; + b_map->push_back(i); + b_size->push_back(block_size[i]); + } + } +} + +float CalculateRandomSparsity(const ElementsAttr& attr, + const ShapedType& type) { + int num_elements = 1; + for (int i = 0; i < 2; i++) { + num_elements *= type.getDimSize(i); + } + int num_zeros = 0; + + if (type.getElementType().isF32()) { + std::vector data; + data.reserve(type.getNumElements()); + for (const auto val : attr.getValues()) data.push_back(val); + for (int i = 0; i < data.size(); i++) { + if (data[i] == 0) { + num_zeros++; + } + } + } else if (type.getElementType().isa()) { + std::vector data; + data.reserve(type.getNumElements()); + for (const auto val : attr.getValues()) data.push_back(val); + for (int i = 0; i < data.size(); i++) { + if (data[i] == 0) { + num_zeros++; + } + } + } + + return 1.0 * num_zeros / num_elements; +} + +float CalculateBlockSparsity(const ElementsAttr& attr, const ShapedType& type, + const std::vector& block_size) { + float sparsity = 0; + std::vector shape(2); + shape[0] = type.getDimSize(0); + shape[1] = type.getDimSize(1); + + std::vector traversal_order = {}; + std::vector format = {}; + std::vector b_size = {}; + std::vector b_map = {}; + PopulateEncodingParams(block_size, &traversal_order, &format, &b_map, + &b_size); + + if (type.getElementType().isF32()) { + tflite::optimize::sparsity::FormatConverter format_converter( + shape, traversal_order, format, b_size, b_map); + std::vector data; + data.reserve(type.getNumElements()); + for (const auto val : attr.getValues()) data.push_back(val); + format_converter.DenseToSparse(data.data()); + sparsity = + 1 - 1.0 * format_converter.GetData().size() / type.getNumElements(); + } else if (type.getElementType().isa()) { + tflite::optimize::sparsity::FormatConverter format_converter( + shape, traversal_order, format, b_size, b_map); + std::vector data; + data.reserve(type.getNumElements()); + for (const auto val : attr.getValues()) data.push_back(val); + format_converter.DenseToSparse(data.data()); + sparsity = + 1 - 1.0 * format_converter.GetData().size() / type.getNumElements(); + } + + return sparsity; +} + +typedef struct InspectResult { + // Whether the weight tensor is sparse enough to be compressed. + bool can_compress; + // If the weight tensor cannot be encoded in a block configuration that the op + // supports, a Densify() op will be inserted afterwards to fall back to dense + // execution. + bool needs_densify; + // Among the supported block configs of an op, which got selected to encode + // the sparse weight. + std::vector selected_block_size; +} InspectResult; + +InspectResult InspectWeight( + Operation* inst, + const std::vector>& supported_block_size) { + ElementsAttr attr; + ShapedType type; + InspectResult result = {}; + if (auto cst = dyn_cast(inst)) { + attr = cst.value(); + type = cst.getType().cast(); + } else if (auto cst = dyn_cast(inst)) { + attr = cst.value(); + type = cst.getType().cast(); + } + + // TODO(b/147449640): Add ability to encode weights more than 2-D, e.g. Conv + // weights. 
+ if (type.getRank() != 2) { + result.can_compress = false; + return result; + } + + float random_sparsity = CalculateRandomSparsity(attr, type); + if (random_sparsity < kMinSparsityLevel) { + result.can_compress = false; + return result; + } + + result.can_compress = true; + + float curr_sparsity = 0; + std::vector selected_block_size; + result.needs_densify = true; + for (const auto& block_size : supported_block_size) { + curr_sparsity = CalculateBlockSparsity(attr, type, block_size); + if (curr_sparsity / random_sparsity > kBlockOverRandomSparsityRatio) { + selected_block_size = block_size; + result.can_compress = true; + result.needs_densify = false; + result.selected_block_size = selected_block_size; + break; + } + } + + return result; +} + +template +std::vector BuildSparsityParameterAttribute( + const std::vector& block_size, Operation* inst, OpBuilder* builder, + SparsityParameterAttr* s_param) { + ElementsAttr attr; + ShapedType type; + if (auto cst = dyn_cast(inst)) { + attr = cst.value(); + type = cst.getType().cast(); + } else if (auto cst = dyn_cast(inst)) { + attr = cst.value(); + type = cst.getType().cast(); + } + std::vector shape(2); + shape[0] = type.getDimSize(0); + shape[1] = type.getDimSize(1); + + std::vector traversal_order = {}; + std::vector format = {}; + std::vector b_size = {}; + std::vector b_map = {}; + PopulateEncodingParams(block_size, &traversal_order, &format, &b_map, + &b_size); + + tflite::optimize::sparsity::FormatConverter format_converter( + shape, traversal_order, format, b_size, b_map); + std::vector data; + data.reserve(type.getNumElements()); + for (const auto val : attr.getValues()) data.push_back(val); + format_converter.DenseToSparse(data.data()); + auto metadata = format_converter.GetDimMetadata(); + auto compressed_data = format_converter.GetData(); + const int dim_size = metadata.size() / 2; + std::vector dim_metadata(traversal_order.size()); + for (int i = 0; i < dim_size; i++) { + if (format[i] == kTfLiteDimDense) { + dim_metadata[i] = DimensionMetadataAttr::get( + builder->getStringAttr("DENSE"), + builder->getI32IntegerAttr(metadata[2 * i][0]), + builder->getArrayAttr({}), builder->getArrayAttr({}), + builder->getContext()); + } else { + dim_metadata[i] = DimensionMetadataAttr::get( + builder->getStringAttr("SPARSE_CSR"), builder->getI32IntegerAttr(0), + builder->getI32ArrayAttr(metadata[2 * i]), + builder->getI32ArrayAttr(metadata[2 * i + 1]), builder->getContext()); + } + } + *s_param = SparsityParameterAttr::get( + builder->getI32ArrayAttr(traversal_order), + builder->getI32ArrayAttr(b_map), builder->getArrayAttr(dim_metadata), + builder->getContext()); + + return compressed_data; +} + +// This pass encodes sparse weights in the model in the proper format, and adds +// Densify() op if necessary. The general algorithm is: +// 1. Get list of operands (weights) of an op that can be sparse. +// 2. Get list of supported block configurations of the op. +// 3. Calculate random sparsity of the weight. +// 3.1. If sparsity level is below the encoding threshold, keep in dense. +// 3.2. If sparsity level is above the encoding threshold, go to 4. +// 4. Try to encode the weight with supported block configurations. If the +// weight was pruned with the same block config, the blocked sparsity level +// should match the random sparsity. +// 4.1. Return the matching block config if found. +// 4.2. If no matching block config is found, encode the weight with random +// sparsity, and add Densify() op to fall back to dense execution. 
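A quick numeric sketch of the heuristic described in the comment above (editorial illustration only; the 4x4 weight below is hypothetical and not part of the patch):

// Example: a 4x4 float weight that was pruned with a 2x2 block pattern.
//
//     1 2 0 0
//     3 4 0 0
//     0 0 0 0
//     0 0 0 0
//
// Random sparsity = 12 zeros / 16 elements = 0.75, which is >= kMinSparsityLevel (0.3),
// so the tensor is a candidate for compression.
// With block_size = {2, 2}, three of the four 2x2 blocks are all-zero, so the blocked
// encoding keeps only the 4 values of the non-zero block:
//   block sparsity = 1 - 4/16 = 0.75, and 0.75 / 0.75 = 1.0 > kBlockOverRandomSparsityRatio (0.9),
// so {2, 2} is selected and no Densify() op is needed.
// If instead the same 12 zeros were scattered so that every 2x2 block contained a
// non-zero value, the blocked encoding would keep all 16 values (block sparsity 0,
// ratio 0 < 0.9); the weight would then be encoded with block_size {1, 1} (random
// sparsity) and a Densify() op would be added to fall back to dense execution.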
struct DenseToSparse : public PassWrapper { void runOnFunction() override; }; @@ -39,20 +261,68 @@ void DenseToSparse::runOnFunction() { func.walk([&](SparseOpInterface sparse_op) { const auto& sparse_operands = sparse_op.GetSparseOperands(); + std::vector> supported_block_size; for (const int operand : sparse_operands) { auto* op = sparse_op.getOperation(); const auto& value = op->getOperand(operand); - builder.setInsertionPoint(op); - if (auto* inst = value.getDefiningOp()) { - // Replace defining op with SparseConst or SparseQConst. - // TODO(yunluli): Implement. + + auto* inst = value.getDefiningOp(); + if (!inst) { + continue; } - // TODO(yunluli): Implement. - bool needs_densify = false; + if (isa(inst)) { + supported_block_size = sparse_op.GetFloatBlockSize(); + } else if (isa(inst)) { + supported_block_size = sparse_op.GetQuantizedBlockSize(); + } else { + continue; + } - if (needs_densify) { - auto densify = builder.create(op->getLoc(), value); + InspectResult result = InspectWeight(inst, supported_block_size); + if (!result.can_compress) { + continue; + } + + // The weight is not block sparse. Encode with random sparsity. + if (result.selected_block_size.empty()) { + result.selected_block_size = {1, 1}; + } + + builder.setInsertionPoint(op); + SparsityParameterAttr s_param; + if (auto cst = dyn_cast(inst)) { + std::vector compressed_data = + BuildSparsityParameterAttribute(result.selected_block_size, + inst, &builder, &s_param); + auto compressed_data_type = RankedTensorType::get( + {static_cast(compressed_data.size())}, + builder.getF32Type()); + auto new_value = DenseElementsAttr::get(compressed_data_type, + compressed_data); + auto s_const = builder.create(op->getLoc(), cst.value(), + s_param, new_value); + value.replaceAllUsesWith(s_const.getResult()); + cst.erase(); + } else if (auto cst = dyn_cast(inst)) { + std::vector compressed_data = + BuildSparsityParameterAttribute(result.selected_block_size, + inst, &builder, &s_param); + auto compressed_data_type = RankedTensorType::get( + {static_cast(compressed_data.size())}, + builder.getIntegerType(8, true)); + auto new_value = DenseElementsAttr::get(compressed_data_type, + compressed_data); + auto s_qconst = builder.create( + op->getLoc(), cst.qtypeAttr(), cst.value(), s_param, new_value); + value.replaceAllUsesWith(s_qconst.getResult()); + cst.erase(); + } + + if (result.needs_densify) { + const auto value = op->getOperand(operand); + auto densify = + builder.create(op->getLoc(), value.getType(), value); value.replaceAllUsesWith(densify); densify.setOperand(value); } diff --git a/tensorflow/compiler/mlir/lite/transforms/extract_ophint.cc b/tensorflow/compiler/mlir/lite/transforms/extract_ophint.cc deleted file mode 100644 index 1d50c4dc29b..00000000000 --- a/tensorflow/compiler/mlir/lite/transforms/extract_ophint.cc +++ /dev/null @@ -1,764 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ -#include -#include -#include - -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/ADT/StringSwitch.h" -#include "llvm/Support/Casting.h" -#include "mlir/Analysis/LoopAnalysis.h" // from @llvm-project -#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project -#include "mlir/IR/Attributes.h" // from @llvm-project -#include "mlir/IR/Block.h" // from @llvm-project -#include "mlir/IR/Builders.h" // from @llvm-project -#include "mlir/IR/Function.h" // from @llvm-project -#include "mlir/IR/MLIRContext.h" // from @llvm-project -#include "mlir/IR/Module.h" // from @llvm-project -#include "mlir/IR/Operation.h" // from @llvm-project -#include "mlir/IR/OperationSupport.h" // from @llvm-project -#include "mlir/IR/PatternMatch.h" // from @llvm-project -#include "mlir/IR/StandardTypes.h" // from @llvm-project -#include "mlir/IR/SymbolTable.h" // from @llvm-project -#include "mlir/IR/Types.h" // from @llvm-project -#include "mlir/IR/Value.h" // from @llvm-project -#include "mlir/Pass/Pass.h" // from @llvm-project -#include "mlir/Pass/PassRegistry.h" // from @llvm-project -#include "mlir/Support/Functional.h" // from @llvm-project -#include "mlir/Support/LLVM.h" // from @llvm-project -#include "mlir/Support/LogicalResult.h" // from @llvm-project -#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" -#include "tensorflow/compiler/mlir/lite/transforms/passes.h" -#include "tensorflow/compiler/mlir/lite/utils/attribute_utils.h" -#include "tensorflow/compiler/mlir/lite/utils/validators.h" -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" -#include "tensorflow/core/platform/logging.h" - -namespace mlir { -namespace TFL { -namespace { - -constexpr char kTfLiteFunctionName[] = "_tflite_function_name"; -constexpr char kTfLiteFunctionUUID[] = "_tflite_function_uuid"; -constexpr char kTfLiteFunctionInputIndex[] = "_tflite_function_input_index"; -constexpr char kTfLiteFunctionOutputIndex[] = "_tflite_function_output_index"; -constexpr char kTfLiteFunctionSortIndex[] = "_tflite_function_sort_index"; -constexpr char kTfLiteFunctionAggregate[] = "_tflite_function_aggregate"; - -constexpr char kStrategyNone[] = "None"; -constexpr char kStrategyStack[] = "stack"; -constexpr char kStrategyFirst[] = "first"; -constexpr char kStrategyLast[] = "last"; - -// A Ophinted op typically looks like below" -// -// InputOp1 InputOp2 InputOp3 -// / \ | | -// val1 val2 val3 val4 -// | | | | -// identOp1 identOp2 identOp3 identOp4 -// \ | | / -// \ | | / -// .... a bunch of operations (needs to be fused) ... -// / \ -// / \ -// identOp1 (output) identOp2 (output) -// | | -// Other ops Other ops -// -// -// In this pass, we are trying to convert them into the following format: -// -// || -// || -// \ / -// -// InputOp1 InputOp2 InputOp3 -// / \ | / -// val1 val2 val3 val4 -// \ | | / -// PackOp | / -// \ | | / -// \ | | / -// Call funcOp (fusedOp - name like 'UnidirectionalSequenceRNN') -// (The funcOp will be inserted at the bottom of the module, also -// . note every funcOp will be unique.) -// | -// UnpackOp -// / \ -// / \ -// Other ops Other ops -struct OphintCompositeOp { - // OphintCompositeOp is a conceptually "composite op" which will be converted - // to a "fused op" later. 
- // - // As a "composite op", it has "inputs" and "outputs", and all the inputs - // and outputs are annotated by special-annotated identity ops. - // - // All inputs and outputs need to be processed based on different strategies, - // See all the different strategies under - // tensorflow/lite/python/op_hint.py - // - // For example, "stack" strategy means we need to pack the inputs together - // or unpack the outputs. - public: - OphintCompositeOp(StringRef uuid, StringRef function_name) - : uuid(uuid), function_name(function_name) {} - - void AddInput(int index, Operation* op, StringRef aggregation, - int sort_index) { - auto it = inputs.find(index); - if (it == inputs.end()) { - AggregatedOperand operand; - operand.aggregation = aggregation; - it = inputs.insert({index, operand}).first; - } - // TODO(renjieliu): check aggregation strategy stays the same. - // Also needs to make sure if aggregation strategy is "None" we should not - // have more than one op. - it->second.ops[sort_index] = op; - } - - void AddOutput(int index, Operation* op, llvm::StringRef aggregation, - int sort_index) { - auto it = outputs.find(index); - if (it == outputs.end()) { - AggregatedOperand operand; - operand.aggregation = aggregation; - it = outputs.insert({index, operand}).first; - } - // TODO(renjieliu): check aggregation strategy stays the same. - // Also needs to make sure if aggregation strategy is "None" we should not - // have more than one op. - it->second.ops[sort_index] = op; - } - - std::vector GetAllInputOps() { - std::vector all_input_ops; - for (const auto& kv : inputs) { - if (kv.second.aggregation == kStrategyFirst) { - all_input_ops.push_back(kv.second.ops.at(0)); - continue; - } - for (const auto& operandKv : kv.second.ops) { - all_input_ops.push_back(operandKv.second); - } - } - return all_input_ops; - } - - std::vector GetAllOutputOps() { - std::vector all_output_ops; - for (const auto& kv : outputs) { - for (const auto& operand_kv : kv.second.ops) { - all_output_ops.push_back(operand_kv.second); - } - } - return all_output_ops; - } - - std::vector GetAllInUseOutputOps() { - std::vector all_output_ops; - for (const auto& kv : outputs) { - auto& aggregated_operand = kv.second; - if (aggregated_operand.aggregation != kStrategyStack) { - continue; - } - for (const auto& operand_kv : aggregated_operand.ops) { - all_output_ops.push_back(operand_kv.second); - } - } - return all_output_ops; - } - - // This function will process the aggregated inputs based on different - // strategies like "first", "last", "stack". - std::map GetAggregatedInputs(OpBuilder* builder) { - std::map aggregated_inputs; - for (const auto& kv : inputs) { - Value op_input = nullptr; - const AggregatedOperand& operand = kv.second; - // Dealing with "stack" strategy: - // This breaks into two parts: - // 1) If the ops only has one element, we only add a reshape op to expand - // the dim. - // 2) If the ops contain more than one element, we need to append a - // pack_op after the input ops. - if (operand.aggregation == kStrategyStack) { - if (operand.ops.size() == 1) { - // If ops size is 1, it will be simply expanding dimensions at dim 0. 
- Operation* current_identity_op = operand.ops.begin()->second; - Value input = current_identity_op->getOperand(0); - RankedTensorType input_type = - input.getType().cast(); - // The Reshape will be {1, (original_shape)} - SmallVector reshape_op_shape; - reshape_op_shape.push_back(1); - for (const auto& dim : input_type.getShape()) { - reshape_op_shape.push_back(dim); - } - - Operation* first_use = current_identity_op->getNextNode(); - builder->setInsertionPoint(first_use); - Location loc = first_use->getLoc(); - auto shape_type = RankedTensorType::get({input_type.getRank() + 1}, - builder->getIntegerType(32)); - SmallVector result_shape_data(reshape_op_shape.size()); - for (int i = 0; i < reshape_op_shape.size(); ++i) { - result_shape_data[i] = builder->getI32IntegerAttr( - static_cast(reshape_op_shape[i])); - } - auto shape_attr = - DenseElementsAttr::get(shape_type, result_shape_data); - auto shape = builder->create(loc, shape_type, shape_attr); - auto reshape_output_type = RankedTensorType::get( - reshape_op_shape, input_type.getElementType()); - Operation* reshape = builder->create( - first_use->getLoc(), reshape_output_type, input, shape); - op_input = reshape->getResult(0); - - } else { - // Insert a pack op to pack all the inputs together. - std::vector pack_input_operands; - std::vector packed_input_consumers; - for (int i = 0, e = operand.ops.size(); i < e; ++i) { - pack_input_operands.push_back(operand.ops.at(i)->getOperand(0)); - packed_input_consumers.push_back(operand.ops.at(i)->getResult(0)); - } - // Find the first op that consumes the last value of the aggregated - // inputs. - Operation* first_use = *(packed_input_consumers.back().user_begin()); - // The pack reshape will be {N, (original_shape)} - SmallVector pack_shape; - pack_shape.push_back(pack_input_operands.size()); - RankedTensorType type = operand.ops.at(0) - ->getResult(0) - .getType() - .cast(); - for (const auto& dim : type.getShape()) { - pack_shape.push_back(dim); - } - auto pack_input_type = - RankedTensorType::get(pack_shape, type.getElementType()); - builder->setInsertionPoint(first_use); - Operation* pack_op = builder->create( - first_use->getLoc(), pack_input_type, pack_input_operands, - builder->getI32IntegerAttr(pack_input_operands.size()), - builder->getI32IntegerAttr(0)); - op_input = pack_op->getResult(0); - } - } else if (operand.aggregation == kStrategyLast) { - // This handle the strategy "last", if simply takes the last input. - op_input = operand.ops.at(operand.ops.size() - 1)->getOperand(0); - } else { - // This handle the strategy "first" and default, if simply takes the - // first input. - op_input = operand.ops.at(0)->getOperand(0); - } - aggregated_inputs[kv.first] = op_input; - } - return aggregated_inputs; - } - - // For now, we just return the first output's location which the fused op will - // be inserted in. - Operation* GetFirstOutputOp() { return outputs.begin()->second.ops.at(0); } - - // Since we have different aggregation strategies, e.g., "first", "last", - // "stack". We don't somehow aggregated to get the outputs for the funcOp. 
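// A minimal standalone sketch, not part of this diff: the "stack" handling in
// GetAggregatedInputs above always yields the original shape with one extra
// leading dimension -- 1 when a single input is reshaped, N when N inputs are
// packed. The same shape computation with plain std::vector shapes instead of
// MLIR types (all names below are illustrative):
#include <cstdint>
#include <vector>

std::vector<int64_t> StackedShape(const std::vector<int64_t>& original_shape,
                                  int64_t leading_dim) {
  std::vector<int64_t> shape;
  shape.reserve(original_shape.size() + 1);
  shape.push_back(leading_dim);  // 1 for the reshape case, N for the pack case.
  shape.insert(shape.end(), original_shape.begin(), original_shape.end());
  return shape;
}
// Example: StackedShape({4, 8}, 3) == {3, 4, 8}, matching the
// "{N, (original_shape)}" comment in the pack branch.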
- // This function is simply compute the RankedTensorType (shape & element type) - std::map GetAggregatedOutputTypes(OpBuilder* builder) { - std::map aggregated_output_types; - for (const auto& kv : outputs) { - const AggregatedOperand& operand = kv.second; - if (operand.aggregation == kStrategyStack) { - const int output_numer = operand.ops.size(); - Value first_output = operand.ops.at(0)->getOperand(0); - RankedTensorType first_output_type = - first_output.getType().cast(); - // The aggregated output shape will be {N, original_shape}. - SmallVector shape; - shape.push_back(output_numer); - for (const auto& dim : first_output_type.getShape()) { - shape.push_back(dim); - } - aggregated_output_types[kv.first] = - RankedTensorType::get(shape, first_output_type.getElementType()); - } else if (operand.aggregation == kStrategyLast) { - Value last_output = - operand.ops.at(operand.ops.size() - 1)->getOperand(0); - aggregated_output_types[kv.first] = last_output.getType(); - } else { - Value first_output = operand.ops.at(0)->getOperand(0); - aggregated_output_types[kv.first] = first_output.getType(); - } - } - return aggregated_output_types; - } - - void AggregateAndRewireOutputs(OpBuilder* builder, Operation* fused_op) { - // TODO(renjieliu): Consider get rid of the ophinted identity nodes here - // as well or just rely on the general path to get rid of the identity - // nodes. - int output_index = 0; - for (const auto& kv : outputs) { - const AggregatedOperand& operand = kv.second; - // This handles the "stack" strategy. It push a unpack_op before all the - // outputs and make all the outputs point to the unpack_op. - if (operand.aggregation == kStrategyStack) { - // TODO(renjieliu): Revisit here if we need to handle - // operand.ops().size() == 1 case. Insert a unpack op to unpack the - // outputs. - const int output_number = operand.ops.size(); - // Find the first output. - Operation* first_output = operand.ops.at(0); - Location insert_loc = first_output->getLoc(); - SmallVector unpack_output_types( - output_number, first_output->getOperand(0).getType()); - - builder->setInsertionPoint(first_output); - Operation* unpack_op = builder->create( - insert_loc, unpack_output_types, fused_op->getResult(output_index), - builder->getI32IntegerAttr(output_number), - builder->getI32IntegerAttr(0)); - // For every unpack output, make sure they point to the right ones. - for (int i = 0; i < output_number; ++i) { - Operation* to_be_replaced_op = operand.ops.at(i); - to_be_replaced_op->replaceUsesOfWith(to_be_replaced_op->getOperand(0), - unpack_op->getResult(i)); - } - } else if (operand.aggregation == kStrategyLast) { - // This handles the strategy "last", it simply takes the last output. - Operation* op = operand.ops.at(operand.ops.size() - 1); - op->replaceUsesOfWith(op->getOperand(0), - fused_op->getResult(output_index)); - } else { - // This handles the strategy "first" and default, it simply takes the - // first output. - Operation* op = operand.ops.at(0); - op->replaceUsesOfWith(op->getOperand(0), - fused_op->getResult(output_index)); - } - - output_index++; - } - } - - LogicalResult VerifyOphint() const { - if (inputs.empty() || outputs.empty()) return failure(); - return success(); - } - - StringRef uuid; - StringRef function_name; - - private: - // The AggregatedOperand is used to hold one "aggregated operand". 
- // For example, this can be - // { - // aggregation = "stack", - // {0: ident_op1, 1: ident_op2, 2: ident_op3} - // } - struct AggregatedOperand { - StringRef aggregation; - std::map ops; - }; - - std::map inputs; - std::map outputs; -}; - -// Preprocess the graph for topo sort. (each operation is a node, while -// inputs/outputs indicate edges) Assume the graph is acyclic. The preprocess -// does the following: -// Compute each operations's in-degress (how many input nodes they're taken) -// Get all consumer operations for every operations. (operation_to_outputs) -// Get the init_queue (those operations will be processed first). -void PreprocessTopoSortGraph( - Block* block, std::queue* init_queue, - llvm::DenseMap>* - operation_to_outputs, - llvm::DenseMap* operation_to_in_degrees) { - for (auto& op : *block) { - if (&op == block->getTerminator()) continue; - if (op.getNumOperands() == 0) { - init_queue->push(&op); - } else { - // The operand of the ops is not a direct indication of the "edge" as we - // can have a pack op after a unpack op (they have multiple edges), we - // should only count as one. - llvm::DenseSet input_ops; - for (int i = 0; i < op.getNumOperands(); ++i) { - Operation* input_op = op.getOperand(i).getDefiningOp(); - if (input_op) input_ops.insert(input_op); - } - if (input_ops.empty()) { - init_queue->push(&op); - continue; - } - operation_to_in_degrees->try_emplace(&op, input_ops.size()); - for (auto* input_op : input_ops) { - auto preceding_op_it = operation_to_outputs->find(input_op); - if (preceding_op_it == operation_to_outputs->end()) { - auto result = operation_to_outputs->try_emplace( - input_op, llvm::DenseSet()); - preceding_op_it = result.first; - } - preceding_op_it->second.insert(&op); - } - } - } -} - -bool IsSideEffectOp(Operation* op) { - // TODO(riverriddle) Properly handle region side effects. - if (MemoryEffectOpInterface::hasNoEffect(op) && op->getNumRegions() == 0) - return false; - - // Identity op has no side effect. - // Check the OperationName maybe more elegant here. - auto tf_identity_op = dyn_cast_or_null(op); - if (tf_identity_op) return false; - return true; -} - -// It's possible other transformations can benefit from this util function, but -// since currently there's none, so we only limit this function to the ophint -// extraction pass. We may refactor this function to extend the usage in future. -// -// Assume the graph is disconnected from outside. -// Also assume the block has no arguments. -LogicalResult TopoSortOperations(OpBuilder* builder) { - std::queue init_queue; - llvm::DenseMap> operation_to_outputs; - llvm::DenseMap operation_to_in_degrees; - std::vector sorted_ops; - - PreprocessTopoSortGraph(builder->getBlock(), &init_queue, - &operation_to_outputs, &operation_to_in_degrees); - while (!init_queue.empty()) { - Operation* current_op = init_queue.front(); - init_queue.pop(); - sorted_ops.push_back(current_op); - - auto current_op_to_output_it = operation_to_outputs.find(current_op); - if (current_op_to_output_it == operation_to_outputs.end()) { - continue; - } - for (Operation* output_op : current_op_to_output_it->second) { - auto output_op_it = operation_to_in_degrees.find(output_op); - if (output_op_it == operation_to_in_degrees.end()) return failure(); - - output_op_it->second -= 1; - if (output_op_it->second == 0) { - init_queue.push(output_op); - operation_to_in_degrees.erase(output_op_it); - } - } - operation_to_outputs.erase(current_op_to_output_it); - } - - // Before we performs the sort. 
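// A minimal standalone sketch, not part of this diff: TopoSortOperations above
// is a queue-based (Kahn-style) topological sort over the def-use edges that
// PreprocessTopoSortGraph collects. The same algorithm on a plain integer
// graph, where `consumers` maps a node to the nodes that depend on it (all
// names below are illustrative):
#include <map>
#include <queue>
#include <set>
#include <vector>

std::vector<int> TopoSort(const std::vector<int>& nodes,
                          const std::map<int, std::set<int>>& consumers) {
  std::map<int, int> in_degree;
  for (int node : nodes) in_degree[node] = 0;
  for (const auto& kv : consumers)
    for (int consumer : kv.second) ++in_degree[consumer];

  std::queue<int> ready;  // Nodes whose inputs have all been processed.
  for (const auto& kv : in_degree)
    if (kv.second == 0) ready.push(kv.first);

  std::vector<int> sorted;
  while (!ready.empty()) {
    int node = ready.front();
    ready.pop();
    sorted.push_back(node);
    auto it = consumers.find(node);
    if (it == consumers.end()) continue;
    for (int consumer : it->second)
      if (--in_degree[consumer] == 0) ready.push(consumer);
  }
  // sorted.size() < nodes.size() would indicate a cycle, which corresponds to
  // the failure() path in TopoSortOperations.
  return sorted;
}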
We need to make sure we didn't mess the - // ordering of original side-effect operations. - // It's possible those side-effect operations have no topological relations - // at all! - std::vector original_side_effect_ops; - std::vector after_sort_side_effect_ops; - for (auto& op : *builder->getBlock()) { - if (IsSideEffectOp(&op) && (&op != builder->getBlock()->getTerminator())) - original_side_effect_ops.push_back(&op); - } - for (auto* op : sorted_ops) { - if (IsSideEffectOp(op)) after_sort_side_effect_ops.push_back(op); - } - if (original_side_effect_ops.size() != after_sort_side_effect_ops.size()) - return failure(); - for (int i = 0; i < original_side_effect_ops.size(); ++i) { - if (original_side_effect_ops[i] != after_sort_side_effect_ops[i]) - return failure(); - } - - // Performs the sort. - // Ideally it would be nice to just clear the block then write the sorted ops. - // But unfortunately that's hard to do. - for (int i = sorted_ops.size() - 1; i > 0; --i) { - Operation* current_op = sorted_ops[i]; - for (int j = i - 1; j >= 0; --j) { - Operation* prev_op = sorted_ops[j]; - prev_op->moveBefore(current_op); - } - } - - return success(); -} - -Operation* BuildFusedFuncOp(StringRef func_name, StringRef fused_func_type, - Operation* insert_before_op, - const std::map& inputs, - const std::map& output_types, - OpBuilder* builder, ModuleOp* module_op) { - SmallVector input_types; - SmallVector input_values; - SmallVector input_indexes; - for (const auto& kv : inputs) { - Value input = kv.second; - input_types.push_back(input.getType()); - input_values.push_back(input); - input_indexes.push_back(kv.first); - } - - SmallVector func_output_types; - for (const auto& kv : output_types) { - func_output_types.push_back(kv.second); - } - - FunctionType function_type = - builder->getFunctionType(/*inputs=*/input_types, - /*results=*/func_output_types); - - SmallVector attrs; - attrs.push_back(builder->getNamedAttr( - kTfLiteFunctionName, builder->getStringAttr(fused_func_type))); - attrs.push_back(builder->getNamedAttr( - kTfLiteFunctionInputIndex, builder->getI32ArrayAttr(input_indexes))); - FuncOp func_op = FuncOp::create(insert_before_op->getLoc(), func_name, - function_type, llvm::makeArrayRef(attrs)); - module_op->push_back(func_op); - builder->setInsertionPoint(insert_before_op); - return builder->create(insert_before_op->getLoc(), func_op, - input_values); -} - -llvm::StringMap FindAllOphintNodes(Block* bb) { - llvm::StringMap ophint_composite_ops; - for (auto& op : *bb) { - auto nameAttr = op.getAttrOfType(kTfLiteFunctionName); - if (!nameAttr) continue; - StringRef function_name = nameAttr.getValue(); - auto uuidAttr = op.getAttrOfType(kTfLiteFunctionUUID); - if (!uuidAttr) continue; - StringRef uuid = uuidAttr.getValue(); - auto it = ophint_composite_ops.find(uuid); - if (it == ophint_composite_ops.end()) { - OphintCompositeOp ophint_composite_op(uuid, function_name); - it = ophint_composite_ops.insert({uuid, ophint_composite_op}).first; - } - - // The default aggregation strategy is "NONE". - StringRef aggregation = kStrategyNone; - auto aggregationAttr = - op.getAttrOfType(kTfLiteFunctionAggregate); - if (aggregationAttr != nullptr) aggregation = aggregationAttr.getValue(); - - // The default sort index is 0. 
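// A minimal standalone sketch, not part of this diff: the check at the end of
// TopoSortOperations above filters both the original and the sorted op lists
// down to the side-effecting operations and requires the two filtered
// sequences to be identical, i.e. the sort must not change their relative
// order. The same check over plain vectors, with a caller-supplied predicate
// standing in for IsSideEffectOp (illustrative names):
#include <vector>

template <typename T, typename Pred>
bool SameRelativeOrder(const std::vector<T>& before,
                       const std::vector<T>& after, Pred is_tracked) {
  std::vector<T> filtered_before, filtered_after;
  for (const T& v : before)
    if (is_tracked(v)) filtered_before.push_back(v);
  for (const T& v : after)
    if (is_tracked(v)) filtered_after.push_back(v);
  return filtered_before == filtered_after;  // Same elements, same order.
}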
- int sortIndex = 0; - auto sortIndexAttr = - op.getAttrOfType(kTfLiteFunctionSortIndex); - if (sortIndexAttr != nullptr) sortIndex = sortIndexAttr.getInt(); - - auto inputIndexAttr = - op.getAttrOfType(kTfLiteFunctionInputIndex); - if (inputIndexAttr != nullptr) { - it->second.AddInput(inputIndexAttr.getInt(), &op, aggregation, sortIndex); - } else { - auto outputIndexAttr = - op.getAttrOfType(kTfLiteFunctionOutputIndex); - it->second.AddOutput(outputIndexAttr.getInt(), &op, aggregation, - sortIndex); - } - } - - return ophint_composite_ops; -} - -llvm::DenseSet BfsForReachableOps(ArrayRef input_ops) { - llvm::DenseSet reachable_ops; - std::queue ops_queue; - for (auto& input_op : input_ops) { - for (Value value : input_op->getOperands()) { - Operation* op = value.getDefiningOp(); - if (op != nullptr) ops_queue.push(op); - } - } - - while (!ops_queue.empty()) { - Operation* current_op = ops_queue.front(); - ops_queue.pop(); - reachable_ops.insert(current_op); - for (Value value : current_op->getOperands()) { - Operation* upstream_op = value.getDefiningOp(); - // Not visited, put it into the queue. - if (upstream_op != nullptr && - !llvm::is_contained(reachable_ops, upstream_op)) { - ops_queue.emplace(upstream_op); - } - } - } - - return reachable_ops; -} - -// Convert ophint to stub will remove all ops within the ophint region and -// place a new fused op right before the first op. -LogicalResult ConvertOphintToStub(StringRef stub_name, - OphintCompositeOp ophint_composite_op, - OpBuilder* builder, ModuleOp* module_op) { - // Step 1, find all ops reachable by inputs. - const llvm::DenseSet& reachable_by_inputs = - BfsForReachableOps(ophint_composite_op.GetAllInputOps()); - - // Step 2, find all ops reachable by outputs. - const llvm::DenseSet& reachable_by_outputs = - BfsForReachableOps(ophint_composite_op.GetAllOutputOps()); - - // Step 3, deal with inputs aggregation strategies. - const std::map& aggregated_inputs = - ophint_composite_op.GetAggregatedInputs(builder); - - // Step 4, get aggregated output types. - const std::map& aggregated_output_types = - ophint_composite_op.GetAggregatedOutputTypes(builder); - - // Step 5, create & place the fused op and rewire the inputs. - // Here we use a funcOp to represent the fused op. This "funcOp" will be - // converted to other ops (like UnidirectionalSequenceRNNOp) in the - // legalization phase. - Operation* inserted_before_op = ophint_composite_op.GetFirstOutputOp(); - Operation* fused_op = BuildFusedFuncOp( - stub_name, ophint_composite_op.function_name, inserted_before_op, - aggregated_inputs, aggregated_output_types, builder, module_op); - - for (const auto& kv : aggregated_inputs) { - Operation* op = kv.second.getDefiningOp(); - if (op == nullptr) return failure(); - op->moveBefore(fused_op); - } - - // Step 6, deal with outputs aggregation strategies and rewire the outputs. - ophint_composite_op.AggregateAndRewireOutputs(builder, fused_op); - - // Step 7, remove all the removable ops where - // (reachable_by_outputs - reachable_by_inputs) as removable and the rest - // ops are not removable. - // We also need to make sure all the output identity nodes are there. 
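// A minimal standalone sketch, not part of this diff: BfsForReachableOps above
// is an ordinary breadth-first search that walks backwards through defining
// ops (it seeds the queue with the operands of the boundary ops rather than
// the boundary ops themselves). The traversal itself, on an integer graph
// where `inputs_of` maps a node to the nodes it reads from (illustrative
// names):
#include <map>
#include <queue>
#include <set>
#include <vector>

std::set<int> ReachableFrom(const std::vector<int>& seeds,
                            const std::map<int, std::vector<int>>& inputs_of) {
  std::set<int> reachable;
  std::queue<int> pending;
  for (int seed : seeds) pending.push(seed);
  while (!pending.empty()) {
    int node = pending.front();
    pending.pop();
    if (!reachable.insert(node).second) continue;  // Already visited.
    auto it = inputs_of.find(node);
    if (it == inputs_of.end()) continue;
    for (int input : it->second)
      if (reachable.count(input) == 0) pending.push(input);
  }
  return reachable;
}
// ConvertOphintToStub then treats ops reachable from the outputs but not from
// the inputs (and not one of the kept output identity ops) as removable.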
- llvm::DenseSet ophinted_identity_nodes; - for (auto* output : ophint_composite_op.GetAllInUseOutputOps()) { - ophinted_identity_nodes.insert(output); - } - - auto removeRemovableOps = [&](Operation* op) { - if (reachable_by_inputs.count(op) == 0 && - reachable_by_outputs.count(op) != 0 && - ophinted_identity_nodes.count(op) == 0) { - op->dropAllDefinedValueUses(); - op->dropAllReferences(); - op->erase(); - } - }; - - builder->getBlock()->walk(removeRemovableOps); - - // Step 8: Topo sort to fix any invalid temporary IRs. - if (failed(TopoSortOperations(builder))) return failure(); - - return success(); -} - -struct ExtractOphintPass - : public PassWrapper> { - void runOnOperation() override; - void Verify(); - - private: - int ophint_composite_ops_count = 0; -}; - -// TODO(renjieliu): Current ophint extraction does not support inputs/outputs -// cross functions, we need to do that. -void ExtractOphintPass::runOnOperation() { - ModuleOp module = getOperation(); - for (auto function : module.getOps()) { - // Process block by block. - for (auto& bb : function.getBody()) { - // Find ophints. - const llvm::StringMap& ophint_composite_ops = - FindAllOphintNodes(&bb); - if (ophint_composite_ops.empty()) continue; - - // Verify: Make sure all ophint_composite_ops are valid. - // If not valid, we just don't do anything. - for (const auto& kv : ophint_composite_ops) { - if (failed(kv.getValue().VerifyOphint())) { - return; - } - } - - ophint_composite_ops_count = ophint_composite_ops.size(); - - // Convert. - OpBuilder builder = OpBuilder::atBlockEnd(&bb); - for (const auto& kv : ophint_composite_ops) { - if (failed(ConvertOphintToStub(kv.getKey(), kv.getValue(), &builder, - &module))) { - module.emitError() - << "Convert ophint failed, malformed inputs or outputs."; - return signalPassFailure(); - } - } - } - } -} - -void ExtractOphintPass::Verify() { - ModuleOp module = getOperation(); - int ophint_func_op_count = 0; - for (FuncOp func : getOperation().getOps()) { - for (const NamedAttribute attr : func.getAttrs()) { - if (attr.first == kTfLiteFunctionName) { - ophint_func_op_count++; - if (func.getNumArguments() == 0) { - module.emitError() << "Ophint function has no inputs."; - return signalPassFailure(); - } - if (func.getType().getNumResults() == 0) { - module.emitError() << "Ophint function has no outputs."; - return signalPassFailure(); - } - } - } - } - if (ophint_func_op_count != ophint_composite_ops_count) { - module.emitError() - << "Ophint converted functions do not match ophint regions founded."; - return signalPassFailure(); - } -} - -} // namespace - -/// Creates an instance of the TensorFlow Lite dialect ExtractOphintPass -/// pass. -std::unique_ptr> CreateExtractOphintPass() { - return std::make_unique(); -} - -static PassRegistration pass( - "tfl-extract-ophint", "Extract Ophint for TfLite dialect."); - -} // namespace TFL -} // namespace mlir diff --git a/tensorflow/compiler/mlir/lite/transforms/legalize_ophint_func_op.cc b/tensorflow/compiler/mlir/lite/transforms/legalize_ophint_func_op.cc deleted file mode 100644 index 652d10a53a8..00000000000 --- a/tensorflow/compiler/mlir/lite/transforms/legalize_ophint_func_op.cc +++ /dev/null @@ -1,295 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/StringMap.h" -#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project -#include "mlir/IR/Attributes.h" // from @llvm-project -#include "mlir/IR/Block.h" // from @llvm-project -#include "mlir/IR/Builders.h" // from @llvm-project -#include "mlir/IR/Function.h" // from @llvm-project -#include "mlir/IR/MLIRContext.h" // from @llvm-project -#include "mlir/IR/Module.h" // from @llvm-project -#include "mlir/IR/Operation.h" // from @llvm-project -#include "mlir/IR/OperationSupport.h" // from @llvm-project -#include "mlir/IR/StandardTypes.h" // from @llvm-project -#include "mlir/IR/SymbolTable.h" // from @llvm-project -#include "mlir/IR/Types.h" // from @llvm-project -#include "mlir/IR/Value.h" // from @llvm-project -#include "mlir/Pass/Pass.h" // from @llvm-project -#include "mlir/Pass/PassRegistry.h" // from @llvm-project -#include "mlir/Support/LLVM.h" // from @llvm-project -#include "mlir/Support/LogicalResult.h" // from @llvm-project -#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" - -namespace mlir { -namespace TFL { -namespace { - -constexpr char kTfLiteFunctionName[] = "_tflite_function_name"; -constexpr char kTfLiteFunctionInputIndex[] = "_tflite_function_input_index"; -constexpr char kUnidirectionalSequenceRnn[] = "UnidirectionalSequenceRnn"; -constexpr char kUnidirectionalSequenceLstm[] = "UnidirectionalSequenceLstm"; - -// This pass is used for converting to TFLite composite op like -// UnidirectionalSequenceRNN, UnidirectionalSequenceLSTM or SVDF Op. Currently, -// this pass is only for ophint converted function op only. See below diagram: -// -// InputOp1 InputOp2 ... -// \ / -// \ / -// call funcOp (say UnidirectionalSequenceRNN) -// | -// | -// OutputOp1 -// -// funcOp() { '_tflite_function_name' = 'UnidirectionalSequenceRNN'} -// -// || -// || -// \ / -// -// InputOp1 InputOp2 ... -// \ / -// \ / -// tfl.UnidirectionalSequenceRNN -// | -// | -// OutputOp1 -struct LegalizeOphintFuncOpPass - : public PassWrapper> { - void runOnOperation() override; -}; - -llvm::StringMap FindCompositeFuncOps(ModuleOp module) { - llvm::StringMap composite_func_ops; - for (FuncOp func : module.getOps()) { - if (func.getAttr(kTfLiteFunctionName)) - composite_func_ops[func.getName()] = func; - } - return composite_func_ops; -} - -LogicalResult BuildUnidirectionalSequenceRnnOp(FuncOp composite_func_op, - CallOp call_op, - OpBuilder* builder, - Operation** fused_op) { - // UnidirectionalSequenceRnn takes exactly 5 inputs. - if (composite_func_op.getNumArguments() != 5) return failure(); - if (call_op.getNumOperands() != 5) return failure(); - // UnidirectionalSequenceRnn has exactly 1 input. - if (call_op.getNumResults() != 1) return failure(); - - // Inputs is indexed at 0. - Value input = call_op.getOperand(0); - // Input_weight is indexed at 1. - Value weight = call_op.getOperand(1); - // Recurrent_weight is indexed at 2. - Value recurrent_weight = call_op.getOperand(2); - // Bias is indexed at 3. 
- Value bias = call_op.getOperand(3); - // Hidden_state is indexed at 4. - Value hidden_state = call_op.getOperand(4); - - // Build Output. - auto output_type = call_op.getResult(0).getType(); - - // Currently, ophinted RNN only supports time_major = True. - const bool time_major = true; - // Activation will always be TanH. - StringAttr fused_activation_function = builder->getStringAttr("TANH"); - - builder->setInsertionPoint(call_op.getOperation()); - *fused_op = builder->create( - call_op.getLoc(), output_type, input, weight, recurrent_weight, bias, - hidden_state, builder->getBoolAttr(time_major), - fused_activation_function); - return success(); -} - -LogicalResult BuildUnidirectionalSequenceLSTMOp(FuncOp composite_func_op, - CallOp call_op, - OpBuilder* builder, - Operation** fused_op) { - if (composite_func_op.getNumArguments() != call_op.getNumOperands()) - return failure(); - auto input_index_attr = composite_func_op.getAttr(kTfLiteFunctionInputIndex) - .cast() - .getValue(); - llvm::DenseMap fused_ops_index_to_call_op_args; - - for (int i = 0; i < call_op.getNumOperands(); ++i) { - int input_index = input_index_attr[i].cast().getInt(); - fused_ops_index_to_call_op_args.try_emplace(input_index, - call_op.getOperand(i)); - } - - constexpr int kUnidirectionalSequenceLSTMOpTotalIArgumentNum = 24; - - // We encounter some optional arguments not filled, so we need to create an - // empty Value. - Value none_value; - if (call_op.getNumOperands() < - kUnidirectionalSequenceLSTMOpTotalIArgumentNum) { - builder->setInsertionPoint(call_op.getOperation()); - none_value = builder->create( - call_op.getLoc(), builder->getNoneType(), builder->getUnitAttr()); - } - - // Prepare all operands for the UnidirectionalSequenceLSTMOp. - SmallVector operands; - for (int i = 0; i < kUnidirectionalSequenceLSTMOpTotalIArgumentNum; ++i) { - auto operand_it = fused_ops_index_to_call_op_args.find(i); - if (operand_it == fused_ops_index_to_call_op_args.end()) { - // Encounter optional arguments. - operands.push_back(none_value); - } else { - operands.push_back(operand_it->second); - } - } - - // Prepare output types. - SmallVector output_types; - // The output type set is somewhat adhoc here: The fused op only have exact - // one output while the call_op can have more than one output. (but we only - // take the last one). - // And here we check the outputs are not used (except the last one) if the - // call_op has more than one output. - if (call_op.getNumResults() > 1) { - for (int i = 0; i < call_op.getNumResults() - 1; ++i) { - // This one should not be used. - Value unused_output = call_op.getResult(i); - if (!unused_output.use_empty()) return failure(); - } - } - output_types.push_back( - call_op.getResult(call_op.getNumResults() - 1).getType()); - - // Prepare attributes. - SmallVector attributes; - attributes.push_back(builder->getNamedAttr("fused_activation_function", - builder->getStringAttr("TANH"))); - attributes.push_back( - builder->getNamedAttr("time_major", builder->getBoolAttr(true))); - - builder->setInsertionPoint(call_op.getOperation()); - - *fused_op = builder->create( - call_op.getLoc(), output_types, operands, attributes); - - return success(); -} - -LogicalResult ConvertTfLiteFusedOpIfAvailable(StringRef func_name, - FuncOp composite_func_op, - CallOp call_op, - OpBuilder* builder) { - Operation* fused_op = nullptr; - if (func_name == kUnidirectionalSequenceRnn) { - // TODO(renjieliu): Validate the func op inputs. 
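// A minimal standalone sketch, not part of this diff:
// BuildUnidirectionalSequenceLSTMOp above scatters the call operands into a
// fixed 24-slot operand list using the _tflite_function_input_index attribute
// and fills every remaining optional slot with a shared "none" value. The same
// scatter-and-pad step with ints standing in for mlir::Value (illustrative
// names):
#include <map>
#include <vector>

std::vector<int> ScatterWithPadding(const std::map<int, int>& index_to_operand,
                                    int total_arguments, int none_placeholder) {
  std::vector<int> operands(total_arguments, none_placeholder);
  for (const auto& kv : index_to_operand) {
    if (kv.first >= 0 && kv.first < total_arguments)
      operands[kv.first] = kv.second;  // kv.first is the fused operand index.
  }
  return operands;
}
// With total_arguments == 24 this mirrors
// kUnidirectionalSequenceLSTMOpTotalIArgumentNum in the deleted pass.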
- LogicalResult build_fused_op_result = BuildUnidirectionalSequenceRnnOp( - composite_func_op, call_op, builder, &fused_op); - if (failed(build_fused_op_result)) return build_fused_op_result; - call_op.replaceAllUsesWith(fused_op); - } else if (func_name == kUnidirectionalSequenceLstm) { - LogicalResult build_fused_op_result = BuildUnidirectionalSequenceLSTMOp( - composite_func_op, call_op, builder, &fused_op); - if (failed(build_fused_op_result)) return build_fused_op_result; - Value call_output = call_op.getResult(call_op.getNumResults() - 1); - if (call_output.getType() != fused_op->getResult(0).getType()) { - return failure(); - } - call_output.replaceAllUsesWith(fused_op->getResult(0)); - } else { // If we support more fused op, we should add the conversion here. - return failure(); - } - - // Delete call op. - Operation* call = call_op.getOperation(); - call->dropAllDefinedValueUses(); - call->dropAllReferences(); - call->erase(); - return success(); -} - -LogicalResult ConvertCallOps(llvm::StringMap* composite_func_ops, - ModuleOp* module) { - for (auto func : module->getOps()) { - // Ideally it will be much simpler if we can just use walk, but we also - // want to early return if encounter errors. :( - OpBuilder builder(func.getBody()); - // The call_op replacement within this loop works like an in-place - // replacement, so it should be safe to do so. - for (auto call_op : - llvm::make_early_inc_range(builder.getBlock()->getOps())) { - auto it = composite_func_ops->find(call_op.getCallee()); - if (it == composite_func_ops->end()) return failure(); - - // Replace the call op with TfLite fused op. - // Currently it's only handled case by case, but ideally it would be - // much better if we can do this automatically. - FuncOp composite_func_op = it->second; - StringRef func_name = composite_func_op.getAttr(kTfLiteFunctionName) - .cast() - .getValue(); - if (failed(ConvertTfLiteFusedOpIfAvailable(func_name, composite_func_op, - call_op, &builder))) - return failure(); - - composite_func_ops->erase(it); - // Delete func op. - Operation* func = composite_func_op.getOperation(); - func->erase(); - } - } - return success(); -} - -void LegalizeOphintFuncOpPass::runOnOperation() { - ModuleOp module = getOperation(); - - // Find all composite funcs, then for every call op inside every func op - // within the module, we go ahead and replace the callop with the tflite - // corresponding op and destroy the func op. This two-phase processing is - // intended: - // - // Every func op is meant to be used exactly once. - // Instead of finding the composite func then loop through the graph and - // convert the call op immediately, we break finding & converting into two - // phases. This changes the complexity from O(op_in_module * - // function_in_module * attr_in_func) to O(op_in_module) * O(map_look_up) + - // O(function_in_module * attr_in_func). O(op_in_module) is the dominant - // factor here and map look up should be very cheap. - llvm::StringMap composite_func_ops = FindCompositeFuncOps(module); - if (composite_func_ops.empty()) return; - if (failed(ConvertCallOps(&composite_func_ops, &module))) { - module.emitError() << "Legalize ophint: ConvertCallOps failed."; - return signalPassFailure(); - } -} - -} // namespace - -/// Creates an instance of the TensorFlow Lite dialect LegalizeOphintFuncOpPass -/// pass. 
-std::unique_ptr> CreateLegalizeOphintFuncOpPass() { - return std::make_unique(); -} - -static PassRegistration pass( - "tfl-legalize-ophint-func-op", "Convert composite op for TfLite dialect."); - -} // namespace TFL -} // namespace mlir diff --git a/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td b/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td index 586ddf6211f..4c6a16c2233 100644 --- a/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td +++ b/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td @@ -58,6 +58,9 @@ def HasSameStaticShapesPred : CPred<"HasSameStaticShapes($0.getDefiningOp())">; def HasSameStaticShapes : Constraint; def HasNotSameStaticShapes : Constraint, "op must have not static same input shapes">; +def CreateNoneValue : NativeCodeCall< + "$_builder.create($0.getLoc(), $_builder.getNoneType(), $_builder.getUnitAttr())">; + // Checks if the value has only one user. // TODO(karimnosseir): Move to a common place? def HasOneUse : Constraint>; @@ -208,6 +211,11 @@ def : Pat<(TF_LogicalOrOp $l, $r), (TFL_LogicalOrOp $l, $r)>; def : Pat<(TF_AddOp $lhs, $rhs), (TFL_AddOp $lhs, $rhs, TFL_AF_None)>; def : Pat<(TF_AddV2Op $lhs, $rhs), (TFL_AddOp $lhs, $rhs, TFL_AF_None)>; +// When batch size is known, TF BatchMatMul gets unfolded to TFL FullyConnected +// with additional ops. In the case of unknown batch size, the match will +// fall through to here and convert to TF Lite BatchMatMul. +def : Pat<(TF_BatchMatMulV2Op $lhs, $rhs, $adj_x, $adj_y), (TFL_BatchMatMulOp $lhs, $rhs, $adj_x, $adj_y)>; +def : Pat<(TF_BatchMatMulOp $lhs, $rhs, $adj_x, $adj_y), (TFL_BatchMatMulOp $lhs, $rhs, $adj_x, $adj_y)>; def : Pat<(TF_SubOp $lhs, $rhs), (TFL_SubOp $lhs, $rhs, TFL_AF_None)>; def : Pat<(TF_MulOp $lhs, $rhs), (TFL_MulOp $lhs, $rhs, TFL_AF_None)>; def : Pat<(TF_RealDivOp $lhs, $rhs), (TFL_DivOp $lhs, $rhs, TFL_AF_None)>; @@ -294,7 +302,7 @@ def : Pat<(TF_DepthToSpaceOp $input, $block_size, IsDataFormatNHWC:$data_format) (TFL_DepthToSpaceOp $input, (convertIntAttrTo32Bit $block_size))>; def : Pat<(TF_ResizeBilinearOp $images, $size, $align_corners, $half_pixel_centers), (TFL_ResizeBilinearOp $images, $size, $align_corners, $half_pixel_centers)>; -def : Pat<(TF_ResizeNearestNeighborOp $images, $size, $align_corners, ConstBoolAttrFalse:$half_pixel_centers), (TFL_ResizeNearestNeighborOp $images, $size, $align_corners)>; +def : Pat<(TF_ResizeNearestNeighborOp $images, $size, $align_corners, $half_pixel_centers), (TFL_ResizeNearestNeighborOp $images, $size, $align_corners, $half_pixel_centers)>; def : Pat<(TF_MirrorPadOp $arg0, $arg1, $cst), (TFL_MirrorPadOp $arg0, $arg1, $cst)>; @@ -343,6 +351,7 @@ def : Pat< (TF_TransposeOp $filter, (ConstantOp ConstantAttr, "{2, 0, 1, 3}">)), $out_backprop, + /*bias=*/ (CreateNoneValue $input_sizes), /*padding=*/ $padding, /*stride_h=*/ ExtractI32At<1>:$strides, /*stride_w=*/ ExtractI32At<2>:$strides)>; @@ -350,3 +359,6 @@ def : Pat< def : Pat< (TF_MatrixSetDiagOp $input, $diagonal), (TFL_MatrixSetDiagOp $input, $diagonal)>; + +def : Pat<(TF_ScatterNdOp I32Tensor:$indices, $updates, $shape), + (TFL_ScatterNdOp I32Tensor:$indices, $updates, $shape)>; diff --git a/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc b/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc index d9b33f3fa72..bfcbc190638 100644 --- a/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc +++ b/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc @@ -36,8 +36,8 @@ limitations under the License. 
#include "mlir/IR/PatternMatch.h" // from @llvm-project #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project -#include "mlir/Support/Functional.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/DialectConversion.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" #include "tensorflow/compiler/mlir/lite/quantization/quantization_utils.h" @@ -203,6 +203,26 @@ LogicalResult ConvertTFConcatOp::matchAndRewrite( return success(); } +// Converts any IntegerAttr to an IntegerAttr of an i32 type. +// The value won't change in the new attribute, but if the value is out of +// the bound of i32, the function returns a failure. +LogicalResult ConvertToI32Attr(IntegerAttr attr, IntegerAttr* attr_i32) { + if (attr.getType().isInteger(/*width=*/32)) { + *attr_i32 = attr; + return success(); + } + + int64_t value = attr.getInt(); + if (value > std::numeric_limits::max() || + value < std::numeric_limits::min()) { + return failure(); + } + + *attr_i32 = IntegerAttr::get( + IntegerType::get(/*width=*/32, attr.getContext()), value); + return success(); +} + LogicalResult ConvertTFConcatV2Op::matchAndRewrite( Operation* op, PatternRewriter& rewriter) const { auto tf_concat_op = cast(op); @@ -212,12 +232,16 @@ LogicalResult ConvertTFConcatV2Op::matchAndRewrite( // Extract axis attribute from constant axis tensor ElementsAttr axis; if (!matchPattern(tf_concat_op.axis(), m_Constant(&axis))) return failure(); + IntegerAttr axis_int = ExtractSingleElementAsInteger(axis); + + // "axis" operand could be a i64 tensor. Resolve it here. + IntegerAttr axis_i32; + if (failed(ConvertToI32Attr(axis_int, &axis_i32))) return failure(); StringAttr fused_activation_function = StringAttr::get("NONE", rewriter.getContext()); rewriter.replaceOpWithNewOp( - op, output_type, values, ExtractSingleElementAsInteger(axis), - fused_activation_function); + op, output_type, values, axis_i32, fused_activation_function); return success(); } @@ -288,12 +312,10 @@ LogicalResult ConvertTFSplitOp::matchAndRewrite( Operation* op, PatternRewriter& rewriter) const { auto tf_split_op = cast(op); - auto output_types = functional::map([](Value v) { return v.getType(); }, - tf_split_op.output()); // Number of splits cannot be negative. auto num_split = rewriter.getI32IntegerAttr(tf_split_op.num_split()); - rewriter.replaceOpWithNewOp(op, output_types, + rewriter.replaceOpWithNewOp(op, tf_split_op.output().getTypes(), tf_split_op.split_dim(), tf_split_op.value(), num_split); return success(); @@ -303,14 +325,12 @@ LogicalResult ConvertTFSplitVOp::matchAndRewrite( Operation* op, PatternRewriter& rewriter) const { auto tf_splitv_op = cast(op); - auto output_types = functional::map([](Value v) { return v.getType(); }, - tf_splitv_op.output()); // Number of splits cannot be negative. 
auto num_split = rewriter.getI32IntegerAttr(tf_splitv_op.num_split()); rewriter.replaceOpWithNewOp( - op, output_types, tf_splitv_op.value(), tf_splitv_op.size_splits(), - tf_splitv_op.split_dim(), num_split); + op, tf_splitv_op.output().getTypes(), tf_splitv_op.value(), + tf_splitv_op.size_splits(), tf_splitv_op.split_dim(), num_split); return success(); } @@ -402,13 +422,12 @@ LogicalResult ConvertTFUnpackOp::matchAndRewrite( auto tf_unpack_op = cast(op); auto input = tf_unpack_op.value(); - auto output_types = functional::map([](Value v) { return v.getType(); }, - tf_unpack_op.output()); auto num = rewriter.getI32IntegerAttr(tf_unpack_op.num()); // Axis can be negative. auto axis = rewriter.getI32IntegerAttr(tf_unpack_op.axis().getSExtValue()); - rewriter.replaceOpWithNewOp(op, output_types, input, num, axis); + rewriter.replaceOpWithNewOp(op, tf_unpack_op.output().getTypes(), + input, num, axis); return success(); } diff --git a/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc b/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc index 889f9dde00b..49be29065fe 100644 --- a/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc +++ b/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc @@ -49,7 +49,6 @@ limitations under the License. #include "mlir/IR/Value.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassRegistry.h" // from @llvm-project -#include "mlir/Support/Functional.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/DialectConversion.h" // from @llvm-project @@ -76,8 +75,6 @@ class TensorListPatternRewriter : public PatternRewriter { public: explicit TensorListPatternRewriter(FuncOp fn) : PatternRewriter(fn.getContext()) {} - - Operation *insert(Operation *op) override { return OpBuilder::insert(op); } }; /// Lower TensorList ops in functions for subsequent legalization. @@ -580,7 +577,7 @@ struct ConvertTensorListResize ArrayRef({input_handle, input_shape, size_diff, size}), /*then_branch=*/rewriter.getSymbolRefAttr(then_branch_op), /*else_branch=*/rewriter.getSymbolRefAttr(else_branch_op), - /*output_shapes=*/rewriter.getStrArrayAttr({"{}"}), + /*output_shapes=*/rewriter.getArrayAttr({}), /*is_stateless=*/rewriter.getBoolAttr(true)); return success(); } @@ -862,6 +859,7 @@ LogicalResult LowerStaticTensorListPass::RewriteFunction( target.addLegalOp(); target.addLegalOp(); target.addLegalOp(); + target.addLegalOp(); // Register fused LSTM/RNN ops as legal. target.addLegalOp(); target.addLegalOp(); diff --git a/tensorflow/compiler/mlir/lite/transforms/optimize.cc b/tensorflow/compiler/mlir/lite/transforms/optimize.cc index 423525616f6..a1aedb0af32 100644 --- a/tensorflow/compiler/mlir/lite/transforms/optimize.cc +++ b/tensorflow/compiler/mlir/lite/transforms/optimize.cc @@ -37,7 +37,6 @@ limitations under the License. 
#include "mlir/IR/PatternMatch.h" // from @llvm-project #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project -#include "mlir/Support/Functional.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" #include "tensorflow/compiler/mlir/lite/quantization/quantization_utils.h" @@ -52,6 +51,9 @@ namespace TFL { //===----------------------------------------------------------------------===// // The actual Optimize Pass. namespace { +constexpr char kRelu[] = "RELU"; +constexpr char kRelu6[] = "RELU6"; +constexpr char kRelu1[] = "RELU_N1_TO_1"; bool L2NormalizeReduceAxis(Value sq_op, DenseElementsAttr axis) { if (sq_op.getType().cast().getRank() - 1 == @@ -301,10 +303,11 @@ struct FuseFullyConnectedAndAdd : public OpRewritePattern { }; // TODO(b/136285429): Move to tablegen when variadic is supported. -struct FuseFullyConnectedAndRelu : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; +template +struct FuseFullyConnectedAndReluX : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; - LogicalResult matchAndRewrite(TFL::ReluOp relu_op, + LogicalResult matchAndRewrite(ReluXOp relu_op, PatternRewriter &rewriter) const override { Operation *input = relu_op.getOperand().getDefiningOp(); if (!isa_and_nonnull(input)) return failure(); @@ -312,7 +315,7 @@ struct FuseFullyConnectedAndRelu : public OpRewritePattern { if (fully_connected_op.fused_activation_function() != "NONE") return failure(); - auto new_activation_func = rewriter.getStringAttr("RELU"); + auto new_activation_func = rewriter.getStringAttr(Act); auto new_weights_format = rewriter.getStringAttr(fully_connected_op.weights_format()); auto new_keep_num_dims = @@ -709,7 +712,10 @@ void Optimize::runOnFunction() { // we explore these potentially first and then fuse the binary ops with the // following ops in a second pattern match. TFL::populateWithGenerated(ctx, &patterns); - patterns.insert, + FuseFullyConnectedAndReluX, + FuseFullyConnectedAndReluX, FuseFullyConnectedAndMul>(ctx); applyPatternsAndFoldGreedily(func, patterns); diff --git a/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td b/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td index 916782d95b3..a3244f31053 100644 --- a/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td +++ b/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td @@ -378,6 +378,19 @@ foreach BinaryOp = [TFL_FloorDivOp, TFL_FloorModOp, TFL_MinimumOp, (IsTailOfShape $rhs, $input)]>; } +// Reorder the element-wise value operations and the element move operations, +// such that the value operation happens before move operation. +foreach ValueOp = [TFL_CeilOp, TFL_ExpOp, TFL_FloorOp, TFL_NegOp, + TFL_ReluOp, TFL_Relu1Op, TFL_Relu6Op, TFL_RoundOp, + TFL_TanhOp, TFL_SqrtOp, TFL_SquareOp] in { + foreach MoveOp = [TFL_DepthToSpaceOp, TFL_ExpandDimsOp, TFL_SqueezeOp, + TFL_ReshapeOp, TFL_TransposeOp] in { + def : Pat<(ValueOp:$value (MoveOp:$move $input, $move_def)), + (MoveOp (ValueOp $input), $move_def), + [(HasOneUse $move)]>; + } +} + // Returns shape of a ranked tensor. // if called without a ranked tensor it will fail. 
def GetShape: NativeCodeCall<"GetShape($0)">; @@ -394,8 +407,9 @@ def : Pat<(TFL_ExpandDimsOp:$expand_dims_op $input, $dim), (ConstantOp (GetShape $expand_dims_op))), [(AnyStaticShapeTensor $expand_dims_op)]>; -class ValueEquals : Constraint : Constraint().getNumElements() == 1 &&" + "$0.isa() &&" "*$0.cast().getValues().begin() == " # val>>; // ReLU patterns @@ -403,13 +417,13 @@ def : Pat<(TFL_MinimumOp (TFL_MaximumOp $input, (ConstantOp $NegOne)), (ConstantOp $One)), (TFL_Relu1Op $input), - [(ValueEquals<"-1"> $NegOne), (ValueEquals<"1"> $One)]>; + [(FloatValueEquals<"-1"> $NegOne), (FloatValueEquals<"1"> $One)]>; def : Pat<(TFL_MaximumOp (TFL_MinimumOp $input, (ConstantOp $One)), (ConstantOp $NegOne)), (TFL_Relu1Op $input), - [(ValueEquals<"-1"> $NegOne), (ValueEquals<"1"> $One)]>; + [(FloatValueEquals<"-1"> $NegOne), (FloatValueEquals<"1"> $One)]>; def : Pat<(TFL_MaximumOp (TFL_MulOp:$mul_out $input1, (ConstantOp F32ElementsAttr:$alpha), TFL_AF_None), @@ -443,3 +457,21 @@ def : Pat<(TFL_AddOp // The constant folding in this pass might produce constant in the tf dialect. // This rule is to legalize these constant to the tfl dialect. def : Pat<(TF_ConstOp ElementsAttr:$value), (TFL_ConstOp $value)>; + +// Reorders adds to allow constant folding. +// Add --> Add $input, $constantA +// \--> $constantB +// To +// Add --> $input +// \--> Add ($constantA, $constantB) +foreach ActFun = [TFL_AF_Relu, TFL_AF_Relu6, TFL_AF_Relu1, TFL_AF_None] in { + def : Pat<(TFL_AddOp + (TFL_AddOp:$first_output $input, (ConstantOp $a), TFL_AF_None), + (ConstantOp $b), ActFun), + (TFL_AddOp $input, + (TFL_AddOp (ConstantOp $a), (ConstantOp $b), TFL_AF_None), + ActFun), + [(HasOneUse $first_output)]>; +} + + diff --git a/tensorflow/compiler/mlir/lite/transforms/passes.h b/tensorflow/compiler/mlir/lite/transforms/passes.h index a744a570929..105c9394fb4 100644 --- a/tensorflow/compiler/mlir/lite/transforms/passes.h +++ b/tensorflow/compiler/mlir/lite/transforms/passes.h @@ -67,13 +67,6 @@ std::unique_ptr> CreateTrimFunctionsPass( // pass. std::unique_ptr> CreatePrepareCompositeFunctionsPass(); -// Creates an instance of the TensorFlow Lite dialect ExtractOphint pass. -std::unique_ptr> CreateExtractOphintPass(); - -// Creates an instance of the TensorFlow Lite dialect LegalizeOphintFuncOpPass -// pass. The composite op is created from the ophint extraction pass. -std::unique_ptr> CreateLegalizeOphintFuncOpPass(); - // Creates an instance of the TensorFlow Lite dialect SplitMergedOperandsPass. std::unique_ptr> CreateSplitMergedOperandsPass(); @@ -83,7 +76,7 @@ std::unique_ptr> CreateOptimizeFunctionalOpsPass(); // Creates an instance of the TensorFlow Lite dialect pass to add default // quantization parameters. std::unique_ptr> CreateDefaultQuantParamsPass( - double default_min, double default_max); + double default_min, double default_max, bool is_signed); // Creates an instance of the TensorFlow Lite dialect pass to convert dense // tensor to sparse format. 
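// A minimal standalone sketch, not part of this diff: the add-reordering
// pattern above relies on the reassociation (input + a) + b == input + (a + b),
// which turns the inner add into a purely constant expression that the
// existing folders can evaluate. A small numeric check of that identity on a
// tensor flattened to a std::vector (the single-use and "no fused activation
// on the inner add" constraints are what keep the rewrite safe; as usual,
// floating-point reassociation is only bit-exact for values like the ones
// below):
#include <cassert>
#include <vector>

std::vector<float> AddScalar(std::vector<float> values, float scalar) {
  for (float& v : values) v += scalar;
  return values;
}

int main() {
  const std::vector<float> input = {1.0f, -2.5f, 4.0f};
  const float a = 3.0f, b = -1.0f;
  // Before the rewrite: (input + a) + b.  After: input + (a + b), where a + b
  // is folded to a single constant at compile time.
  assert(AddScalar(AddScalar(input, a), b) == AddScalar(input, a + b));
  return 0;
}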
diff --git a/tensorflow/compiler/mlir/lite/transforms/post_quantize.cc b/tensorflow/compiler/mlir/lite/transforms/post_quantize.cc index 97b7d57dbf4..7954f72046a 100644 --- a/tensorflow/compiler/mlir/lite/transforms/post_quantize.cc +++ b/tensorflow/compiler/mlir/lite/transforms/post_quantize.cc @@ -125,6 +125,7 @@ void PostQuantizePass::runOnFunction() { auto func = getFunction(); auto* ctx = func.getContext(); TFL::populateWithGenerated(ctx, &patterns); + patterns.insert>(ctx); applyPatternsAndFoldGreedily(func, patterns); if (!emit_quant_adaptor_ops_) { diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_patterns.td b/tensorflow/compiler/mlir/lite/transforms/prepare_patterns.td index aed99a70bff..f5b252773f6 100644 --- a/tensorflow/compiler/mlir/lite/transforms/prepare_patterns.td +++ b/tensorflow/compiler/mlir/lite/transforms/prepare_patterns.td @@ -53,7 +53,8 @@ def : Pat< def : Pattern< (TF_FusedBatchNormOp:$root $x, $scale, $offset, $mean, $variance, - F32Attr:$epsilon, $data_format, FalseBoolAttr:$is_training), + F32Attr:$epsilon, $exponential_avg_factor, + $data_format, FalseBoolAttr:$is_training), [(TF_AddOp (TF_MulOp $x, @@ -75,7 +76,8 @@ def : Pattern< def : Pattern< (TF_FusedBatchNormV3Op:$root $x, $scale, $offset, $mean, $variance, - F32Attr:$epsilon, $data_format, FalseBoolAttr:$is_training), + F32Attr:$epsilon, $exponential_avg_factor, + $data_format, FalseBoolAttr:$is_training), [(TF_AddOp (TF_MulOp $x, diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc b/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc index 4f25e434fac..87cae3dd957 100644 --- a/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc +++ b/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc @@ -70,6 +70,7 @@ class PrepareQuantizePass : public PassWrapper { public: // Constructor used by the PassRegistration and enforce uint8 quantization. + // This is only used by test. explicit PrepareQuantizePass() { if (quantize_signed) quant_specs_.inference_type = tensorflow::DT_QINT8; @@ -257,15 +258,16 @@ void PrepareQuantizePass::runOnFunction() { // convert all of them to signed. OwningRewritePatternList patterns; bool is_signed = quant_specs_.IsSignedInferenceType(); + int bit_width = quant_specs_.GetQuantizationTypeWidth(); if (is_signed) { patterns.insert>(ctx); // Convert quant stats to int8 quantization parameters. // Currently, only activation stats are imported, so narrow_range = false. - patterns.insert(8, false, true, ctx); + patterns.insert(bit_width, false, true, ctx); } else { // Convert quant stats to uint8 quantization parameters. // Currently, only activation stats are imported, so narrow_range = false. - patterns.insert(8, false, false, ctx); + patterns.insert(bit_width, false, false, ctx); } applyPatternsAndFoldGreedily(func, patterns); @@ -273,8 +275,9 @@ void PrepareQuantizePass::runOnFunction() { // Finally, the quantization parameters can be propagated to the rest of the // values (tensors). 
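// A minimal standalone sketch, not part of this diff: ConvertStatsToQDQs above
// turns recorded min/max activation statistics into quantization parameters
// for the configured bit width and signedness. The usual affine mapping
// real = scale * (quantized - zero_point), leaving out the zero-point nudging
// and narrow-range handling the real implementation performs (illustrative
// names):
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <utility>

std::pair<double, int64_t> ScaleAndZeroPoint(double rmin, double rmax,
                                             int bit_width, bool is_signed) {
  const int64_t qmin = is_signed ? -(int64_t{1} << (bit_width - 1)) : 0;
  const int64_t qmax = is_signed ? (int64_t{1} << (bit_width - 1)) - 1
                                 : (int64_t{1} << bit_width) - 1;
  // Extend the range so that real 0.0 is exactly representable.
  rmin = std::min(rmin, 0.0);
  rmax = std::max(rmax, 0.0);
  const double scale = (rmax - rmin) / static_cast<double>(qmax - qmin);
  const int64_t zero_point =
      scale == 0.0 ? 0 : static_cast<int64_t>(std::round(qmin - rmin / scale));
  return {scale, zero_point};
}
// With bit_width == 8, is_signed == true gives the int8 range [-128, 127] used
// on the signed path above; is_signed == false gives the uint8 range [0, 255].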
- ApplyQuantizationParamsPropagation(func, is_signed, disable_per_channel, - GetOpQuantSpec); + ApplyQuantizationParamsPropagation( + func, is_signed, disable_per_channel || quant_specs_.disable_per_channel, + GetOpQuantSpec); ConvertMlirQuantOpsToTFLQuantOps(func); } diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc b/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc index a9b23d38378..c5211bdfadb 100644 --- a/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc +++ b/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc @@ -46,9 +46,9 @@ limitations under the License. #include "mlir/IR/PatternMatch.h" // from @llvm-project #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project -#include "mlir/Support/Functional.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" #include "tensorflow/compiler/mlir/lite/quantization/quantization_utils.h" #include "tensorflow/compiler/mlir/lite/transforms/dilated_conv.h" @@ -322,9 +322,10 @@ class ConvertTFConv2D : public ConvertTFConvOp { // Create tensor type for the transpose result. auto filter_type = filter.getType().cast(); - auto result_shape = functional::map( - [filter_type](int64_t dim) { return filter_type.getDimSize(dim); }, - perm); + auto result_shape = + llvm::to_vector<4>(llvm::map_range(perm, [filter_type](int64_t dim) { + return filter_type.getDimSize(dim); + })); auto elem_type = filter_type.getElementType(); auto result_type = RankedTensorType::get(result_shape, elem_type); @@ -612,11 +613,35 @@ struct ConvertTFStridedSlice : public RewritePattern { #include "tensorflow/compiler/mlir/lite/transforms/generated_prepare_tf.inc" +// Returns success if all the operations in the `op`'s regions including `op` +// itself are legal in a TFLite pipeline. +LogicalResult ValidateOp(Operation *op) { + bool has_illegal_ops = false; + op->walk([&](Operation *op) { + if (isa(op)) { + has_illegal_ops = true; + op->emitOpError() << "is illegal in a TFLite pipeline"; + } + }); + + return failure(has_illegal_ops); +} + void PrepareTFPass::runOnFunction() { OwningRewritePatternList patterns; auto func = getFunction(); MLIRContext *ctx = &getContext(); + // Check illegal ops in a TFLite pipeline (e.g. trainning only ops) , since + // PrepareTFPass is the very first TFLite pass in the pipeline. + // TODO(jingpu): It might be better to split this check into its own pass + // to make things more modular. + if (failed(ValidateOp(func))) { + func.emitError() << "tfl-prepare-tf pass failed."; + signalPassFailure(); + return; + } + // This pattern was intented to uses TFL QDQs to preserve the quantization // parameters from the TF Quant ops, thus this pattern should run with the // first `applyPatternsAndFoldGreedily` method, which would otherwise removes diff --git a/tensorflow/compiler/mlir/lite/transforms/quantize.cc b/tensorflow/compiler/mlir/lite/transforms/quantize.cc index 20602338956..ba25b5c897c 100644 --- a/tensorflow/compiler/mlir/lite/transforms/quantize.cc +++ b/tensorflow/compiler/mlir/lite/transforms/quantize.cc @@ -29,7 +29,6 @@ limitations under the License. 
#include "mlir/IR/OperationSupport.h" // from @llvm-project #include "mlir/IR/PatternMatch.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project -#include "mlir/Support/Functional.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" #include "tensorflow/compiler/mlir/lite/quantization/quantization_utils.h" #include "tensorflow/compiler/mlir/lite/transforms/passes.h" diff --git a/tensorflow/compiler/mlir/lite/transforms/while_loop_outline.cc b/tensorflow/compiler/mlir/lite/transforms/while_loop_outline.cc index a7f2a625e65..707f4aba881 100644 --- a/tensorflow/compiler/mlir/lite/transforms/while_loop_outline.cc +++ b/tensorflow/compiler/mlir/lite/transforms/while_loop_outline.cc @@ -228,8 +228,7 @@ void WhileOutlinePass::OutlineWhile(WhileOp while_op) { Operation* new_op = OpBuilder(op).insert(Operation::create( op->getLoc(), op->getName(), new_types, operands, op->getAttrs(), - /*successors=*/{}, /*numRegions=*/2, - /*resizableOperandList=*/true)); + /*successors=*/{}, /*numRegions=*/2)); for (int i = 0; i < 2; ++i) new_op->getRegion(i).takeBody(op->getRegion(i)); op->replaceAllUsesWith(new_op->getResults().take_front(op->getNumResults())); op->erase(); diff --git a/tensorflow/compiler/mlir/lite/utils/lstm_utils.cc b/tensorflow/compiler/mlir/lite/utils/lstm_utils.cc index 1988dff048c..2f876c68fb8 100644 --- a/tensorflow/compiler/mlir/lite/utils/lstm_utils.cc +++ b/tensorflow/compiler/mlir/lite/utils/lstm_utils.cc @@ -94,9 +94,10 @@ Value Transpose(OpBuilder* builder, Value value_to_transpose, // Create tensor type for the transpose result. auto transpose_type = original_type; - auto transpose_shape = functional::map( - [transpose_type](int32_t dim) { return transpose_type.getDimSize(dim); }, - perm); + auto transpose_shape = + llvm::to_vector<8>(llvm::map_range(perm, [transpose_type](int32_t dim) { + return transpose_type.getDimSize(dim); + })); auto elem_type = transpose_type.getElementType(); auto result_type = RankedTensorType::get(transpose_shape, elem_type); diff --git a/tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc b/tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc index af594b0125d..11d3e7332db 100644 --- a/tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc +++ b/tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc @@ -30,7 +30,7 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h" #include "tensorflow/compiler/mlir/tensorflow/utils/device_util.h" #include "tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h" -#include "tensorflow/core/graph/graph_constructor.h" +#include "tensorflow/core/common_runtime/graph_constructor.h" #include "tensorflow/core/public/session_options.h" namespace tensorflow { @@ -127,6 +127,7 @@ Status MlirFunctionOptimizationPass::Run( GraphImportConfig import_config; import_config.graph_as_function = true; import_config.control_outputs = *control_ret_node_names; + import_config.upgrade_legacy = true; TF_ASSIGN_OR_RETURN(auto module_ref, ConvertGraphToMlir(**graph, debug_info, *flib_def, import_config, &context)); @@ -149,7 +150,6 @@ Status MlirFunctionOptimizationPass::Run( } GraphExportConfig export_config; - export_config.graph_as_function = true; absl::flat_hash_set control_ret_nodes; TF_RETURN_WITH_CONTEXT_IF_ERROR( ConvertMlirToGraph(*module_ref, export_config, graph, flib_def, diff --git a/tensorflow/compiler/mlir/op_or_arg_name_mapper.cc b/tensorflow/compiler/mlir/op_or_arg_name_mapper.cc index 272fab9cd1c..bce0ed4a33d 100644 --- a/tensorflow/compiler/mlir/op_or_arg_name_mapper.cc +++ b/tensorflow/compiler/mlir/op_or_arg_name_mapper.cc @@ -55,8 +55,10 @@ llvm::StringRef OpOrArgNameMapper::GetUniqueName(llvm::StringRef prefix) { // to be unique. auto& val = prefix_it.first->second; llvm::SmallString<64> probe_name(prefix); + probe_name.append(GetSuffixSeparator()); + const int probe_prefix_size = probe_name.size(); while (true) { - probe_name.resize(prefix.size()); + probe_name.resize(probe_prefix_size); // TODO(jpienaar): Subtract one so that the initial suffix is 0 instead // of 1. // TODO(jpienaar): Switch to radix 36 and update tests. diff --git a/tensorflow/compiler/mlir/op_or_arg_name_mapper.h b/tensorflow/compiler/mlir/op_or_arg_name_mapper.h index 108496e2283..6a52d13fbc0 100644 --- a/tensorflow/compiler/mlir/op_or_arg_name_mapper.h +++ b/tensorflow/compiler/mlir/op_or_arg_name_mapper.h @@ -64,6 +64,9 @@ class OpOrArgNameMapper { return op_or_val_to_name_; } + // Returns the separator used before uniqueing suffix. + virtual llvm::StringRef GetSuffixSeparator() { return ""; } + private: // Returns name from the location of the operation or value. virtual std::string GetName(OpOrVal op_or_val) = 0; diff --git a/tensorflow/compiler/mlir/python/BUILD b/tensorflow/compiler/mlir/python/BUILD index 666f89ac72f..1189a926383 100644 --- a/tensorflow/compiler/mlir/python/BUILD +++ b/tensorflow/compiler/mlir/python/BUILD @@ -12,6 +12,22 @@ cc_library( "//tensorflow/c:tf_status_helper", "//tensorflow/compiler/mlir/tensorflow:convert_graphdef", "//tensorflow/compiler/mlir/tensorflow:error_util", + # (yongtang) The graph_optimization_pass_registration needs to be part + # of a shared object that will be loaded whenever `import tensorflow` + # is run. The natural place is libtensorflow_framework.so. + # While adding graph_optimization_pass_registration to + # libtensorflow_framework.so is possible with some modification in + # dependency, many tests will fail due to multiple copies of LLVM. + # See https://github.com/tensorflow/tensorflow/pull/39231 for details. 
+ # Alternatively, we place graph_optimization_pass_registration here + # because: + # - tensorflow/python/_pywrap_mlir.so already depends on LLVM anyway + # - tensorflow/python/_pywrap_mlir.so always loaded as part of python + # binding + # TODO: It might be still preferrable to place graph_optimization_pass + # as part of the libtensorflow_framework.so, as it is the central + # place for core related components. + "//tensorflow/compiler/mlir/tensorflow:graph_optimization_pass_registration", "//tensorflow/compiler/mlir/tensorflow:import_utils", "@llvm-project//llvm:support", "@llvm-project//mlir:IR", diff --git a/tensorflow/compiler/mlir/python/mlir.cc b/tensorflow/compiler/mlir/python/mlir.cc index d0f6e015922..f22fb519a64 100644 --- a/tensorflow/compiler/mlir/python/mlir.cc +++ b/tensorflow/compiler/mlir/python/mlir.cc @@ -112,7 +112,7 @@ std::string ExperimentalConvertSavedModelV1ToMlir( // Convert the SavedModelBundle to an MLIR module. mlir::MLIRContext context; - auto module_or = ConvertSavedModelV1ToMlir(bundle, &context); + auto module_or = ConvertSavedModelV1ToMlir(bundle, {}, &context); if (!module_or.status().ok()) { Set_TF_Status_from_Status(status, module_or.status()); return "// error"; diff --git a/tensorflow/compiler/mlir/python/mlir_wrapper/BUILD b/tensorflow/compiler/mlir/python/mlir_wrapper/BUILD new file mode 100644 index 00000000000..78f4312da46 --- /dev/null +++ b/tensorflow/compiler/mlir/python/mlir_wrapper/BUILD @@ -0,0 +1,41 @@ +load("//tensorflow:tensorflow.bzl", "tf_python_pybind_extension") + +package(licenses = ["notice"]) + +tf_python_pybind_extension( + name = "mlir_wrapper", + srcs = [ + "attrs.cc", + "basic_classes.cc", + "builders.cc", + "mlir_wrapper.cc", + "mlir_wrapper.h", + "ops.cc", + "types.cc", + ], + module_name = "mlir_wrapper", + visibility = ["//visibility:public"], + deps = [ + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", + "//tensorflow/python:pybind11_lib", + "//tensorflow/python:pybind11_status", + "@llvm-project//llvm:support", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:StandardOps", + "@pybind11", + ], +) + +tf_python_pybind_extension( + name = "filecheck_wrapper", + srcs = ["filecheck_wrapper.cc"], + module_name = "filecheck_wrapper", + visibility = ["//visibility:public"], + deps = [ + "//tensorflow/python:pybind11_lib", + "//tensorflow/python:pybind11_status", + "@llvm-project//llvm:support", + "@pybind11", + ], +) diff --git a/tensorflow/compiler/mlir/python/mlir_wrapper/attrs.cc b/tensorflow/compiler/mlir/python/mlir_wrapper/attrs.cc new file mode 100644 index 00000000000..ca7faf2e1d3 --- /dev/null +++ b/tensorflow/compiler/mlir/python/mlir_wrapper/attrs.cc @@ -0,0 +1,25 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "tensorflow/compiler/mlir/python/mlir_wrapper/mlir_wrapper.h" + +void init_attrs(py::module& m) { + py::class_(m, "Attribute"); + py::class_(m, "IntegerAttr") + .def("get", + py::overload_cast(&mlir::IntegerAttr::get)); +} diff --git a/tensorflow/compiler/mlir/python/mlir_wrapper/basic_classes.cc b/tensorflow/compiler/mlir/python/mlir_wrapper/basic_classes.cc new file mode 100644 index 00000000000..25adb44fe1d --- /dev/null +++ b/tensorflow/compiler/mlir/python/mlir_wrapper/basic_classes.cc @@ -0,0 +1,49 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "llvm/Support/FileCheck.h" +#include "mlir/IR/Block.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Region.h" // from @llvm-project +#include "tensorflow/compiler/mlir/python/mlir_wrapper/mlir_wrapper.h" + +void init_basic_classes(py::module& m) { + py::class_(m, "MLIRContext").def(py::init<>()); + + py::class_(m, "Location"); + + py::class_(m, "UnknownLoc") + .def("get", &mlir::UnknownLoc::get); + + py::class_(m, "Region") + .def("back", &mlir::Region::back, py::return_value_policy::reference) + .def("front", &mlir::Region::front, py::return_value_policy::reference) + .def("add_block", [](mlir::Region& r) { r.push_back(new mlir::Block); }) + .def("push_back", &mlir::Region::push_back) + .def("size", [](mlir::Region& r) { return r.getBlocks().size(); }) + .def("front", &mlir::Region::front, py::return_value_policy::reference); + py::class_(m, "Block_Iterator"); + py::class_(m, "Block") + .def("new", ([]() { return new mlir::Block; }), + py::return_value_policy::reference) + .def("end", &mlir::Block::end) + .def("addArgument", &mlir::Block::addArgument); + + py::class_(m, "Value").def("getType", &mlir::Value::getType); + py::class_(m, "OpResult"); + py::class_(m, "BlockArgument"); +} diff --git a/tensorflow/compiler/mlir/python/mlir_wrapper/builders.cc b/tensorflow/compiler/mlir/python/mlir_wrapper/builders.cc new file mode 100644 index 00000000000..338f17ed6df --- /dev/null +++ b/tensorflow/compiler/mlir/python/mlir_wrapper/builders.cc @@ -0,0 +1,51 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
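In the pybind11 sources above and below, the class template arguments appear to have been dropped during extraction (e.g. `py::class_(m, "Attribute")`). The bindings presumably follow the usual pybind11 pattern; a generic sketch of that pattern on a throwaway C++ type, not the actual MLIR bindings:

```cpp
#include "pybind11/pybind11.h"

namespace py = pybind11;

// A toy type standing in for an MLIR class such as mlir::IntegerAttr.
struct Example {
  int value;
  static Example get(int v) { return Example{v}; }
  static Example get(double v) { return Example{static_cast<int>(v)}; }
};

PYBIND11_MODULE(example_wrapper, m) {
  // The template argument names the C++ type being exposed; the string is the
  // Python-visible class name. attrs.cc presumably reads
  // py::class_<mlir::IntegerAttr>(m, "IntegerAttr") and so on.
  py::class_<Example>(m, "Example")
      .def_readonly("value", &Example::value)
      // py::overload_cast<...> picks one overload of a factory function,
      // matching the IntegerAttr::get binding shown above.
      .def_static("get", py::overload_cast<int>(&Example::get));
}
```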
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "mlir/IR/Builders.h" // from @llvm-project + +#include "tensorflow/compiler/mlir/python/mlir_wrapper/mlir_wrapper.h" + +void init_builders(py::module& m) { + py::class_(m, "Builder") + .def(py::init()) + .def("getFunctionType", + [](mlir::Builder& b, std::vector inputs, + std::vector outputs) { + return b.getFunctionType(llvm::ArrayRef(inputs), + llvm::ArrayRef(outputs)); + }); + py::class_(m, "OpBuilder") + .def(py::init()) + .def(py::init()) + .def(py::init()) + .def(py::init()) + .def("getUnknownLoc", &mlir::OpBuilder::getUnknownLoc) + .def("setInsertionPoint", + py::overload_cast( + &mlir::OpBuilder::setInsertionPoint)) + .def("saveInsertionPoint", &mlir::OpBuilder::saveInsertionPoint) + .def("restoreInsertionPoint", &mlir::OpBuilder::restoreInsertionPoint) + .def( + "createOperation", + [](mlir::OpBuilder& opb, mlir::OperationState& state) { + return opb.createOperation(state); + }, + py::return_value_policy::reference) + .def("getContext", &mlir::OpBuilder::getContext, + py::return_value_policy::reference); + + py::class_(m, "OpBuilder_InsertionPoint") + .def("getBlock", &mlir::OpBuilder::InsertPoint::getBlock); +} diff --git a/tensorflow/compiler/mlir/python/mlir_wrapper/filecheck_wrapper.cc b/tensorflow/compiler/mlir/python/mlir_wrapper/filecheck_wrapper.cc new file mode 100644 index 00000000000..8a841856b72 --- /dev/null +++ b/tensorflow/compiler/mlir/python/mlir_wrapper/filecheck_wrapper.cc @@ -0,0 +1,36 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
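builders.cc and ops.cc expose `OpBuilder`, `FuncOp`, and `ReturnOp` to Python. Roughly the same flow written directly in C++ against the MLIR API of this revision (the function name and empty signature are invented; it assumes the Standard dialect is registered, as the wrapper's `registerDialects()` does):

```cpp
#include "mlir/Dialect/StandardOps/IR/Ops.h"  // from @llvm-project
#include "mlir/IR/Builders.h"                 // from @llvm-project
#include "mlir/IR/Function.h"                 // from @llvm-project
#include "mlir/IR/MLIRContext.h"              // from @llvm-project

// What the Python sequence FuncOp.create(...) / addEntryBlock / OpBuilder /
// ReturnOp.create(...) drives on the C++ side.
mlir::FuncOp BuildEmptyFunction(mlir::MLIRContext *context) {
  mlir::OpBuilder builder(context);
  mlir::Location loc = builder.getUnknownLoc();

  // Create a 0-arg, 0-result function and give it an entry block, exactly as
  // the FuncOp binding above does.
  auto func_type = builder.getFunctionType(/*inputs=*/{}, /*results=*/{});
  auto func = mlir::FuncOp::create(loc, "main", func_type);
  func.addEntryBlock();

  // Point the builder at the entry block and terminate it with std.return.
  builder.setInsertionPointToEnd(&func.getBody().front());
  builder.create<mlir::ReturnOp>(loc);
  return func;
}
```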
+==============================================================================*/ + +#include "llvm/Support/FileCheck.h" +#include "llvm/Support/SourceMgr.h" +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" +#include "tensorflow/python/lib/core/pybind11_lib.h" +#include "tensorflow/python/lib/core/pybind11_status.h" + +PYBIND11_MODULE(filecheck_wrapper, m) { + m.def("check", [](std::string input, std::string check) { + llvm::FileCheckRequest fcr; + llvm::FileCheck fc(fcr); + llvm::SourceMgr SM = llvm::SourceMgr(); + SM.AddNewSourceBuffer(llvm::MemoryBuffer::getMemBuffer(input), + llvm::SMLoc()); + SM.AddNewSourceBuffer(llvm::MemoryBuffer::getMemBuffer(check), + llvm::SMLoc()); + llvm::Regex regex = fc.buildCheckPrefixRegex(); + fc.readCheckFile(SM, llvm::StringRef(check), regex); + return fc.checkInput(SM, llvm::StringRef(input)); + }); +} diff --git a/tensorflow/lite/python/optimize/sparsification_wrapper_pybind11.cc b/tensorflow/compiler/mlir/python/mlir_wrapper/mlir_wrapper.cc similarity index 51% rename from tensorflow/lite/python/optimize/sparsification_wrapper_pybind11.cc rename to tensorflow/compiler/mlir/python/mlir_wrapper/mlir_wrapper.cc index 6c63c83f45e..6f468cd4267 100644 --- a/tensorflow/lite/python/optimize/sparsification_wrapper_pybind11.cc +++ b/tensorflow/compiler/mlir/python/mlir_wrapper/mlir_wrapper.cc @@ -12,24 +12,27 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ + +#include "tensorflow/compiler/mlir/python/mlir_wrapper/mlir_wrapper.h" + +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project #include "pybind11/pybind11.h" -#include "pybind11/pytypes.h" -#include "tensorflow/lite/python/optimize/sparsification_wrapper.h" +#include "pybind11/stl.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/python/lib/core/pybind11_lib.h" +#include "tensorflow/python/lib/core/pybind11_status.h" -namespace py = pybind11; -using tflite::sparsification_wrapper::SparsificationWrapper; +PYBIND11_MODULE(mlir_wrapper, m) { + m.def("registerDialects", []() { + mlir::registerDialect(); + mlir::registerDialect(); + mlir::registerDialect(); + }); -PYBIND11_MODULE(_pywrap_tensorflow_lite_sparsification_wrapper, m) { - m.doc() = R"pbdoc( - _pywrap_tensorflow_lite_sparsification_wrapper - ----- - )pbdoc"; - py::class_(m, "SparsificationWrapper") - .def(py::init([](py::handle& data) { - return ::SparsificationWrapper::CreateWrapperCPPFromBuffer(data.ptr()); - })) - .def("SparsifyModel", [](SparsificationWrapper& self) { - return tensorflow::pyo_or_throw(self.SparsifyModel()); - }); + init_basic_classes(m); + init_types(m); + init_builders(m); + init_ops(m); + init_attrs(m); } diff --git a/tensorflow/compiler/mlir/python/mlir_wrapper/mlir_wrapper.h b/tensorflow/compiler/mlir/python/mlir_wrapper/mlir_wrapper.h new file mode 100644 index 00000000000..562c59b43e1 --- /dev/null +++ b/tensorflow/compiler/mlir/python/mlir_wrapper/mlir_wrapper.h @@ -0,0 +1,30 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_PYTHON_MLIR_WRAPPER_MLIR_WRAPPER_H +#define TENSORFLOW_COMPILER_MLIR_PYTHON_MLIR_WRAPPER_MLIR_WRAPPER_H + +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" + +namespace py = pybind11; + +void init_basic_classes(py::module& m); +void init_types(py::module& m); +void init_builders(py::module& m); +void init_ops(py::module& m); +void init_attrs(py::module& m); + +#endif // TENSORFLOW_COMPILER_MLIR_PYTHON_MLIR_WRAPPER_MLIR_WRAPPER_H diff --git a/tensorflow/compiler/mlir/python/mlir_wrapper/ops.cc b/tensorflow/compiler/mlir/python/mlir_wrapper/ops.cc new file mode 100644 index 00000000000..4432829653e --- /dev/null +++ b/tensorflow/compiler/mlir/python/mlir_wrapper/ops.cc @@ -0,0 +1,194 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project + +#include "mlir/IR/Function.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "tensorflow/compiler/mlir/python/mlir_wrapper/mlir_wrapper.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" + +void init_ops(py::module& m) { + py::class_>( + m, "Operation") + .def("getRegion", &mlir::Operation::getRegion, + py::return_value_policy::reference) + .def("getResult", &mlir::Operation::getResult) + .def("dump", &mlir::Operation::dump) + .def("getNumResults", &mlir::Operation::getNumResults); + + py::class_(m, "OperationState") + .def(py::init([](mlir::Location loc, std::string name) { + return mlir::OperationState(loc, llvm::StringRef(name)); + })) + .def("addTypes", + [](mlir::OperationState& state, std::vector tys) { + state.addTypes(mlir::ArrayRef(tys)); + }) + .def("addOperands", + [](mlir::OperationState& os, std::vector ops) { + os.addOperands(mlir::ArrayRef(ops)); + }) + .def("addRegion", py::overload_cast<>(&mlir::OperationState::addRegion), + py::return_value_policy::reference); + + py::class_(m, "ModuleOp") + .def("create", + [](mlir::Location loc) { return mlir::ModuleOp::create(loc); }) + .def("push_back", + [](mlir::ModuleOp& m, mlir::FuncOp f) { m.push_back(f); }) + .def("dump", &mlir::ModuleOp::dump) + .def("getAsStr", [](mlir::ModuleOp& m) { + std::string str; + llvm::raw_string_ostream os(str); + m.print(os); + return os.str(); + }); + + py::class_(m, "FuncOp") + .def("create", + [](mlir::Location location, std::string name, + mlir::FunctionType type) { + auto func = mlir::FuncOp::create(location, name, type); + func.addEntryBlock(); + return func; + }) + .def( + "getBody", + [](mlir::FuncOp& f) -> mlir::Region& { return f.getBody(); }, + py::return_value_policy::reference) + .def("getArguments", + [](mlir::FuncOp& f) { return f.getArguments().vec(); }) + .def("getName", [](mlir::FuncOp& f) { return f.getName().str(); }) + .def("getType", &mlir::FuncOp::getType); + + py::class_(m, "ReturnOp") + .def("create", + [](mlir::OpBuilder& opb, mlir::Location loc, + std::vector values) -> mlir::Operation* { + return opb + .create(loc, + mlir::ArrayRef(values)) + .getOperation(); + }); + + // mlir::TF::AddOp + py::class_(m, "Tf_AddV2Op") + .def("create", + [](mlir::OpBuilder& opb, mlir::Location loc, mlir::Value x, + mlir::Value y) -> mlir::Operation* { + return opb.create(loc, x, y).getOperation(); + }); + + py::class_(m, "Tf_AnyOp") + .def("create", + [](mlir::OpBuilder& opb, mlir::Location loc, mlir::Value input, + mlir::Value reduction_indices, + bool keep_dims = false) -> mlir::Operation* { + return opb + .create(loc, opb.getI1Type(), input, + reduction_indices, keep_dims) + .getOperation(); + }); + + // mlir::TF::ConstOp + py::class_(m, "Tf_ConstOp") + .def("create", + [](mlir::OpBuilder& opb, mlir::Location loc, + mlir::Attribute value) -> mlir::Operation* { + return opb.create(loc, value).getOperation(); + }); + + // mlir::TF::EqualOp + py::class_(m, "Tf_EqualOp") + .def("create", + [](mlir::OpBuilder& opb, mlir::Location loc, mlir::Value x, + mlir::Value y) -> mlir::Operation* { + return opb + .create(loc, x, y, opb.getBoolAttr(true)) + .getOperation(); + }); + + // mlir::TF::GreaterEqualOp + py::class_(m, "Tf_GreaterEqualOp") + .def("create", + [](mlir::OpBuilder& opb, mlir::Location loc, mlir::Value x, + mlir::Value y) -> mlir::Operation* { + return opb.create(loc, x, y) + 
.getOperation(); + }); + + // mlir::TF::GreaterOp + py::class_(m, "Tf_GreaterOp") + .def("create", + [](mlir::OpBuilder& opb, mlir::Location loc, mlir::Value x, + mlir::Value y) -> mlir::Operation* { + return opb.create(loc, x, y).getOperation(); + }); + + // mlir::TF::LegacyCallOp + py::class_(m, "Tf_LegacyCallOp") + .def("create", + [](mlir::OpBuilder& opb, mlir::Location loc, + std::vector output, std::vector args, + std::string f) -> mlir::Operation* { + return opb + .create( + loc, mlir::ArrayRef(output), + mlir::ArrayRef(args), mlir::StringRef(f)) + .getOperation(); + }); + + // mlir::TF::LessEqualOp + py::class_(m, "Tf_LessEqualOp") + .def("create", + [](mlir::OpBuilder& opb, mlir::Location loc, mlir::Value x, + mlir::Value y) -> mlir::Operation* { + return opb.create(loc, x, y).getOperation(); + }); + + // mlir::TF::LessOp + py::class_(m, "Tf_LessOp") + .def("create", + [](mlir::OpBuilder& opb, mlir::Location loc, mlir::Value x, + mlir::Value y) -> mlir::Operation* { + return opb.create(loc, x, y).getOperation(); + }); + + // mlir::TF::NegOp + py::class_(m, "Tf_NegOp") + .def("create", + [](mlir::OpBuilder& opb, mlir::Location loc, + mlir::Value x) -> mlir::Operation* { + return opb.create(loc, x).getOperation(); + }); + + py::class_(m, "Tf_NotEqualOp") + .def("create", [](mlir::OpBuilder& opb, mlir::Location loc, mlir::Value x, + mlir::Value y) { + return opb + .create( + loc, x, y, mlir::BoolAttr::get(true, opb.getContext())) + .getOperation(); + }); + + // mlir::TF::SubOp + py::class_(m, "Tf_SubOp") + .def("create", + [](mlir::OpBuilder& opb, mlir::Location loc, mlir::Value x, + mlir::Value y) -> mlir::Operation* { + return opb.create(loc, x, y).getOperation(); + }); +} diff --git a/tensorflow/compiler/mlir/python/mlir_wrapper/types.cc b/tensorflow/compiler/mlir/python/mlir_wrapper/types.cc new file mode 100644 index 00000000000..2be67f8e93e --- /dev/null +++ b/tensorflow/compiler/mlir/python/mlir_wrapper/types.cc @@ -0,0 +1,48 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
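As elsewhere in these wrappers, the `opb.create(...)` calls above have lost their template arguments in extraction; they presumably read `opb.create<mlir::TF::AddV2Op>(loc, x, y)` and so on. A hedged C++ sketch of that builder pattern, assuming two `mlir::Value`s of compatible tensor type are already in scope:

```cpp
#include "mlir/IR/Builders.h"   // from @llvm-project
#include "mlir/IR/Operation.h"  // from @llvm-project
#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h"

// Mirrors the Tf_AddV2Op / Tf_NegOp "create" lambdas bound in ops.cc:
// each one is a thin wrapper over OpBuilder::create<OpTy>(...).
mlir::Operation *CreateAddThenNeg(mlir::OpBuilder &builder, mlir::Location loc,
                                  mlir::Value x, mlir::Value y) {
  // tf.AddV2 %x, %y
  auto add = builder.create<mlir::TF::AddV2Op>(loc, x, y);
  // tf.Neg of the sum; getResult() feeds one op's output into the next.
  auto neg = builder.create<mlir::TF::NegOp>(loc, add.getResult());
  return neg.getOperation();
}
```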
+==============================================================================*/ + +#include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "tensorflow/compiler/mlir/python/mlir_wrapper/mlir_wrapper.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" + +void init_types(py::module& m) { + // Type + py::class_ Type(m, "Type"); + Type.def("getKind", &mlir::Type::getKind); + + // Type Enums + py::enum_(Type, "StandardTypes_Kind") + .value("BF16", mlir::StandardTypes::BF16); + + // Type Sub-classes + py::class_(m, "FunctionType") + .def("getResults", + [](mlir::FunctionType& ft) { return ft.getResults().vec(); }); + + py::class_(m, "FloatType") + .def("get", &mlir::FloatType::get); + + py::class_(m, "IntegerType") + .def("get", py::overload_cast( + &mlir::IntegerType::get)); + + py::class_(m, "UnrankedTensorType") + .def("get", &mlir::UnrankedTensorType::get); + + py::class_(m, "RankedTensorType") + .def("get", [](std::vector shape, mlir::Type ty) { + return mlir::RankedTensorType::get(mlir::ArrayRef(shape), ty); + }); +} diff --git a/tensorflow/compiler/mlir/runlit.cfg.py b/tensorflow/compiler/mlir/runlit.cfg.py index ddb968434c4..f1271d0da24 100644 --- a/tensorflow/compiler/mlir/runlit.cfg.py +++ b/tensorflow/compiler/mlir/runlit.cfg.py @@ -70,8 +70,9 @@ tool_dirs = config.mlir_tf_tools_dirs + [ ] tool_names = [ 'mlir-opt', 'mlir-translate', 'tf-opt', 'tf_tfl_translate', - 'flatbuffer_to_string', 'flatbuffer_translate', 'tf-mlir-translate', - 'mlir-tflite-runner', 'tfcompile', 'json_to_flatbuffer', 'xla-opt' + 'tf_tfjs_translate', 'flatbuffer_to_string', 'flatbuffer_translate', + 'tf-mlir-translate', 'mlir-tflite-runner', 'tfcompile', + 'json_to_flatbuffer', 'xla-gpu-opt', 'xla-opt' ] tools = [ToolSubst(s, unresolved='ignore') for s in tool_names] llvm_config.add_tool_substitutions(tools, tool_dirs) diff --git a/tensorflow/compiler/mlir/runlit.site.cfg.py b/tensorflow/compiler/mlir/runlit.site.cfg.py index b623ca8e849..3e7596c75d7 100644 --- a/tensorflow/compiler/mlir/runlit.site.cfg.py +++ b/tensorflow/compiler/mlir/runlit.site.cfg.py @@ -44,8 +44,10 @@ mlir_tf_tools_dirs = [ 'tensorflow/compiler/mlir', 'tensorflow/compiler/mlir/lite', 'tensorflow/compiler/mlir/tensorflow', + 'tensorflow/compiler/mlir/tfjs', 'tensorflow/compiler/mlir/xla', - 'tensorflow/compiler/aot' + 'tensorflow/compiler/aot', + 'tensorflow/compiler/xla/service/mlir_gpu', ] config.mlir_tf_tools_dirs = [ os.path.join(real_test_srcdir, os.environ['TEST_WORKSPACE'], s) diff --git a/tensorflow/compiler/mlir/tensorflow/BUILD b/tensorflow/compiler/mlir/tensorflow/BUILD index 4305d64c864..9b2e6f0292b 100644 --- a/tensorflow/compiler/mlir/tensorflow/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/BUILD @@ -10,6 +10,7 @@ package_group( name = "friends", includes = ["//third_party/mlir:subpackages"], packages = [ + "//learning/brain/experimental/dtensor/...", "//learning/brain/experimental/tfrt/...", "//learning/pathways/data_parallel/tf2xla/...", "//tensorflow/compiler/...", @@ -34,7 +35,8 @@ filegroup( "ir/tf_ops.td", "@llvm-project//mlir:OpBaseTdFiles", "@llvm-project//mlir:include/mlir/Interfaces/CallInterfaces.td", - "@llvm-project//mlir:include/mlir/Interfaces/SideEffects.td", + "@llvm-project//mlir:include/mlir/Interfaces/InferTypeOpInterface.td", + "@llvm-project//mlir:include/mlir/Interfaces/SideEffectInterfaces.td", ], ) @@ -131,8 +133,9 @@ gentbl( tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "ir/tf_executor_ops.td", td_srcs = [ - "@llvm-project//mlir:include/mlir/IR/OpBase.td", + 
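types.cc wraps the standard MLIR type constructors. The equivalent C++ calls, with the argument lists that extraction dropped filled in only as a plausible reading:

```cpp
#include <cstdint>
#include "llvm/ADT/SmallVector.h"
#include "mlir/IR/Builders.h"       // from @llvm-project
#include "mlir/IR/MLIRContext.h"    // from @llvm-project
#include "mlir/IR/StandardTypes.h"  // from @llvm-project

// C++ counterparts of the type factories bound in types.cc.
void BuildSomeTypes(mlir::MLIRContext *context) {
  mlir::Builder builder(context);

  // Integer element type: i32.
  mlir::Type i32 = builder.getIntegerType(32);

  // RankedTensorType::get(shape, elementType): tensor<2x?xi32>, where -1
  // (ShapedType::kDynamicSize) marks a dynamic dimension.
  llvm::SmallVector<int64_t, 2> shape = {2, -1};
  mlir::Type ranked = mlir::RankedTensorType::get(shape, i32);

  // UnrankedTensorType::get(elementType): tensor<*xi32>.
  mlir::Type unranked = mlir::UnrankedTensorType::get(i32);

  // FunctionType, as built by the Builder binding's getFunctionType helper.
  mlir::FunctionType fn_type = builder.getFunctionType({ranked}, {unranked});
  (void)fn_type;
}
```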
":tensorflow_ops_td_files", "@llvm-project//mlir:include/mlir/Dialect/StandardOps/IR/Ops.td", + "@llvm-project//mlir:include/mlir/IR/OpBase.td", ], ) @@ -213,6 +216,20 @@ cc_library( alwayslink = 1, ) +cc_library( + name = "tensorflow_attributes", + srcs = [ + "ir/tf_attributes.cc", + ], + hdrs = [ + "ir/tf_attributes.h", + ], + deps = [ + "@llvm-project//llvm:support", + "@llvm-project//mlir:IR", + ], +) + cc_library( name = "tensorflow_types", srcs = [ @@ -224,6 +241,7 @@ cc_library( ], deps = [ "@llvm-project//llvm:support", + "@llvm-project//mlir:Dialect", "@llvm-project//mlir:IR", ], ) @@ -264,6 +282,7 @@ cc_library( includes = ["include"], deps = [ ":error_util", + ":tensorflow_attributes", ":tensorflow_canonicalize_inc_gen", ":tensorflow_device_ops_inc_gen", ":tensorflow_executor_inc_gen", @@ -281,6 +300,7 @@ cc_library( "@llvm-project//mlir:DerivedAttributeOpInterface", "@llvm-project//mlir:Dialect", "@llvm-project//mlir:IR", + "@llvm-project//mlir:InferTypeOpInterface", "@llvm-project//mlir:Parser", "@llvm-project//mlir:Pass", "@llvm-project//mlir:SideEffects", @@ -325,6 +345,38 @@ cc_library( ], ) +gentbl( + name = "tf_data_optimization_inc_gen", + tbl_outs = [ + ( + "-gen-rewriters", + "transforms/generated_tf_data_optimization.inc", + ), + ], + tblgen = "@llvm-project//mlir:mlir-tblgen", + td_file = "transforms/tf_data_optimization.td", + td_srcs = [ + ":tensorflow_ops_td_files", + "@llvm-project//mlir:StdOpsTdFiles", + ], +) + +cc_library( + name = "tf_data_optimization", + srcs = [ + "transforms/tf_data_optimization.cc", + ], + hdrs = [ + "transforms/tf_data_optimization.h", + ], + deps = [ + ":tensorflow", + ":tensorflow_types", + ":tf_data_optimization_inc_gen", + "@llvm-project//mlir:IR", + ], +) + cc_library( name = "unroll_batch_matmul_pass", srcs = [ @@ -389,10 +441,13 @@ cc_library( "transforms/tensor_array_ops_decomposition.cc", "transforms/tensor_list_ops_decomposition.cc", "transforms/test_side_effect_analysis.cc", + "transforms/tf_data_optimization_pass.cc", "transforms/tf_device_assignment.cc", "transforms/tpu_cluster_formation.cc", "transforms/tpu_dynamic_layout_pass.cc", "transforms/tpu_dynamic_padding_mapper.cc", + "transforms/tpu_extract_head_tail_outside_compilation.cc", + "transforms/tpu_extract_outside_compilation.cc", "transforms/tpu_merge_variables_with_execute.cc", "transforms/tpu_rewrite_pass.cc", "transforms/tpu_sharding_identification_pass.cc", @@ -425,6 +480,7 @@ cc_library( ":tensorflow", ":tensorflow_optimize_inc_gen", ":tensorflow_types", + ":tf_data_optimization", ":tpu_rewrite_device_util", ":translate_utils", ":unroll_batch_matmul_pass", @@ -503,7 +559,7 @@ cc_library( deps = [ ":tensorflow", "@llvm-project//mlir:IR", - "@llvm-project//mlir:LoopOpsTransforms", + "@llvm-project//mlir:SCFTransforms", ], alwayslink = 1, ) @@ -527,6 +583,7 @@ cc_library( ":mangling_util", ":mlir_roundtrip_flags", ":tensorflow", + ":tensorflow_attributes", ":tensorflow_passes", ":tensorflow_types", ":translate_utils", @@ -580,7 +637,6 @@ cc_library( ":error_util", ":parse_text_proto", "//tensorflow/core:lib", - "//tensorflow/core:protos_all_cc", "@com_google_absl//absl/strings", "@llvm-project//llvm:support", ], @@ -599,6 +655,7 @@ cc_library( ":convert_type", ":mangling_util", ":tensorflow", + ":tensorflow_attributes", ":tensorflow_types", "//tensorflow/compiler/xla:status_macros", "//tensorflow/core:core_cpu", @@ -767,7 +824,9 @@ cc_library( deps = [ ":convert_type", ":mangling_util", + ":tensorflow_attributes", ":tensorflow_types", + 
"//tensorflow/compiler/xla:util", "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", @@ -786,10 +845,14 @@ tf_cc_test( srcs = ["utils/convert_tensor_test.cc"], deps = [ ":convert_tensor", + ":tensorflow", "//tensorflow/compiler/xla:test", + "//tensorflow/core:framework", "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", "//tensorflow/core:test", "//tensorflow/core:test_main", + "//tensorflow/core:testlib", "//tensorflow/stream_executor/lib", "@llvm-project//mlir:IR", ], @@ -1014,7 +1077,8 @@ genrule( name = "derived_attr_populator_inc", srcs = [ "@llvm-project//mlir:include/mlir/Interfaces/CallInterfaces.td", - "@llvm-project//mlir:include/mlir/Interfaces/SideEffects.td", + "@llvm-project//mlir:include/mlir/Interfaces/InferTypeOpInterface.td", + "@llvm-project//mlir:include/mlir/Interfaces/SideEffectInterfaces.td", "@llvm-project//mlir:include/mlir/IR/OpBase.td", "ir/tf_generated_ops.td", "ir/tf_op_base.td", @@ -1079,6 +1143,7 @@ COMPILE_MLIR_UTIL_DEPS = [ "//tensorflow/compiler/mlir/xla:type_to_shape", "//tensorflow/compiler/mlir/xla:xla_legalize_tf", "//tensorflow/compiler/mlir/xla:xla_legalize_tf_with_tf2xla", + "//tensorflow/compiler/mlir/xla:xla_sink_constants_to_control_flow", "//tensorflow/compiler/tf2xla:common", "//tensorflow/compiler/tf2xla:xla_compiler", "//tensorflow/core:framework", @@ -1087,6 +1152,7 @@ COMPILE_MLIR_UTIL_DEPS = [ "//tensorflow/stream_executor/lib", "//tensorflow/compiler/xla:xla_data_proto_cc", "//tensorflow/compiler/xla/service:hlo", + ":convert_tensor", ] # Prefer to link 'compile_mlir_util' library that also links necessary @@ -1216,6 +1282,7 @@ cc_library( "//tensorflow/stream_executor/lib", "@com_google_absl//absl/strings", "@llvm-project//llvm:support", + "@llvm-project//mlir:IR", ], ) @@ -1230,6 +1297,7 @@ tf_cc_test( "//tensorflow/core:test_main", "//tensorflow/core/protobuf/tpu:topology_proto_cc", "@llvm-project//llvm:support", + "@llvm-project//mlir:IR", ], ) diff --git a/tensorflow/compiler/mlir/tensorflow/ir/control_flow_ops.h b/tensorflow/compiler/mlir/tensorflow/ir/control_flow_ops.h index 15a4ecfc537..39245425a5a 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/control_flow_ops.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/control_flow_ops.h @@ -26,7 +26,7 @@ limitations under the License. #include "mlir/IR/Dialect.h" // from @llvm-project #include "mlir/IR/OpDefinition.h" // from @llvm-project #include "mlir/IR/Types.h" // from @llvm-project -#include "mlir/Interfaces/SideEffects.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project namespace mlir { namespace TFControlFlow { diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.cc new file mode 100644 index 00000000000..dfad1fce26d --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.cc @@ -0,0 +1,132 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h" + +#include "mlir/IR/Attributes.h" // from @llvm-project + +namespace mlir { +namespace TF { + +namespace detail { + +// The storage class for ShapeAttr. +struct ShapeAttrStorage : public AttributeStorage { + using KeyTy = std::pair, bool>; + + explicit ShapeAttrStorage(ArrayRef shape, bool unranked = false) + : shape(shape), unranked(unranked) {} + + bool operator==(const KeyTy& key) const { + return key == KeyTy(shape, unranked); + } + static unsigned hashKey(const KeyTy& key) { + return llvm::hash_combine(key.first, static_cast(key.second)); + } + + // NOLINTNEXTLINE + static ShapeAttrStorage* construct(mlir::AttributeStorageAllocator& allocator, + const KeyTy& key) { + return new (allocator.allocate()) + ShapeAttrStorage(allocator.copyInto(key.first), key.second); + } + + ArrayRef shape; + bool unranked = false; +}; + +// The storage class for FuncAttr. +struct FuncAttrStorage : public AttributeStorage { + using KeyTy = std::pair; + + explicit FuncAttrStorage(Attribute name, Attribute attrs) + : name(name), attrs(attrs) {} + + bool operator==(const KeyTy& key) const { return key == KeyTy(name, attrs); } + static unsigned hashKey(const KeyTy& key) { + return llvm::hash_combine(key.first, key.second); + } + + static FuncAttrStorage* construct(mlir::AttributeStorageAllocator& allocator, + const KeyTy& key) { + return new (allocator.allocate()) + FuncAttrStorage(key.first, key.second); + } + + Attribute name; + Attribute attrs; +}; + +} // namespace detail + +// Get or create a shape attribute. +ShapeAttr ShapeAttr::get(mlir::MLIRContext* context, + llvm::Optional> shape) { + if (shape) + return Base::get(context, AttrKind::SHAPE, *shape, + /*unranked=*/false); + + return Base::get(context, AttrKind::SHAPE, ArrayRef(), + /*unranked=*/true); +} + +llvm::Optional> ShapeAttr::getValue() const { + if (hasRank()) return getShape(); + return llvm::None; +} + +bool ShapeAttr::hasRank() const { return !getImpl()->unranked; } + +int64_t ShapeAttr::getRank() const { + assert(hasRank()); + return getImpl()->shape.size(); +} + +ArrayRef ShapeAttr::getShape() const { + assert(hasRank()); + return getImpl()->shape; +} + +bool ShapeAttr::hasStaticShape() const { + if (!hasRank()) return false; + + for (auto dim : getShape()) { + if (dim < 0) return false; + } + + return true; +} + +FuncAttr FuncAttr::get(mlir::MLIRContext* context, llvm::StringRef name, + DictionaryAttr attr) { + auto symbol = SymbolRefAttr::get(name, context); + return Base::get(context, AttrKind::FUNC, symbol, attr); +} + +FuncAttr FuncAttr::get(mlir::MLIRContext* context, SymbolRefAttr symbol, + DictionaryAttr attr) { + return Base::get(context, AttrKind::FUNC, symbol, attr); +} + +SymbolRefAttr FuncAttr::GetName() const { + return getImpl()->name.cast(); +} + +DictionaryAttr FuncAttr::GetAttrs() const { + return getImpl()->attrs.cast(); +} + +} // namespace TF +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h new file mode 100644 index 00000000000..ba67d6cb671 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h @@ -0,0 +1,107 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file defines the attributes used in the TensorFlow dialect. + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_ATTRIBUTES_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_ATTRIBUTES_H_ + +#include "llvm/ADT/StringRef.h" +#include "mlir/IR/Attributes.h" // from @llvm-project + +namespace mlir { +namespace TF { + +namespace AttrKind { + +// List of supported custom TensorFlow Attributes kinds, necessary for +// isa/dyn_cast. +enum Kind { + FIRST_USED_TENSORFLOW_ATTR = Attribute::FIRST_TENSORFLOW_ATTR, + SHAPE = FIRST_USED_TENSORFLOW_ATTR, + FUNC, + LAST_USED_TENSORFLOW_ATTR, +}; + +} // namespace AttrKind + +namespace detail { + +struct ShapeAttrStorage; +struct FuncAttrStorage; + +} // namespace detail + +class ShapeAttr : public Attribute::AttrBase { + public: + using Base::Base; + + // Get or create a shape attribute. If shape is llvm::None, then it is + // unranked. Otherwise it is ranked. And for ranked shapes, the value of the + // dimension size must be >= -1. The value of -1 means the dimension is + // dynamic. Otherwise, the dimension is static. + static ShapeAttr get(mlir::MLIRContext* context, + llvm::Optional> shape); + + llvm::Optional> getValue() const; + + bool hasRank() const; + + // If this is ranked, return the rank. Otherwise, abort. + int64_t getRank() const; + + // If this is ranked, return the shape. Otherwise, abort. + ArrayRef getShape() const; + + // If this is unranked type or any dimension has unknown size (<0), it doesn't + // have static shape. If all dimensions have known size (>= 0), it has static + // shape. + bool hasStaticShape() const; + + static bool kindof(unsigned kind) { return kind == AttrKind::SHAPE; } +}; + +// Custom attribute to model AttrValue.value.func (NameAttrList type attribute). +// This attribute holds a SymbolRefAttr, for the NameAttrList.name string and a +// DictionaryAttr for the NameAttrList.attr map. It is +// currently printed and parsed for the following format: +// +// #tf.func<@symbol, {attr = "value"}> +// +// where the first element is the SymbolRefAttr and the second element is the +// DictionaryAttr. +class FuncAttr + : public Attribute::AttrBase { + public: + using Base::Base; + + static FuncAttr get(mlir::MLIRContext* context, llvm::StringRef name, + DictionaryAttr attr); + + static FuncAttr get(mlir::MLIRContext* context, SymbolRefAttr symbol, + DictionaryAttr attr); + + SymbolRefAttr GetName() const; + + DictionaryAttr GetAttrs() const; + + static bool kindof(unsigned kind) { return kind == AttrKind::FUNC; } +}; + +} // namespace TF +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_ATTRIBUTES_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_device.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_device.cc index e8d32121d1b..b8f0585040c 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_device.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_device.cc @@ -40,7 +40,6 @@ limitations under the License. 
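A short usage sketch for the two new attributes defined above, covering ranked vs. unranked `ShapeAttr` and a `FuncAttr` pairing a symbol with an attribute dictionary; the attribute names and values are illustrative:

```cpp
#include <cassert>
#include <cstdint>
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/SmallVector.h"
#include "mlir/IR/Builders.h"     // from @llvm-project
#include "mlir/IR/MLIRContext.h"  // from @llvm-project
#include "tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h"

void BuildTfAttrs(mlir::MLIRContext *context) {
  mlir::Builder builder(context);

  // Ranked shape [2, -1]: hasRank() is true, hasStaticShape() is false
  // because one dimension is dynamic (-1).
  llvm::SmallVector<int64_t, 2> dims = {2, -1};
  auto ranked =
      mlir::TF::ShapeAttr::get(context, llvm::ArrayRef<int64_t>(dims));
  assert(ranked.hasRank() && !ranked.hasStaticShape());

  // Passing llvm::None produces the unranked form; getValue() then also
  // returns llvm::None.
  auto unranked = mlir::TF::ShapeAttr::get(context, llvm::None);
  assert(!unranked.hasRank());

  // FuncAttr pairs a symbol reference with a DictionaryAttr; per the header
  // comment it prints as #tf.func<@my_func, {some_attr = 42 : i64}>.
  mlir::NamedAttribute named =
      builder.getNamedAttr("some_attr", builder.getI64IntegerAttr(42));
  auto dict = mlir::DictionaryAttr::get(named, context);
  auto func_attr = mlir::TF::FuncAttr::get(context, "my_func", dict);
  (void)func_attr;
}
```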
#include "mlir/IR/Value.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project -#include "mlir/Support/STLExtras.h" // from @llvm-project #include "mlir/Transforms/InliningUtils.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/core/platform/logging.h" @@ -90,7 +89,7 @@ struct TFInlinerInterface : public DialectInlinerInterface { // are perfectly forwarded to the block's terminator. bool BlockWrapsSingleOp(Block* block) { auto body = block->without_terminator(); - if (!has_single_element(body)) return false; + if (!hasSingleElement(body)) return false; Operation& wrapped_op = *body.begin(); Operation* terminator = block->getTerminator(); @@ -187,7 +186,7 @@ LogicalResult Verify(ParallelExecuteOp op) { } // namespace // static -void ParallelExecuteOp::build(Builder* builder, OperationState& state, +void ParallelExecuteOp::build(OpBuilder& builder, OperationState& state, int num_regions, llvm::ArrayRef output_types) { DCHECK_GE(num_regions, 2); @@ -463,22 +462,22 @@ void BuildReplicateOp( } // anonymous namespace void ReplicateOp::build( - Builder* builder, OperationState& state, int n, + OpBuilder& builder, OperationState& state, int n, const llvm::SmallDenseMap>& devices, llvm::ArrayRef, Type>> replicated_inputs, llvm::ArrayRef replica_output_types) { - BuildReplicateOp(builder, &state, n, devices, replicated_inputs, + BuildReplicateOp(&builder, &state, n, devices, replicated_inputs, replica_output_types); } void ReplicateOp::build( - Builder* builder, OperationState& state, int n, + OpBuilder& builder, OperationState& state, int n, const llvm::SmallDenseMap>& devices, llvm::ArrayRef> replicated_inputs, Operation::result_type_range replica_output_types) { - BuildReplicateOp(builder, &state, n, devices, replicated_inputs, + BuildReplicateOp(&builder, &state, n, devices, replicated_inputs, replica_output_types); } diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_device_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_device_ops.td index 4673e86921a..d0c15f7e9ec 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_device_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_device_ops.td @@ -48,10 +48,14 @@ class TfDevice_Op traits = []> : Op { } def TfDevice_LaunchOp : TfDevice_Op<"launch", - [SingleBlockImplicitTerminator<"ReturnOp">]> -{ - let summary = [{The `tf_device.launch` op captures all needed live-in values - and launches containing operations on target device.}]; + [SingleBlockImplicitTerminator<"ReturnOp">]> { + let summary = [{ +The `tf_device.launch` op launches containing operations on target device. + }]; + + let description = [{ +This op captures all needed live-in values. + }]; let arguments = (ins StrAttr:$device @@ -70,7 +74,7 @@ def TfDevice_LaunchOp : TfDevice_Op<"launch", }]; let builders = [ - OpBuilder<[{Builder *builder, OperationState &result, + OpBuilder<[{OpBuilder &builder, OperationState &result, StringAttr device, ArrayRef result_types}], [{ result.addAttribute("device", device); @@ -85,8 +89,8 @@ def TfDevice_LaunchOp : TfDevice_Op<"launch", def TfDevice_ReturnOp : TfDevice_Op<"return", [Terminator]> { let summary = [{ - The `tf_device.return` operation terminates and returns values from - `tf_device.launch` operation; +The `tf_device.return` operation terminates and returns values from a +`tf_device` dialect operation. 
}]; let arguments = (ins @@ -94,7 +98,7 @@ def TfDevice_ReturnOp : TfDevice_Op<"return", [Terminator]> { ); let builders = [OpBuilder< - "Builder *builder, OperationState &result", + "OpBuilder &builder, OperationState &result", [{ build(builder, result, {}); }]> @@ -121,7 +125,6 @@ def TfDevice_LaunchFuncOp : TfDevice_Op<"launch_func", []> { let extraClassDeclaration = [{ StringRef getFunc() { return func(); } StringRef getDevice() { return device(); } - FunctionType getFuncType(); }]; } @@ -167,7 +170,7 @@ def TfDevice_ParallelExecuteOp : TfDevice_Op<"parallel_execute", }]; let builders = [ - OpBuilder<"Builder* builder, OperationState& state, int num_regions," + OpBuilder<"OpBuilder& builder, OperationState& state, int num_regions," "llvm::ArrayRef output_types">, ]; @@ -266,11 +269,11 @@ For example: }]; let builders = [ - OpBuilder<"Builder* builder, OperationState& state, int n, " + OpBuilder<"OpBuilder& builder, OperationState& state, int n, " "const llvm::SmallDenseMap>& devices, " "llvm::ArrayRef, Type>> replicated_inputs, " "llvm::ArrayRef replica_output_types">, - OpBuilder<"Builder* builder, OperationState& state, int n, " + OpBuilder<"OpBuilder& builder, OperationState& state, int n, " "const llvm::SmallDenseMap>& devices, " "llvm::ArrayRef> replicated_inputs, " "Operation::result_type_range replica_output_types"> @@ -281,4 +284,51 @@ For example: let verifier = [{ return Verify(*this); }]; } +def TfDevice_ClusterOp : TfDevice_Op<"cluster", + [SingleBlockImplicitTerminator<"ReturnOp">]> { + let summary = [{ +The `tf_device.cluster` op wraps containing operations in a region. + }]; + + let description = [{ +This op can be used to group operations, and captures all needed live-in values. + }]; + + let arguments = (ins); + + let results = (outs + Variadic:$results + ); + + let regions = (region SizedRegion<1>:$body); + + let extraClassDeclaration = [{ + Block &GetBody() { return getOperation()->getRegion(0).front(); } + }]; +} + +def TfDevice_ClusterFuncOp : TfDevice_Op<"cluster_func", []> { + let summary = [{ +The `tf_device.cluster_func` launches a function containing the body of a +cluster. + }]; + + let description = [{ +This op is used for outlining a cluster. + }]; + + let arguments = (ins + FlatSymbolRefAttr:$func, + Variadic:$operands + ); + + let results = (outs + Variadic:$results + ); + + let extraClassDeclaration = [{ + StringRef getFunc() { return func(); } + }]; +} + #endif // TF_DEVICE_DIALECT diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc index 0ca4364f9cd..d5ecbf3e292 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc @@ -41,7 +41,6 @@ limitations under the License. #include "mlir/IR/Types.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project -#include "mlir/Support/STLExtras.h" // from @llvm-project #include "mlir/Transforms/FoldUtils.h" // from @llvm-project #include "mlir/Transforms/InliningUtils.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" @@ -318,7 +317,7 @@ YieldOp IslandOp::GetYield() { return llvm::cast(GetBody().back()); } // operation results are perfectly forwarded to the islands yield. 
bool IslandOp::WrapsSingleOp() { auto body = GetBody().without_terminator(); - if (!has_single_element(body)) return false; + if (!hasSingleElement(body)) return false; Operation &wrapped_op = *body.begin(); YieldOp yield = GetYield(); @@ -475,7 +474,7 @@ namespace { ParseResult ParseSwitchOp(OpAsmParser &parser, OperationState &result) { SmallVector op_infos; SmallVector types; - if (parser.parseOperandList(op_infos, 2) || parser.parseColonTypeList(types)) + if (parser.parseOperandList(op_infos) || parser.parseColonTypeList(types)) return failure(); if (types.size() != 1) return parser.emitError(parser.getNameLoc()) @@ -487,12 +486,15 @@ ParseResult ParseSwitchOp(OpAsmParser &parser, OperationState &result) { // type). if (types.front().isa()) { FunctionType type = types.front().cast(); - if (type.getNumInputs() != 2) + if (type.getNumInputs() < 2) return parser.emitError(parser.getNameLoc()) << " expects a single data type and a predicate"; result.types.assign(type.getResults().begin(), type.getResults().end()); types.assign(type.getInputs().begin(), type.getInputs().end()); } else { + if (op_infos.size() < 2) + return parser.emitError(parser.getNameLoc()) + << " expects a single data type and a predicate"; Type control_type = ControlType::get(parser.getBuilder().getContext()); result.types.append(2, types[0]); result.types.push_back(control_type); diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_executor_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_executor_ops.td index 3c47ef1117d..0efe578f151 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_executor_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_executor_ops.td @@ -20,6 +20,7 @@ limitations under the License. #define TF_EXECUTOR_DIALECT include "mlir/IR/OpBase.td" +include "tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td" //===----------------------------------------------------------------------===// // TensorFlow dialect definitions @@ -141,7 +142,7 @@ def TfExecutor_FetchOp : TfExecutor_Op<"fetch", ); let builders = [OpBuilder< - "Builder *builder, OperationState &result", + "OpBuilder &builder, OperationState &result", [{ build(builder, result, {}); }]> @@ -222,7 +223,7 @@ def TfExecutor_YieldOp : TfExecutor_Op<"yield", ); let builders = [OpBuilder< - "Builder *builder, OperationState &result", + "OpBuilder &builder, OperationState &result", [{ build(builder, result, {}); }]> @@ -234,9 +235,9 @@ def TfExecutor_YieldOp : TfExecutor_Op<"yield", def TfExecutor_SwitchOp : TfExecutor_Op<"Switch", [ControlOperandsAfterAllData, HasParent<"GraphOp">, PredOpTrait<"data operand must be broadcastable to true result", - TCOpIsBroadcastableToRes<0, 0>>, + TF_OpIsBroadcastableToRes<0, 0>>, PredOpTrait<"data operand must be broadcastable to false result", - TCOpIsBroadcastableToRes<0, 1>>]>{ + TF_OpIsBroadcastableToRes<0, 1>>]>{ let summary = [{ The "tf_executor.Switch" operation takes a data operand and a boolean predicate condition, and returns two values matching the type of the data @@ -356,7 +357,7 @@ def TfExecutor_MergeOp : TfExecutor_Op<"Merge", def TfExecutor_EnterOp : TfExecutor_Op<"Enter", [ControlOperandsAfterAllData, HasParent<"GraphOp">, PredOpTrait<"data operand must be broadcastable to result", - TCOpIsBroadcastableToRes<0, 0>>]>{ + TF_OpIsBroadcastableToRes<0, 0>>]>{ let summary = [{ The "tf_executor.Enter" operation forwards its input to Tensorflow while loop. 
@@ -449,11 +450,11 @@ def TfExecutor_NextIterationSourceOp : TfExecutor_Op<"NextIteration.Source", ); let builders = [OpBuilder< - "Builder *builder, OperationState &result, Type result_type, " + "OpBuilder &builder, OperationState &result, Type result_type, " "ArrayRef attributes = {}", [{ - Type token_type = TokenType::get(builder->getContext()); - Type control_type = ControlType::get(builder->getContext()); + Type token_type = TokenType::get(builder.getContext()); + Type control_type = ControlType::get(builder.getContext()); result.types = { result_type, token_type, control_type }; result.attributes.append(attributes.begin(), attributes.end()); }]> @@ -515,7 +516,7 @@ def TfExecutor_NextIterationSinkOp : TfExecutor_Op<"NextIteration.Sink", ); let builders = [OpBuilder< - "Builder *builder, OperationState &result, Value token, " + "OpBuilder &builder, OperationState &result, Value token, " "ArrayRef operands, ArrayRef attributes = {}", [{ assert(operands.size() >= 1 && "tf_executor.NextIteration.Sink builder " @@ -531,7 +532,7 @@ def TfExecutor_NextIterationSinkOp : TfExecutor_Op<"NextIteration.Sink", def TfExecutor_ExitOp : TfExecutor_Op<"Exit", [HasParent<"GraphOp">, PredOpTrait<"data operand must be broadcastable to result", - TCOpIsBroadcastableToRes<0, 0>>]>{ + TF_OpIsBroadcastableToRes<0, 0>>]>{ let summary = [{ The "tf_executor.Exit" operation forwards a value from an while loop to its @@ -594,14 +595,14 @@ def TfExecutor_ControlTriggerOp : TfExecutor_Op<"ControlTrigger", let hasCanonicalizer = 1; let builders = [OpBuilder< - "Builder *builder, OperationState &result, " + "OpBuilder &builder, OperationState &result, " "ArrayRef operands, ArrayRef attributes = {}", [{ assert(operands.size() >= 1 && "tf_executor.ControlTrigger builder " "expects at least one operand"); result.operands.insert(result.operands.end(), operands.begin(), operands.end()); - Type control_type = ControlType::get(builder->getContext()); + Type control_type = ControlType::get(builder.getContext()); result.types = {control_type}; result.attributes.append(attributes.begin(), attributes.end()); }]> diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td index 092d2d57cdf..fd24b7284c1 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td @@ -160,6 +160,8 @@ def TF_AddV2Op : TF_Op<"AddV2", [Commutative, NoSideEffect, ResultsBroadcastable TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; let hasCanonicalizer = 1; + + let hasFolder = 1; } def TF_AllOp : TF_Op<"All", [NoSideEffect]> { @@ -190,6 +192,44 @@ retained with length 1. let verifier = [{ return Verify(*this); }]; } +def TF_AllToAllOp : TF_Op<"AllToAll", [NoSideEffect]> { + let summary = "An Op to exchange data across TPU replicas."; + + let description = [{ +On each replica, the input is split into `split_count` blocks along +`split_dimension` and send to the other replicas given group_assignment. After +receiving `split_count` - 1 blocks from other replicas, we concatenate the +blocks along `concat_dimension` as the output. 
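The recurring change through these TableGen files is that custom ODS builders now receive an `OpBuilder &` instead of a `Builder *`, so their bodies call `builder.getContext()` directly. A schematic of what such a hook looks like after the change; the op name, attribute, and result arrangement are invented for illustration:

```cpp
#include "mlir/IR/Builders.h"          // from @llvm-project
#include "mlir/IR/OperationSupport.h"  // from @llvm-project

// Sketch of a custom build() hook in the new style. Previously these were
// declared as "Builder *builder, OperationState &result" and went through
// builder->getContext().
static void buildExampleOp(mlir::OpBuilder &builder,
                           mlir::OperationState &result,
                           mlir::Type result_type) {
  // The full OpBuilder API (insertion point, getContext(), attribute
  // helpers, ...) is now available directly on `builder`.
  result.addAttribute("example_attr", builder.getI64IntegerAttr(1));
  result.addTypes(result_type);
}
```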
+ +For example, suppose there are 2 TPU replicas: +replica 0 receives input: `[[A, B]]` +replica 1 receives input: `[[C, D]]` + +group_assignment=`[[0, 1]]` +concat_dimension=0 +split_dimension=1 +split_count=2 + +replica 0's output: `[[A], [C]]` +replica 1's output: `[[B], [D]]` + }]; + + let arguments = (ins + TensorOf<[BF16, F16, F32, F64, I1, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$input, + I32Tensor:$group_assignment, + + I64Attr:$concat_dimension, + I64Attr:$split_dimension, + I64Attr:$split_count + ); + + let results = (outs + TensorOf<[BF16, F16, F32, F64, I1, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + def TF_AngleOp : TF_Op<"Angle", [NoSideEffect, SameOperandsAndResultShape]> { let summary = "Returns the argument of a complex number."; @@ -253,6 +293,26 @@ retained with length 1. let verifier = [{ return Verify(*this); }]; } +def TF_ApproximateEqualOp : TF_Op<"ApproximateEqual", [Commutative, NoSideEffect]> { + let summary = "Returns the truth value of abs(x-y) < tolerance element-wise."; + + let description = [{ + }]; + + let arguments = (ins + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$x, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$y, + + DefaultValuedAttr:$tolerance + ); + + let results = (outs + I1Tensor:$z + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + def TF_ArgMaxOp : TF_Op<"ArgMax", [NoSideEffect]> { let summary = [{ Returns the index with the largest value across dimensions of a tensor. @@ -273,7 +333,7 @@ Usage: }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$input, + TensorOf<[BF16, F16, F32, F64, I1, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$input, TF_I32OrI64Tensor:$dimension ); @@ -306,7 +366,7 @@ Usage: }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$input, + TensorOf<[BF16, F16, F32, F64, I1, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$input, TF_I32OrI64Tensor:$dimension ); @@ -596,6 +656,29 @@ window in `value`. 
TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_AvgPoolGradOp : TF_Op<"AvgPoolGrad", [NoSideEffect]> { + let summary = "Computes gradients of the average pooling function."; + + let description = [{ + }]; + + let arguments = (ins + I32Tensor:$orig_input_shape, + TF_FpTensor:$grad, + + Confined]>:$ksize, + Confined]>:$strides, + TF_AnyStrAttrOf<["SAME", "VALID"]>:$padding, + DefaultValuedAttr:$data_format + ); + + let results = (outs + TF_FpTensor:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<1>; +} + def TF_BatchMatMulOp : TF_Op<"BatchMatMul", [NoSideEffect]> { let summary = "Multiplies slices of two tensors in batches."; @@ -1020,6 +1103,26 @@ for dtype in dtype_list: TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_BroadcastArgsOp : TF_Op<"BroadcastArgs", [NoSideEffect]> { + let summary = "Return the shape of s0 op s1 with broadcast."; + + let description = [{ +Given `s0` and `s1`, tensors that represent shapes, compute `r0`, the +broadcasted shape. `s0`, `s1` and `r0` are all integer vectors. + }]; + + let arguments = (ins + TF_I32OrI64Tensor:$s0, + TF_I32OrI64Tensor:$s1 + ); + + let results = (outs + TF_I32OrI64Tensor:$r0 + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + def TF_BroadcastGradientArgsOp : TF_Op<"BroadcastGradientArgs", [NoSideEffect]> { let summary = [{ Return the reduction indices for computing gradients of s0 op s1 with broadcast. @@ -1064,6 +1167,15 @@ tf.Tensor( In the above example, the input Tensor with the shape of `[1, 3]` is broadcasted to output Tensor with shape of `[3, 3]`. + +When doing broadcasted operations such as multiplying a tensor +by a scalar, broadcasting (usually) confers some time or space +benefit, as the broadcasted tensor is never materialized. + +However, `broadcast_to` does not carry with it any such benefits. +The newly-created tensor takes the full memory of the broadcasted +shape. (In a graph context, `broadcast_to` might be fused to +subsequent operation and then be optimized away, however.) }]; let arguments = (ins @@ -1143,7 +1255,7 @@ that are not a number (NaN) or infinity (Inf). Otherwise, passes `tensor` as-is. TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } -def TF_ClipByValueOp : TF_Op<"ClipByValue", [NoSideEffect, SameOperandsAndResultType]> { +def TF_ClipByValueOp : TF_Op<"ClipByValue", [NoSideEffect]> { let summary = "Clips tensor values to a specified min and max."; let description = [{ @@ -1334,6 +1446,30 @@ tf.conj(input) ==> [-2.25 - 4.75j, 3.25 - 5.75j] let hasCanonicalizer = 1; } +def TF_ConjugateTransposeOp : TF_Op<"ConjugateTranspose", [NoSideEffect]> { + let summary = [{ +Shuffle dimensions of x according to a permutation and conjugate the result. + }]; + + let description = [{ +The output `y` has the same rank as `x`. The shapes of `x` and `y` satisfy: + `y.shape[i] == x.shape[perm[i]] for i in [0, 1, ..., rank(x) - 1]` + `y[i,j,k,...,s,t,u] == conj(x[perm[i], perm[j], perm[k],...,perm[s], perm[t], perm[u]])` + }]; + + let arguments = (ins + TF_Tensor:$x, + TF_I32OrI64Tensor:$perm + ); + + let results = (outs + TF_Tensor:$y + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + TF_DerivedOperandTypeAttr Tperm = TF_DerivedOperandTypeAttr<1>; +} + def TF_Conv2DOp : TF_Op<"Conv2D", [NoSideEffect, TF_LayoutSensitiveInterface]> { let summary = [{ Computes a 2-D convolution given 4-D `input` and `filter` tensors. 
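The `ConjugateTranspose` definition above composes a permutation with element-wise conjugation; a small sketch of the expected behaviour through the raw op, assuming TF 2.x eager execution (the input values are arbitrary):

```python
import tensorflow as tf

x = tf.constant([[1 + 1j, 2 + 2j],
                 [3 + 3j, 4 + 4j]])

# Swap the two axes and conjugate every element in a single op.
y = tf.raw_ops.ConjugateTranspose(x=x, perm=[1, 0])
# y == [[1 - 1j, 3 - 3j],
#       [2 - 2j, 4 - 4j]]
print(y)
```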
@@ -1608,7 +1744,28 @@ Given an input tensor, this function computes hyperbolic cosine of every TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } -def TF_CrossReplicaSumOp : TF_Op<"CrossReplicaSum", [AllTypesMatch<["input", "output"]>, NoSideEffect]> { +def TF_CrossOp : TF_Op<"Cross", [NoSideEffect]> { + let summary = "Compute the pairwise cross product."; + + let description = [{ +`a` and `b` must be the same shape; they can either be simple 3-element vectors, +or any shape where the innermost dimension is 3. In the latter case, each pair +of corresponding 3-element vectors is cross-multiplied independently. + }]; + + let arguments = (ins + TF_IntOrFpTensor:$a, + TF_IntOrFpTensor:$b + ); + + let results = (outs + TF_IntOrFpTensor:$product + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + +def TF_CrossReplicaSumOp : TF_Op<"CrossReplicaSum", [NoSideEffect, TF_AllTypesMatch<["input", "output"]>]> { let summary = "An Op to sum inputs across replicated TPU instances."; let description = [{ @@ -1632,7 +1789,7 @@ and `B, D, F, H` as group 1. Thus we get the outputs: TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } -def TF_CumsumOp : TF_Op<"Cumsum", [AllTypesMatch<["x", "out"]>, NoSideEffect]> { +def TF_CumsumOp : TF_Op<"Cumsum", [NoSideEffect, TF_AllTypesMatch<["x", "out"]>]> { let summary = "Compute the cumulative sum of the tensor `x` along `axis`."; let description = [{ @@ -1682,6 +1839,169 @@ tf.cumsum([a, b, c], exclusive=True, reverse=True) # => [b + c, c, 0] TF_DerivedOperandTypeAttr Tidx = TF_DerivedOperandTypeAttr<1>; } +def TF_DataFormatDimMapOp : TF_Op<"DataFormatDimMap", [NoSideEffect, SameOperandsAndResultType]> { + let summary = [{ +Returns the dimension index in the destination data format given the one in + }]; + + let description = [{ +the source data format. + }]; + + let arguments = (ins + TF_I32OrI64Tensor:$x, + + DefaultValuedAttr:$src_format, + DefaultValuedAttr:$dst_format + ); + + let results = (outs + TF_I32OrI64Tensor:$y + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + +def TF_DecodeAndCropJpegOp : TF_Op<"DecodeAndCropJpeg", [NoSideEffect]> { + let summary = "Decode and Crop a JPEG-encoded image to a uint8 tensor."; + + let description = [{ +The attr `channels` indicates the desired number of color channels for the +decoded image. + +Accepted values are: + +* 0: Use the number of channels in the JPEG-encoded image. +* 1: output a grayscale image. +* 3: output an RGB image. + +If needed, the JPEG-encoded image is transformed to match the requested number +of color channels. + +The attr `ratio` allows downscaling the image by an integer factor during +decoding. Allowed values are: 1, 2, 4, and 8. This is much faster than +downscaling the image later. + + +It is equivalent to a combination of decode and crop, but much faster by only +decoding partial jpeg image. + }]; + + let arguments = (ins + TF_StrTensor:$contents, + I32Tensor:$crop_window, + + DefaultValuedAttr:$channels, + DefaultValuedAttr:$ratio, + DefaultValuedAttr:$fancy_upscaling, + DefaultValuedAttr:$try_recover_truncated, + DefaultValuedAttr:$acceptable_fraction, + StrAttr:$dct_method + ); + + let results = (outs + TF_Uint8Tensor:$image + ); +} + +def TF_DecodeGifOp : TF_Op<"DecodeGif", [NoSideEffect]> { + let summary = "Decode the frame(s) of a GIF-encoded image to a uint8 tensor."; + + let description = [{ +GIF images with frame or transparency compression are not supported. 
+On Linux and MacOS systems, convert animated GIFs from compressed to +uncompressed by running: + + convert $src.gif -coalesce $dst.gif + +This op also supports decoding JPEGs and PNGs, though it is cleaner to use +`tf.io.decode_image`. + }]; + + let arguments = (ins + TF_StrTensor:$contents + ); + + let results = (outs + TF_Uint8Tensor:$image + ); +} + +def TF_DecodeJpegOp : TF_Op<"DecodeJpeg", [NoSideEffect]> { + let summary = "Decode a JPEG-encoded image to a uint8 tensor."; + + let description = [{ +The attr `channels` indicates the desired number of color channels for the +decoded image. + +Accepted values are: + +* 0: Use the number of channels in the JPEG-encoded image. +* 1: output a grayscale image. +* 3: output an RGB image. + +If needed, the JPEG-encoded image is transformed to match the requested number +of color channels. + +The attr `ratio` allows downscaling the image by an integer factor during +decoding. Allowed values are: 1, 2, 4, and 8. This is much faster than +downscaling the image later. + + +This op also supports decoding PNGs and non-animated GIFs since the interface is +the same, though it is cleaner to use `tf.io.decode_image`. + }]; + + let arguments = (ins + TF_StrTensor:$contents, + + DefaultValuedAttr:$channels, + DefaultValuedAttr:$ratio, + DefaultValuedAttr:$fancy_upscaling, + DefaultValuedAttr:$try_recover_truncated, + DefaultValuedAttr:$acceptable_fraction, + StrAttr:$dct_method + ); + + let results = (outs + TF_Uint8Tensor:$image + ); +} + +def TF_DecodePngOp : TF_Op<"DecodePng", [NoSideEffect]> { + let summary = "Decode a PNG-encoded image to a uint8 or uint16 tensor."; + + let description = [{ +The attr `channels` indicates the desired number of color channels for the +decoded image. + +Accepted values are: + +* 0: Use the number of channels in the PNG-encoded image. +* 1: output a grayscale image. +* 3: output an RGB image. +* 4: output an RGBA image. + +If needed, the PNG-encoded image is transformed to match the requested number +of color channels. + +This op also supports decoding JPEGs and non-animated GIFs since the interface +is the same, though it is cleaner to use `tf.io.decode_image`. + }]; + + let arguments = (ins + TF_StrTensor:$contents, + + DefaultValuedAttr:$channels + ); + + let results = (outs + TensorOf<[TF_Uint16, TF_Uint8]>:$image + ); + + TF_DerivedResultTypeAttr dtype = TF_DerivedResultTypeAttr<0>; +} + def TF_DepthToSpaceOp : TF_Op<"DepthToSpace", [NoSideEffect]> { let summary = "DepthToSpace for tensors of type T."; @@ -1911,6 +2231,8 @@ def TF_DivOp : TF_Op<"Div", [NoSideEffect, ResultsBroadcastableShape]>, TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; let hasCanonicalizer = 1; + + let hasFolder = 1; } def TF_DivNoNanOp : TF_Op<"DivNoNan", [NoSideEffect, ResultsBroadcastableShape]>, @@ -2143,6 +2465,51 @@ See [Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs) TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_EluGradOp : TF_Op<"EluGrad", [NoSideEffect, SameOperandsAndResultType]> { + let summary = [{ +Computes gradients for the exponential linear (Elu) operation. + }]; + + let description = [{ + }]; + + let arguments = (ins + TF_FpTensor:$gradients, + TF_FpTensor:$outputs + ); + + let results = (outs + TF_FpTensor:$backprops + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + +def TF_EmptyOp : TF_Op<"Empty", []> { + let summary = [{ +Creates a tensor with the given shape. + +This operation creates a tensor of `shape` and `dtype`. 
+ }]; + + let description = [{ + }]; + + let arguments = (ins + I32Tensor:$shape, + + DefaultValuedAttr:$init + ); + + let results = (outs + TF_Tensor:$output + ); + + TF_DerivedResultTypeAttr dtype = TF_DerivedResultTypeAttr<0>; + + let hasFolder = 1; +} + def TF_EqualOp : TF_Op<"Equal", [Commutative, NoSideEffect]> { let summary = "Returns the truth value of (x == y) element-wise."; @@ -2162,8 +2529,8 @@ tf.math.equal(x, y) ==> array([True, True]) }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I1, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Str, TF_Uint8]>:$x, - TensorOf<[BF16, F16, F32, F64, I1, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Str, TF_Uint8]>:$y, + TensorOf<[BF16, F16, F32, F64, I1, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Str, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$x, + TensorOf<[BF16, F16, F32, F64, I1, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Str, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$y, DefaultValuedAttr:$incompatible_shape_error ); @@ -2175,7 +2542,7 @@ tf.math.equal(x, y) ==> array([True, True]) TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; let builders = [ - OpBuilder<"Builder* builder, OperationState& result, Value x, " + OpBuilder<"OpBuilder& builder, OperationState& result, Value x, " "Value y, BoolAttr incompatible_shape_error"> ]; @@ -2331,7 +2698,7 @@ size 1. TF_DerivedOperandTypeAttr Tdim = TF_DerivedOperandTypeAttr<1>; let builders = [ - OpBuilder<"Builder* builder, OperationState& result, Value condition, " + OpBuilder<"OpBuilder& builder, OperationState& result, Value condition, " "Value dim"> ]; } @@ -2539,6 +2906,12 @@ fill([2, 3], 9) ==> [[9, 9, 9] let verifier = [{ return Verify(*this); }]; + + let hasFolder = 1; + + let builders = [OpBuilder< + "OpBuilder &builder, OperationState &result, Value dims, Value value" + >]; } def TF_FloorOp : TF_Op<"Floor", [NoSideEffect, SameOperandsAndResultType]> { @@ -2621,6 +2994,7 @@ The size of 1D Tensors matches the dimension C of the 4D Tensors. F32Tensor:$variance, DefaultValuedAttr:$epsilon, + DefaultValuedAttr:$exponential_avg_factor, DefaultValuedAttr:$data_format, DefaultValuedAttr:$is_training ); @@ -2760,6 +3134,7 @@ The size of 1D Tensors matches the dimension C of the 4D Tensors. F32Tensor:$variance, DefaultValuedAttr:$epsilon, + DefaultValuedAttr:$exponential_avg_factor, DefaultValuedAttr:$data_format, DefaultValuedAttr:$is_training ); @@ -2966,8 +3341,8 @@ Gather slices from `params` axis `axis` according to `indices`. let description = [{ `indices` must be an integer tensor of any dimension (usually 0-D or 1-D). -Produces an output tensor with shape `params.shape[:axis] + indices.shape + -params.shape[axis + 1:]` where: +Produces an output tensor with shape `params.shape[:axis] + +indices.shape[batch_dims:] + params.shape[axis + 1:]` where: ```python # Scalar indices (output is rank(params) - 1). 
@@ -3252,22 +3627,6 @@ tf.imag(input) ==> [4.75, 5.75] TF_DerivedResultTypeAttr Tout = TF_DerivedResultTypeAttr<0>; } -def TF_InfeedDequeueTupleOp : TF_Op<"InfeedDequeueTuple", []> { - let summary = "Fetches multiple values from infeed as an XLA tuple."; - - let description = [{ - }]; - - let arguments = (ins); - - let results = (outs - Variadic:$outputs - ); - - TF_DerivedResultShapeListAttr shapes = TF_DerivedResultShapeListAttr<0>; - TF_DerivedResultTypeListAttr dtypes = TF_DerivedResultTypeListAttr<0>; -} - def TF_InvOp : TF_Op<"Inv", [NoSideEffect, SameOperandsAndResultType]> { let summary = "Computes the reciprocal of x element-wise."; @@ -3555,6 +3914,28 @@ def TF_LeakyReluOp : TF_Op<"LeakyRelu", [NoSideEffect, SameOperandsAndResultType let hasFolder = 1; } +def TF_LeakyReluGradOp : TF_Op<"LeakyReluGrad", [NoSideEffect, SameOperandsAndResultType]> { + let summary = [{ +Computes rectified linear gradients for a LeakyRelu operation. + }]; + + let description = [{ + }]; + + let arguments = (ins + TF_FpTensor:$gradients, + TF_FpTensor:$features, + + DefaultValuedAttr:$alpha + ); + + let results = (outs + TF_FpTensor:$backprops + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + def TF_LeftShiftOp : TF_Op<"LeftShift", [NoSideEffect, ResultsBroadcastableShape]>, WithBroadcastableBinOpBuilder { let summary = "Elementwise computes the bitwise left-shift of `x` and `y`."; @@ -3946,10 +4327,9 @@ cublas. TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } -def TF_MatrixBandPartOp : TF_Op<"MatrixBandPart", [AllTypesMatch<["input", "band"]>, NoSideEffect]> { +def TF_MatrixBandPartOp : TF_Op<"MatrixBandPart", [NoSideEffect, TF_AllTypesMatch<["input", "band"]>]> { let summary = [{ -Copy a tensor setting everything outside a central band in each innermost matrix -to zero. +Copy a tensor setting everything outside a central band in each innermost matrix to zero. }]; let description = [{ @@ -4584,7 +4964,7 @@ retained with length 1. 
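The reworded `GatherV2` shape formula above (now accounting for `batch_dims`) can be checked from Python; a hedged sketch with arbitrarily chosen shapes:

```python
import tensorflow as tf

params = tf.zeros([4, 5, 6])                 # [batch, rows, cols]
indices = tf.zeros([4, 3], dtype=tf.int32)   # one index vector per batch element

# Output shape = params.shape[:axis] + indices.shape[batch_dims:]
#              + params.shape[axis + 1:] = [4] + [3] + [6]
out = tf.gather(params, indices, axis=1, batch_dims=1)
print(out.shape)  # (4, 3, 6)
```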
TF_DerivedOperandTypeAttr Tidx = TF_DerivedOperandTypeAttr<1>; let builders = [OpBuilder< - "Builder *builder, OperationState &result, Value input, " + "OpBuilder &builder, OperationState &result, Value input, " "Value reduction_indices, BoolAttr keep_dims" >]; } @@ -4703,12 +5083,12 @@ def TF_MaximumOp : TF_Op<"Maximum", [NoSideEffect, ResultsBroadcastableShape]>, }]; let arguments = (ins - TF_FpOrI32OrI64Tensor:$x, - TF_FpOrI32OrI64Tensor:$y + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, TF_Uint8]>:$x, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, TF_Uint8]>:$y ); let results = (outs - TF_FpOrI32OrI64Tensor:$z + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, TF_Uint8]>:$z ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -4751,12 +5131,12 @@ def TF_MinimumOp : TF_Op<"Minimum", [NoSideEffect, ResultsBroadcastableShape]>, }]; let arguments = (ins - TF_FpOrI32OrI64Tensor:$x, - TF_FpOrI32OrI64Tensor:$y + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, TF_Uint8]>:$x, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, TF_Uint8]>:$y ); let results = (outs - TF_FpOrI32OrI64Tensor:$z + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, TF_Uint8]>:$z ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -4854,7 +5234,7 @@ func @main(%arg0 : tensor<10xf32>, %arg1 : tensor<10xf32>) -> tensor<10x10xf32> @tf.function def foo(x, y): - return = mlir_passthrough_op([x, y], mlir_module, Toutputs=[tf.float32]) + return mlir_passthrough_op([x, y], mlir_module, Toutputs=[tf.float32]) graph_def = foo.get_concrete_function(tf.TensorSpec([10], tf.float32), tf.TensorSpec([10], tf.float32)).graph.as_graph_def() ``` @@ -4919,6 +5299,8 @@ def TF_MulOp : TF_Op<"Mul", [Commutative, NoSideEffect, ResultsBroadcastableShap ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + + let hasFolder = 1; } def TF_MulNoNanOp : TF_Op<"MulNoNan", [NoSideEffect, ResultsBroadcastableShape]>, @@ -4933,12 +5315,12 @@ Returns x * y element-wise. 
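The widened operand lists for `Maximum` and `Minimum` above mirror the TensorFlow op registry, which also admits `uint8` and `int16`; a quick sketch, assuming a build whose CPU kernels register those types:

```python
import tensorflow as tf

x = tf.constant([1, 200, 50], dtype=tf.uint8)
y = tf.constant([3, 100, 50], dtype=tf.uint8)

# Element-wise max/min on unsigned 8-bit tensors.
print(tf.maximum(x, y))  # [  3 200  50]
print(tf.minimum(x, y))  # [  1 100  50]
```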
Returns zero if y is zero, even if x if infinite or }]; let arguments = (ins - TensorOf<[F16, F32, F64, TF_Complex128, TF_Complex64]>:$x, - TensorOf<[F16, F32, F64, TF_Complex128, TF_Complex64]>:$y + TF_FpOrComplexTensor:$x, + TF_FpOrComplexTensor:$y ); let results = (outs - TensorOf<[F16, F32, F64, TF_Complex128, TF_Complex64]>:$z + TF_FpOrComplexTensor:$z ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -5117,8 +5499,8 @@ def TF_NotEqualOp : TF_Op<"NotEqual", [Commutative, NoSideEffect]> { }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I1, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Str, TF_Uint8]>:$x, - TensorOf<[BF16, F16, F32, F64, I1, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Str, TF_Uint8]>:$y, + TensorOf<[BF16, F16, F32, F64, I1, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Str, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$x, + TensorOf<[BF16, F16, F32, F64, I1, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Str, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$y, DefaultValuedAttr:$incompatible_shape_error ); @@ -5130,7 +5512,7 @@ def TF_NotEqualOp : TF_Op<"NotEqual", [Commutative, NoSideEffect]> { TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; let builders = [ - OpBuilder<"Builder* builder, OperationState& result, Value x, " + OpBuilder<"OpBuilder& builder, OperationState& result, Value x, " "Value y, BoolAttr incompatible_shape_error"> ]; @@ -5249,7 +5631,7 @@ output = TF_DerivedOperandTypeAttr TI = TF_DerivedOperandTypeAttr<0>; let builders = [ - OpBuilder<"Builder* builder, OperationState& result, Value indices, " + OpBuilder<"OpBuilder& builder, OperationState& result, Value indices, " "Value depth, Value on_value, Value off_value, " "IntegerAttr axis"> ]; @@ -5512,6 +5894,40 @@ retained with length 1. TF_DerivedOperandTypeAttr Tidx = TF_DerivedOperandTypeAttr<1>; } +def TF_QrOp : TF_Op<"Qr", [NoSideEffect]> { + let summary = "Computes the QR decompositions of one or more matrices."; + + let description = [{ +Computes the QR decomposition of each inner matrix in `tensor` such that +`tensor[..., :, :] = q[..., :, :] * r[..., :,:])` + +```python +# a is a tensor. +# q is a tensor of orthonormal matrices. +# r is a tensor of upper triangular matrices. +q, r = qr(a) +q_full, r_full = qr(a, full_matrices=True) +``` + }]; + + let arguments = (ins + TensorOf<[F16, F32, F64, TF_Complex128, TF_Complex64]>:$input, + + DefaultValuedAttr:$full_matrices + ); + + let results = (outs + TensorOf<[F16, F32, F64, TF_Complex128, TF_Complex64]>:$q, + TensorOf<[F16, F32, F64, TF_Complex128, TF_Complex64]>:$r + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + + let verifier = [{ + return Verify(*this); + }]; +} + def TF_QuantizeAndDequantizeOp : TF_Op<"QuantizeAndDequantize", [NoSideEffect, SameOperandsAndResultType]> { let summary = "Use QuantizeAndDequantizeV2 instead."; @@ -5797,7 +6213,7 @@ tf.range(start, limit, delta) ==> [3, 6, 9, 12, 15] TF_DerivedOperandTypeAttr Tidx = TF_DerivedOperandTypeAttr<0>; let builders = [ - OpBuilder<"Builder* builder, OperationState& result, Value start, " + OpBuilder<"OpBuilder& builder, OperationState& result, Value start, " "Value limit, Value delta"> ]; } @@ -5832,8 +6248,10 @@ of the tensor. Rank is also known as "order", "degree", or "ndims." 
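For reference, the shape contract of the `Qr` op registered above, exercised through `tf.linalg.qr`; a minimal sketch, with the batch and matrix shapes picked arbitrarily:

```python
import tensorflow as tf

a = tf.random.normal([2, 5, 3])              # batch of 5x3 matrices

q, r = tf.linalg.qr(a)                       # reduced: q is [2, 5, 3], r is [2, 3, 3]
q_full, r_full = tf.linalg.qr(a, full_matrices=True)  # q is [2, 5, 5], r is [2, 5, 3]

print(q.shape, r.shape, q_full.shape, r_full.shape)
```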
TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; let builders = [ - OpBuilder<"Builder* builder, OperationState& result, Value input"> + OpBuilder<"OpBuilder& builder, OperationState& result, Value input"> ]; + + let hasFolder = 1; } def TF_ReadVariableOp : TF_Op<"ReadVariableOp", []> { @@ -5913,6 +6331,8 @@ If `x` and `y` are reals, this will return the floating-point division. TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; let hasCanonicalizer = 1; + + let hasFolder = 1; } def TF_ReciprocalOp : TF_Op<"Reciprocal", [NoSideEffect, SameOperandsAndResultType]> { @@ -5955,6 +6375,29 @@ is the corresponding input gradient. TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_RecvTPUEmbeddingActivationsOp : TF_Op<"RecvTPUEmbeddingActivations", []> { + let summary = "An op that receives embedding activations on the TPU."; + + let description = [{ +The TPU system performs the embedding lookups and aggregations specified by +the arguments to TPUEmbeddingEnqueue(Integer/Sparse/SparseTensor)Batch. The +results of these aggregations are visible to the Tensorflow Graph as the +outputs of a RecvTPUEmbeddingActivations op. This op returns a list containing +one Tensor of activations per table specified in the model. There can be at +most one RecvTPUEmbeddingActivations op in the TPU graph. + }]; + + let arguments = (ins + StrAttr:$config + ); + + let results = (outs + Variadic:$outputs + ); + + TF_DerivedResultSizeAttr num_outputs = TF_DerivedResultSizeAttr<0>; +} + def TF_ReluOp : TF_Op<"Relu", [NoSideEffect, SameOperandsAndResultType, TF_LayoutAgnostic]> { let summary = "Computes rectified linear: `max(features, 0)`."; @@ -5993,6 +6436,24 @@ def TF_Relu6Op : TF_Op<"Relu6", [NoSideEffect, SameOperandsAndResultType]> { TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_Relu6GradOp : TF_Op<"Relu6Grad", [NoSideEffect, SameOperandsAndResultType]> { + let summary = "Computes rectified linear 6 gradients for a Relu6 operation."; + + let description = [{ + }]; + + let arguments = (ins + TF_IntOrFpTensor:$gradients, + TF_IntOrFpTensor:$features + ); + + let results = (outs + TF_IntOrFpTensor:$backprops + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + def TF_ReluGradOp : TF_Op<"ReluGrad", [NoSideEffect, SameOperandsAndResultType]> { let summary = "Computes rectified linear gradients for a Relu operation."; @@ -6090,7 +6551,7 @@ reshape(t, []) ==> 7 let builders = [ OpBuilder< - "Builder* builder, OperationState& result, Value tensor, Value shape"> + "OpBuilder& builder, OperationState& result, Value tensor, Value shape"> ]; let verifier = [{ @@ -6614,6 +7075,106 @@ is the corresponding input gradient. TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_ScatterNdOp : TF_Op<"ScatterNd", [NoSideEffect]> { + let summary = "Scatter `updates` into a new tensor according to `indices`."; + + let description = [{ +Creates a new tensor by applying sparse `updates` to individual values or +slices within a tensor (initially zero for numeric, empty for string) of +the given `shape` according to indices. This operator is the inverse of the +`tf.gather_nd` operator which extracts values or slices from a given tensor. + +This operation is similar to tensor_scatter_add, except that the tensor is +zero-initialized. Calling `tf.scatter_nd(indices, values, shape)` is identical +to `tensor_scatter_add(tf.zeros(shape, values.dtype), indices, values)` + +If `indices` contains duplicates, then their updates are accumulated (summed). 
+ +**WARNING**: The order in which updates are applied is nondeterministic, so the +output will be nondeterministic if `indices` contains duplicates -- because +of some numerical approximation issues, numbers summed in different order +may yield different results. + +`indices` is an integer tensor containing indices into a new tensor of shape +`shape`. The last dimension of `indices` can be at most the rank of `shape`: + + indices.shape[-1] <= shape.rank + +The last dimension of `indices` corresponds to indices into elements +(if `indices.shape[-1] = shape.rank`) or slices +(if `indices.shape[-1] < shape.rank`) along dimension `indices.shape[-1]` of +`shape`. `updates` is a tensor with shape + + indices.shape[:-1] + shape[indices.shape[-1]:] + +The simplest form of scatter is to insert individual elements in a tensor by +index. For example, say we want to insert 4 scattered elements in a rank-1 +tensor with 8 elements. + +
+ +In Python, this scatter operation would look like this: + +```python + indices = tf.constant([[4], [3], [1], [7]]) + updates = tf.constant([9, 10, 11, 12]) + shape = tf.constant([8]) + scatter = tf.scatter_nd(indices, updates, shape) + print(scatter) +``` + +The resulting tensor would look like this: + + [0, 11, 0, 10, 9, 0, 0, 12] + +We can also, insert entire slices of a higher rank tensor all at once. For +example, if we wanted to insert two slices in the first dimension of a +rank-3 tensor with two matrices of new values. + +
+ +In Python, this scatter operation would look like this: + +```python + indices = tf.constant([[0], [2]]) + updates = tf.constant([[[5, 5, 5, 5], [6, 6, 6, 6], + [7, 7, 7, 7], [8, 8, 8, 8]], + [[5, 5, 5, 5], [6, 6, 6, 6], + [7, 7, 7, 7], [8, 8, 8, 8]]]) + shape = tf.constant([4, 4, 4]) + scatter = tf.scatter_nd(indices, updates, shape) + print(scatter) +``` + +The resulting tensor would look like this: + + [[[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]], + [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], + [[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]], + [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]] + +Note that on CPU, if an out of bound index is found, an error is returned. +On GPU, if an out of bound index is found, the index is ignored. + }]; + + let arguments = (ins + TF_I32OrI64Tensor:$indices, + TF_Tensor:$updates, + TF_I32OrI64Tensor:$shape + ); + + let results = (outs + TF_Tensor:$output + ); + + TF_DerivedOperandTypeAttr Tindices = TF_DerivedOperandTypeAttr<0>; + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<1>; +} + def TF_SegmentMaxOp : TF_Op<"SegmentMax", [NoSideEffect]> { let summary = "Computes the maximum along segments of a tensor."; @@ -6875,9 +7436,15 @@ select(condition, t, e) ==> [[1, 2], ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<1>; + + let hasCanonicalizer = 1; + + let verifier = [{ + return Verify(*this); + }]; } -def TF_SelectV2Op : TF_Op<"SelectV2", [NoSideEffect]> { +def TF_SelectV2Op : TF_Op<"SelectV2", [NoSideEffect, ResultsBroadcastableShape]> { let summary = ""; let description = [{ @@ -6896,10 +7463,56 @@ def TF_SelectV2Op : TF_Op<"SelectV2", [NoSideEffect]> { TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<1>; let builders = [ - OpBuilder<"Builder* builder, OperationState& result, Value condition, Value e, Value t"> + OpBuilder<"OpBuilder& builder, OperationState& result, Value condition, Value e, Value t"> ]; } +def TF_SeluOp : TF_Op<"Selu", [NoSideEffect, SameOperandsAndResultType]> { + let summary = [{ +Computes scaled exponential linear: `scale * alpha * (exp(features) - 1)` + }]; + + let description = [{ +if < 0, `scale * features` otherwise. + +To be used together with +`initializer = tf.variance_scaling_initializer(factor=1.0, mode='FAN_IN')`. +For correct dropout, use `tf.contrib.nn.alpha_dropout`. + +See [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515) + }]; + + let arguments = (ins + TF_FpTensor:$features + ); + + let results = (outs + TF_FpTensor:$activations + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + +def TF_SeluGradOp : TF_Op<"SeluGrad", [NoSideEffect, SameOperandsAndResultType]> { + let summary = [{ +Computes gradients for the scaled exponential linear (Selu) operation. + }]; + + let description = [{ + }]; + + let arguments = (ins + TF_FpTensor:$gradients, + TF_FpTensor:$outputs + ); + + let results = (outs + TF_FpTensor:$backprops + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + def TF_ShapeOp : TF_Op<"Shape", [NoSideEffect]> { let summary = "Returns the shape of a tensor."; @@ -6930,7 +7543,7 @@ shape(t) ==> [2, 2, 3] }]; let builders = [ - OpBuilder<"Builder* builder, OperationState& result, Value input, BoolAttr use32Bit"> + OpBuilder<"OpBuilder& builder, OperationState& result, Value input, BoolAttr use32Bit"> ]; let hasFolder = 1; @@ -7538,6 +8151,26 @@ I.e., \\(y = \sqrt{x} = x^{1/2}\\). 
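The `Selu` formula quoted above, `scale * alpha * (exp(features) - 1)` for negative inputs and `scale * features` otherwise, is exposed as `tf.nn.selu`; a small numeric sketch (input values arbitrary):

```python
import tensorflow as tf

x = tf.constant([-1.0, 0.0, 1.0])

# scale ~= 1.0507 and alpha ~= 1.6733, so selu(-1.0) ~= -1.1113.
print(tf.nn.selu(x))  # [-1.1113307  0.         1.050701 ]
```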
TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_SqrtGradOp : TF_Op<"SqrtGrad", [NoSideEffect, SameOperandsAndResultType]> { + let summary = "Computes the gradient for the sqrt of `x` wrt its input."; + + let description = [{ +Specifically, `grad = dy * 0.5 / y`, where `y = sqrt(x)`, and `dy` +is the corresponding input gradient. + }]; + + let arguments = (ins + TF_FpOrComplexTensor:$y, + TF_FpOrComplexTensor:$dy + ); + + let results = (outs + TF_FpOrComplexTensor:$z + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + def TF_SquareOp : TF_Op<"Square", [NoSideEffect, SameOperandsAndResultType]> { let summary = "Computes square of x element-wise."; @@ -7898,28 +8531,6 @@ shape of `StridedSlice`'s `input`. }]; } -def TF_StringFormatOp : TF_Op<"StringFormat", [NoSideEffect]> { - let summary = "Formats a string template using a list of tensors."; - - let description = [{ -Formats a string template using a list of tensors, pretty-printing tensor summaries. - }]; - - let arguments = (ins - Variadic:$inputs, - - DefaultValuedAttr:$strtemplate, - DefaultValuedAttr:$placeholder, - DefaultValuedAttr:$summarize - ); - - let results = (outs - TF_StrTensor:$output - ); - - TF_DerivedOperandTypeListAttr T = TF_DerivedOperandTypeListAttr<0>; -} - def TF_SubOp : TF_Op<"Sub", [NoSideEffect, ResultsBroadcastableShape]>, WithBroadcastableBinOpBuilder { let summary = "Returns x - y element-wise."; @@ -7941,6 +8552,8 @@ def TF_SubOp : TF_Op<"Sub", [NoSideEffect, ResultsBroadcastableShape]>, TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; let hasCanonicalizer = 1; + + let hasFolder = 1; } def TF_SumOp : TF_Op<"Sum", [NoSideEffect]> { @@ -7968,7 +8581,7 @@ retained with length 1. TF_DerivedOperandTypeAttr Tidx = TF_DerivedOperandTypeAttr<1>; let builders = [OpBuilder< - "Builder *builder, OperationState &result, Value input, " + "OpBuilder &builder, OperationState &result, Value input, " "Value reduction_indices, BoolAttr keep_dims" >]; } @@ -8287,7 +8900,7 @@ All elements must have the same shape (excepting the first dimension). TF_ResourceTensor:$handle, F32Tensor:$flow_in, - DefaultValuedAttr:$element_shape_except0 + DefaultValuedAttr:$element_shape_except0 ); let results = (outs @@ -8312,7 +8925,7 @@ All elements selected by `indices` must have the same shape. I32Tensor:$indices, F32Tensor:$flow_in, - DefaultValuedAttr:$element_shape + DefaultValuedAttr:$element_shape ); let results = (outs @@ -8487,7 +9100,7 @@ Write data via Write and read via Read or Pack. I32Tensor:$size, TypeAttr:$dtype, - DefaultValuedAttr:$element_shape, + DefaultValuedAttr:$element_shape, DefaultValuedAttr:$dynamic_size, DefaultValuedAttr:$clear_after_read, DefaultValuedAttr:$identical_element_shapes, @@ -8729,6 +9342,32 @@ size: size of the output list ); } +def TF_TensorListScatterIntoExistingListOp : TF_Op<"TensorListScatterIntoExistingList", [NoSideEffect]> { + let summary = "Scatters tensor at indices in an input list."; + + let description = [{ +Each member of the TensorList corresponds to one row of the input tensor, +specified by the given index (see `tf.gather`). + +input_handle: The list to scatter into. +tensor: The input tensor. +indices: The indices used to index into the list. +output_handle: The TensorList. 
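The `SqrtGrad` op added earlier in this file is documented as `grad = dy * 0.5 / y` with `y = sqrt(x)`; a quick numeric check through the raw op, with arbitrary sample values:

```python
import tensorflow as tf

y = tf.constant([2.0, 3.0])     # y = sqrt(x), i.e. x = [4., 9.]
dy = tf.constant([1.0, 1.0])

# grad = dy * 0.5 / y
print(tf.raw_ops.SqrtGrad(y=y, dy=dy))  # [0.25, 0.16666667]
```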
+ }]; + + let arguments = (ins + TF_VariantTensor:$input_handle, + TF_Tensor:$tensor, + I32Tensor:$indices + ); + + let results = (outs + TF_VariantTensor:$output_handle + ); + + TF_DerivedOperandTypeAttr element_dtype = TF_DerivedOperandTypeAttr<1>; +} + def TF_TensorListSetItemOp : TF_Op<"TensorListSetItem", [NoSideEffect]> { let summary = ""; @@ -8875,7 +9514,7 @@ On GPU, if an out of bound index is found, the index is ignored. let builders = [ OpBuilder< - "Builder* builder, OperationState& result, " + "OpBuilder& builder, OperationState& result, " "Value tensor, Value indices, Value updates", [{build(builder, result, tensor.getType(), tensor, indices, updates);}] > @@ -8960,8 +9599,8 @@ as true/false for a branch condition. TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; let builders = [OpBuilder< - "Builder *builder, OperationState &result, Value value", [{ - build(builder, result, RankedTensorType::get({}, builder->getI1Type()), + "OpBuilder &builder, OperationState &result, Value value", [{ + build(builder, result, RankedTensorType::get({}, builder.getI1Type()), value); }]>]; @@ -9025,7 +9664,7 @@ The output `y` has the same rank as `x`. The shapes of `x` and `y` satisfy: let builders = [ OpBuilder< - "Builder* builder, OperationState& result, Value x, Value perm"> + "OpBuilder& builder, OperationState& result, Value x, Value perm"> ]; let verifier = [{ @@ -9089,6 +9728,30 @@ y + truncate_mod(x, y) = x`. TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_TruncatedNormalOp : TF_Op<"TruncatedNormal", []> { + let summary = "Outputs random values from a truncated normal distribution."; + + let description = [{ +The generated values follow a normal distribution with mean 0 and standard +deviation 1, except that values whose magnitude is more than 2 standard +deviations from the mean are dropped and re-picked. + }]; + + let arguments = (ins + TF_I32OrI64Tensor:$shape, + + DefaultValuedAttr:$seed, + DefaultValuedAttr:$seed2 + ); + + let results = (outs + TF_FpTensor:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + TF_DerivedResultTypeAttr dtype = TF_DerivedResultTypeAttr<0>; +} + def TF_UniqueOp : TF_Op<"Unique", [NoSideEffect]> { let summary = "Finds unique elements in a 1-D tensor."; @@ -9396,6 +10059,30 @@ shape(t) ==> [2, 2, 3] let hasFolder = 1; } +def TF_VariableV2Op : TF_Op<"VariableV2", []> { + let summary = [{ +Holds state in the form of a tensor that persists across steps. + }]; + + let description = [{ +Outputs a ref to the tensor state so it may be read or modified. +TODO(zhifengc/mrry): Adds a pointer to a more detail document +about sharing states in tensorflow. + }]; + + let arguments = (ins + TF_ShapeAttr:$shape, + StrAttr:$container, + StrAttr:$shared_name + ); + + let results = (outs + TF_Tensor:$ref + ); + + TF_DerivedResultTypeAttr dtype = TF_DerivedResultTypeAttr<0>; +} + def TF_WhereOp : TF_Op<"Where", [NoSideEffect]> { let summary = "Returns locations of nonzero / true values in a tensor."; @@ -9493,6 +10180,110 @@ def TF_XdivyOp : TF_Op<"Xdivy", [NoSideEffect, ResultsBroadcastableShape]>, let hasCanonicalizer = 1; } +def TF_XlaBroadcastHelperOp : TF_Op<"XlaBroadcastHelper", [NoSideEffect]> { + let summary = "Helper operator for performing XLA-style broadcasts"; + + let description = [{ +Broadcasts `lhs` and `rhs` to the same rank, by adding size 1 dimensions to +whichever of `lhs` and `rhs` has the lower rank, using XLA's broadcasting rules +for binary operators. 
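The `TruncatedNormal` description above states that draws more than two standard deviations from the mean are dropped and re-picked; a minimal check via `tf.random.truncated_normal` (sample count and seed arbitrary):

```python
import tensorflow as tf

samples = tf.random.truncated_normal([100000], seed=0)

# No draw should fall outside two standard deviations of the mean.
print(tf.reduce_max(tf.abs(samples)) <= 2.0)  # tf.Tensor(True, ...)
```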
+ }]; + + let arguments = (ins + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$lhs, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$rhs, + TF_I32OrI64Tensor:$broadcast_dims + ); + + let results = (outs + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$lhs_output, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$rhs_output + ); + + TF_DerivedOperandTypeAttr Tindices = TF_DerivedOperandTypeAttr<2>; + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + +def TF_XlaConvOp : TF_Op<"XlaConv", [NoSideEffect]> { + let summary = "Wraps the XLA ConvGeneralDilated operator, documented at"; + + let description = [{ +https://www.tensorflow.org/performance/xla/operation_semantics#conv_convolution +. + }]; + + let arguments = (ins + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$lhs, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$rhs, + TF_I32OrI64Tensor:$window_strides, + TF_I32OrI64Tensor:$padding, + TF_I32OrI64Tensor:$lhs_dilation, + TF_I32OrI64Tensor:$rhs_dilation, + TF_I32OrI64Tensor:$feature_group_count, + + StrAttr:$dimension_numbers, + StrAttr:$precision_config + ); + + let results = (outs + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$output + ); + + TF_DerivedOperandTypeAttr Tindices = TF_DerivedOperandTypeAttr<2>; + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + +def TF_XlaDotOp : TF_Op<"XlaDot", [NoSideEffect]> { + let summary = "Wraps the XLA DotGeneral operator, documented at"; + + let description = [{ +https://www.tensorflow.org/performance/xla/operation_semantics#dotgeneral +. + }]; + + let arguments = (ins + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$lhs, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$rhs, + + StrAttr:$dimension_numbers, + StrAttr:$precision_config + ); + + let results = (outs + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + +def TF_XlaDynamicSliceOp : TF_Op<"XlaDynamicSlice", [NoSideEffect]> { + let summary = "Wraps the XLA DynamicSlice operator, documented at"; + + let description = [{ +https://www.tensorflow.org/performance/xla/operation_semantics#dynamicslice +. + +DynamicSlice extracts a sub-array from the input array at dynamic +start_indices. The size of the slice in each dimension is passed in +size_indices, which specify the end point of exclusive slice intervals in each +dimension -- [start, start + size). 
The shape of start_indices must have rank 1, +with dimension size equal to the rank of operand. + }]; + + let arguments = (ins + TF_Tensor:$input, + TF_I32OrI64Tensor:$start_indices, + TF_I32OrI64Tensor:$size_indices + ); + + let results = (outs + TF_Tensor:$output + ); + + TF_DerivedOperandTypeAttr Tindices = TF_DerivedOperandTypeAttr<1>; + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + def TF_XlaDynamicUpdateSliceOp : TF_Op<"XlaDynamicUpdateSlice", [NoSideEffect]> { let summary = "Wraps the XLA DynamicUpdateSlice operator, documented at"; @@ -9522,22 +10313,230 @@ Handling of out-of-bounds slice indices is implementation-defined. TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } -def TF_XlaShardingOp : TF_Op<"XlaSharding", [NoSideEffect]> { +def TF_XlaGatherOp : TF_Op<"XlaGather", [NoSideEffect]> { + let summary = "Wraps the XLA Gather operator documented at"; + + let description = [{ +https://www.tensorflow.org/xla/operation_semantics#gather + }]; + + let arguments = (ins + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$operand, + TF_I32OrI64Tensor:$start_indices, + TF_I32OrI64Tensor:$slice_sizes, + + StrAttr:$dimension_numbers, + BoolAttr:$indices_are_sorted + ); + + let results = (outs + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$output + ); + + TF_DerivedOperandTypeAttr Tindices = TF_DerivedOperandTypeAttr<1>; + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + +def TF_XlaHostComputeOp : TF_Op<"XlaHostCompute", []> { let summary = [{ -An op which shards the input based on the given sharding attribute. +A pseudo-op to represent host-side computation in an XLA program. }]; let description = [{ }]; let arguments = (ins - TF_Tensor:$input + Variadic:$inputs, + + StrArrayAttr:$ancestors, + TF_ShapeAttrArray:$shapes, + SymbolRefAttr:$shape_inference_graph, + StrAttr:$key, + DefaultValuedAttr:$cost_estimate_ns, + DefaultValuedAttr:$tpu_core + ); + + let results = (outs + Variadic:$outputs + ); + + TF_DerivedOperandTypeListAttr Tinputs = TF_DerivedOperandTypeListAttr<0>; + TF_DerivedResultTypeListAttr Toutputs = TF_DerivedResultTypeListAttr<0>; +} + +def TF_XlaKeyValueSortOp : TF_Op<"XlaKeyValueSort", [NoSideEffect]> { + let summary = "Wraps the XLA Sort operator, documented at"; + + let description = [{ +https://www.tensorflow.org/performance/xla/operation_semantics#sort +. + +Sorts a tensor. Currently only sorts in ascending order are supported. + }]; + + let arguments = (ins + TF_IntOrFpTensor:$keys, + TF_Tensor:$values + ); + + let results = (outs + TF_IntOrFpTensor:$sorted_keys, + TF_Tensor:$sorted_values + ); + + TF_DerivedOperandTypeAttr V = TF_DerivedOperandTypeAttr<1>; + TF_DerivedOperandTypeAttr K = TF_DerivedOperandTypeAttr<0>; +} + +def TF_XlaPadOp : TF_Op<"XlaPad", [NoSideEffect]> { + let summary = "Wraps the XLA Pad operator, documented at"; + + let description = [{ +https://www.tensorflow.org/performance/xla/operation_semantics#pad +. 
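The `XlaDynamicSlice` semantics spelled out above (a half-open window `[start, start + size)` per dimension, with runtime start indices) correspond to what `tf.slice` expresses at the TensorFlow level; a rough sketch of the slicing rule only, not of the XLA lowering itself:

```python
import tensorflow as tf

x = tf.reshape(tf.range(12), [3, 4])
start = tf.constant([1, 2])   # may be computed at runtime
size = [2, 2]

# Takes x[1:3, 2:4], i.e. the window [start, start + size) in each dimension.
print(tf.slice(x, start, size))  # [[ 6  7]
                                 #  [10 11]]
```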
+ }]; + + let arguments = (ins + TF_Tensor:$input, + TF_Tensor:$padding_value, + TF_I32OrI64Tensor:$padding_low, + TF_I32OrI64Tensor:$padding_high, + TF_I32OrI64Tensor:$padding_interior ); let results = (outs TF_Tensor:$output ); + TF_DerivedOperandTypeAttr Tindices = TF_DerivedOperandTypeAttr<2>; + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + +def TF_XlaRecvFromHostOp : TF_Op<"XlaRecvFromHost", []> { + let summary = "An op to receive a tensor from the host."; + + let description = [{ + }]; + + let arguments = (ins + TF_ShapeAttr:$shape, + StrAttr:$key + ); + + let results = (outs + TF_Tensor:$output + ); + + TF_DerivedResultTypeAttr Toutput = TF_DerivedResultTypeAttr<0>; +} + +def TF_XlaReduceOp : TF_Op<"XlaReduce", [NoSideEffect]> { + let summary = "Wraps the XLA Reduce operator, documented at"; + + let description = [{ +https://www.tensorflow.org/performance/xla/operation_semantics#reduce . + }]; + + let arguments = (ins + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$input, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$init_value, + + I64ArrayAttr:$dimensions_to_reduce, + SymbolRefAttr:$reducer + ); + + let results = (outs + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + +def TF_XlaReplicaIdOp : TF_Op<"XlaReplicaId", [NoSideEffect]> { + let summary = "Replica ID."; + + let description = [{ + }]; + + let arguments = (ins); + + let results = (outs + I32Tensor:$id + ); +} + +def TF_XlaSelfAdjointEigOp : TF_Op<"XlaSelfAdjointEig", [NoSideEffect]> { + let summary = [{ +Computes the eigen decomposition of a batch of self-adjoint matrices + }]; + + let description = [{ +(Note: Only real inputs are supported). + +Computes the eigenvalues and eigenvectors of the innermost N-by-N matrices in +tensor such that tensor[...,:,:] * v[..., :,i] = e[..., i] * v[...,:,i], for +i=0...N-1. + }]; + + let arguments = (ins + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$a, + + BoolAttr:$lower, + I64Attr:$max_iter, + F32Attr:$epsilon + ); + + let results = (outs + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$w, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$v + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + +def TF_XlaSendToHostOp : TF_Op<"XlaSendToHost", []> { + let summary = "An op to send a tensor to the host."; + + let description = [{ + }]; + + let arguments = (ins + TF_Tensor:$input, + + StrAttr:$key + ); + + let results = (outs); + + TF_DerivedOperandTypeAttr Tinput = TF_DerivedOperandTypeAttr<0>; +} + +def TF_XlaSvdOp : TF_Op<"XlaSvd", [NoSideEffect]> { + let summary = [{ +Computes the eigen decomposition of a batch of self-adjoint matrices + }]; + + let description = [{ +(Note: Only real inputs are supported). 
+ +Computes the eigenvalues and eigenvectors of the innermost M-by-N matrices in +tensor such that tensor[...,:,:] = u[..., :, :] * Diag(s[..., :]) * Transpose(v[...,:,:]). + }]; + + let arguments = (ins + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$a, + + I64Attr:$max_iter, + F32Attr:$epsilon, + StrAttr:$precision_config + ); + + let results = (outs + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$s, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$u, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$v + ); + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } @@ -9595,6 +10594,50 @@ def TF_ZerosLikeOp : TF_Op<"ZerosLike", [NoSideEffect, SameOperandsAndResultType TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF__HostComputeMlirOp : TF_Op<"_HostComputeMlir", []> { + let summary = "A host-side computation called from a TPU device."; + + let description = [{ + }]; + + let arguments = (ins + Variadic:$inputs, + + StrAttr:$key, + DefaultValuedAttr:$tpu_core + ); + + let results = (outs + Variadic:$outputs + ); + + TF_DerivedOperandTypeListAttr Tinputs = TF_DerivedOperandTypeListAttr<0>; + TF_DerivedResultTypeListAttr Toutputs = TF_DerivedResultTypeListAttr<0>; +} + +def TF__RecvTPUEmbeddingActivationsOp : TF_Op<"_RecvTPUEmbeddingActivations", []> { + let summary = "An op that receives embeddng activations on the TPU."; + + let description = [{ +The TPU system performs the embedding lookups and aggregations. The results of +these aggregations are visible to the Tensorflow Graph as the outputs of a +_RecvTPUEmbeddingActivations Op. This op returns a list containing one +Tensor of activations per table specified in the model. + }]; + + let arguments = (ins + TF_VariantTensor:$deduplication_data, + + StrAttr:$config + ); + + let results = (outs + Variadic:$outputs + ); + + TF_DerivedResultSizeAttr num_tables = TF_DerivedResultSizeAttr<0>; +} + def TF__TPUCompileMlirOp : TF_Op<"_TPUCompileMlir", []> { let summary = [{ Compiles a computations for execution on one or more TPU devices. @@ -9630,3 +10673,44 @@ used to look up the program in the compilation cache. TF_DerivedResultSizeAttr num_computations = TF_DerivedResultSizeAttr<1>; TF_DerivedOperandSizeAttr NumDynamicShapes = TF_DerivedOperandSizeAttr<0>; } + +def TF__XlaRecvAtHostOp : TF_Op<"_XlaRecvAtHost", []> { + let summary = [{ +A placeholder op to receive values from a running XLA computation. 
+ }]; + + let description = [{ + }]; + + let arguments = (ins + TF_StrTensor:$dynamic_key, + + StrAttr:$key, + I64Attr:$device_ordinal + ); + + let results = (outs + Variadic:$outputs + ); + + TF_DerivedResultTypeListAttr Toutputs = TF_DerivedResultTypeListAttr<0>; +} + +def TF__XlaSendFromHostOp : TF_Op<"_XlaSendFromHost", []> { + let summary = "A placeholder op to send values to a running XLA computation."; + + let description = [{ + }]; + + let arguments = (ins + Variadic:$inputs, + TF_StrTensor:$dynamic_key, + + StrAttr:$key, + I64Attr:$device_ordinal + ); + + let results = (outs); + + TF_DerivedOperandTypeListAttr Tinputs = TF_DerivedOperandTypeListAttr<0>; +} diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td index 773025c58df..dbd8ab0fae2 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td @@ -23,7 +23,7 @@ limitations under the License. #define TF_OP_BASE include "mlir/IR/OpBase.td" -include "mlir/Interfaces/SideEffects.td" +include "mlir/Interfaces/SideEffectInterfaces.td" include "tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.td" //===----------------------------------------------------------------------===// @@ -63,6 +63,23 @@ def TF_OperandsSameAsResultsTypeOrRef : NativeOpTrait< // format), as an example all element wise operations are layout agnostic. def TF_LayoutAgnostic : NativeOpTrait<"TF::LayoutAgnostic">; +// Variant of broadcastable trait that considers TF's subtype behavior. +class TF_OpIsBroadcastableToRes : And<[ + TCOpResIsShapedTypePred, + CPred<"mlir::TF::BroadcastCompatible(" + "$_op.getOperand(" # opId # ").getType(), " + "$_op.getResult(" # resId # ").getType())">]>; + + +class TF_AllTypesMatchPred values> : + CPred<"TF::AreCastCompatible(llvm::makeArrayRef({"# StrJoin.result #"}))">; + +class TF_AllTypesMatch names> : + PredOpTrait< + "all of {" # StrJoin.result # "} have dynamically equal types ", + TF_AllTypesMatchPred< + !foreach(n, names, !subst("$_self", "$" # n, "$_self.getType()"))>>; + //===----------------------------------------------------------------------===// // TensorFlow op definitions //===----------------------------------------------------------------------===// @@ -70,6 +87,25 @@ def TF_LayoutAgnostic : NativeOpTrait<"TF::LayoutAgnostic">; class TF_Op traits = []> : Op; +//===----------------------------------------------------------------------===// +// TensorFlow attribute definitions +//===----------------------------------------------------------------------===// + +class TF_TensorFlowAttr : + Attr()">, + "TensorFlow " # description # " attribute">; + +def TF_ShapeAttr : TF_TensorFlowAttr<"Shape", "shape"> { + let returnType = "llvm::Optional>"; + let convertFromStorage = "$_self.cast().getValue()"; + + // Create a ranked shape attr by default. 
+ let constBuilderCall = "mlir::TF::ShapeAttr::get($_builder.getContext(), $0)"; +} + +def TF_ShapeAttrArray : + TypedArrayAttrBase; + //===----------------------------------------------------------------------===// // TensorFlow type definitions //===----------------------------------------------------------------------===// @@ -103,9 +139,16 @@ def TF_I32Or64 : SignlessIntOfWidths<[32, 64]>; def TF_I32OrI64Tensor : TensorOf<[TF_I32Or64]>; def TF_Uint8 : UI<8>; +def TF_Uint8Tensor : TensorOf<[TF_Uint8]>; + def TF_Uint16 : UI<16>; +def TF_Uint16Tensor : TensorOf<[TF_Uint16]>; + def TF_Uint32 : UI<32>; +def TF_Uint32Tensor : TensorOf<[TF_Uint32]>; + def TF_Uint64 : UI<64>; +def TF_Uint64Tensor : TensorOf<[TF_Uint64]>; // Any unsigned integer type def TF_UInt : UnsignedIntOfWidths<[8, 16, 32, 64]>; @@ -233,7 +276,8 @@ def TF_ConvnetDataFormatAttr : StringBasedAttr< class TF_DerivedOperandSizeAttr : DerivedAttr< "size_t", "auto range = getODSOperands(" # idx # ");\n" - "return std::distance(range.begin(), range.end());">; + "return std::distance(range.begin(), range.end());", + [{ $_builder.getI64IntegerAttr($_self) }]>; // A derived attribute that returns the element type of `idx`-th ODS-declared // operand. If the `idx`-th operand is a variadic operand, then this attribute @@ -251,7 +295,16 @@ class TF_DerivedOperandTypeListAttr : DerivedAttr< "mlir::OperandElementTypeRange", "auto values = getODSOperands(" # idx # ");\n" "return {mlir::OperandElementTypeIterator(values.begin()), " - "mlir::OperandElementTypeIterator(values.end())};" + "mlir::OperandElementTypeIterator(values.end())};", + [{ + ArrayAttr::get( + [&]() { + llvm::SmallVector ret; + for (auto t : $_self) + ret.push_back(TypeAttr::get(t)); + return ret; + }(), $_ctx) + }] >; // A derived attribute that returns the shapes of the tensors in the actual @@ -262,7 +315,16 @@ class TF_DerivedOperandShapeListAttr : DerivedAttr< "mlir::TF::OperandShapeRange", "auto values = getODSOperands(" # idx # ");\n" "return {mlir::TF::OperandShapeIterator(values.begin()), " - "mlir::TF::OperandShapeIterator(values.end())};" + "mlir::TF::OperandShapeIterator(values.end())};", + [{ + ArrayAttr::get( + [&](){ + llvm::SmallVector ret; + for (auto shape : $_self) + ret.push_back(mlir::TF::ShapeAttr::get($_ctx, shape)); + return ret; + }(), $_ctx) + }] >; // A derived attribute that returns the size of `idx`-th ODS-declared variadic @@ -270,7 +332,8 @@ class TF_DerivedOperandShapeListAttr : DerivedAttr< class TF_DerivedResultSizeAttr : DerivedAttr< "size_t", "auto range = getODSResults(" # idx # ");\n" - "return std::distance(range.begin(), range.end());">; + "return std::distance(range.begin(), range.end());", + [{ $_builder.getI64IntegerAttr($_self) }]>; // A derived attribute that returns the element type of `idx`-th ODS-declared // result. 
If the `idx`-th result is a variadic result, then this attribute @@ -288,7 +351,16 @@ class TF_DerivedResultTypeListAttr : DerivedAttr< "mlir::ResultElementTypeRange", "auto values = getODSResults(" # idx # ");\n" "return {mlir::ResultElementTypeIterator(values.begin()), " - "mlir::ResultElementTypeIterator(values.end())};" + "mlir::ResultElementTypeIterator(values.end())};", + [{ + ArrayAttr::get( + [&]() { + llvm::SmallVector ret; + for (auto t : $_self) + ret.push_back(TypeAttr::get(t)); + return ret; + }(), $_ctx) + }] >; // A derived attribute that returns the shapes of the tensors in the actual @@ -299,12 +371,22 @@ class TF_DerivedResultShapeListAttr : DerivedAttr< "mlir::TF::ResultShapeRange", "auto values = getODSResults(" # idx # ");\n" "return {mlir::TF::ResultShapeIterator(values.begin()), " - "mlir::TF::ResultShapeIterator(values.end())};" + "mlir::TF::ResultShapeIterator(values.end())};", + [{ + ArrayAttr::get( + [&](){ + llvm::SmallVector ret; + for (auto shape : $_self) + ret.push_back(mlir::TF::ShapeAttr::get($_ctx, shape)); + return ret; + }(), $_ctx) + }] >; // A derived attribute that returns the shape of the first result type. def TF_DerivedResultShapeAttr : DerivedAttr<"ShapedType", - "return (*getOperation()->result_type_begin()).cast();">; + "return (*getOperation()->result_type_begin()).cast();", + [{ TypeAttr::get($_self) }]>; // A derived attribute that returns the element type of the tensor held by a // named resource-type operand or result. @@ -315,7 +397,6 @@ class TF_DerivedOperandOrResultHandleTypeAttr : DerivedTypeAttr< "assert(!resource_type.getSubtypes().empty() && \"unknown type\");\n" "return mlir::getElementTypeOrSelf(*resource_type.getSubtypes().begin());">; - // A derived attribute that returns the shape of the tensor held by a named // resource-type operand or result. class TF_DerivedOperandOrResultHandleShapeAttr : DerivedAttr< @@ -324,7 +405,8 @@ class TF_DerivedOperandOrResultHandleShapeAttr : DerivedAttr< " mlir::getElementTypeOrSelf(this->" # name # "())\n" " .cast();\n" "assert(!resource_type.getSubtypes().empty() && \"unknown shape\");\n" - "return resource_type.getSubtypes().begin()->cast();">; + "return resource_type.getSubtypes().begin()->cast();", + [{ TypeAttr::get($_self) }]>; def TF_IntTypeAttr : TypeAttrBase<"IntegerType", "integer type"> { let returnType = "Type"; @@ -338,7 +420,7 @@ def TF_IntTypeAttr : TypeAttrBase<"IntegerType", "integer type"> { // behavior. The result type has the same element type as both operands. class WithBroadcastableBinOpBuilder { list builders = [OpBuilder< -"Builder *builder, OperationState &result, Value x, Value y", +"OpBuilder &builder, OperationState &result, Value x, Value y", [{ auto resultType = OpTrait::util::getBroadcastedType(x.getType(), y.getType()); @@ -353,12 +435,12 @@ class WithBroadcastableBinOpBuilder { // behavior. The result type has bool element type. 
class WithBroadcastableCmpOpBuilder { list builders = [OpBuilder< -"Builder *builder, OperationState &result, Value x, Value y", +"OpBuilder &builder, OperationState &result, Value x, Value y", [{ Type resultType; if (x.getType().isa() || y.getType().isa()) { - resultType = UnrankedTensorType::get(builder->getI1Type()); + resultType = UnrankedTensorType::get(builder.getI1Type()); } else { SmallVector resultShape; if (!OpTrait::util::getBroadcastedShape( @@ -368,7 +450,7 @@ class WithBroadcastableCmpOpBuilder { "operands have no broadcastable shapes"); } - resultType = RankedTensorType::get(resultShape, builder->getI1Type()); + resultType = RankedTensorType::get(resultShape, builder.getI1Type()); } return build(builder, result, resultType, x, y); }] diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc index 1b13558b692..6f02b8b92d8 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc @@ -24,6 +24,7 @@ limitations under the License. #include #include +#include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/Optional.h" @@ -34,6 +35,7 @@ limitations under the License. #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/iterator_range.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/FormatVariadic.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project #include "mlir/Dialect/Traits.h" // from @llvm-project @@ -55,8 +57,8 @@ limitations under the License. #include "mlir/Parser.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project -#include "mlir/Support/STLExtras.h" // from @llvm-project #include "mlir/Transforms/InliningUtils.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_structs.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" #include "tensorflow/core/platform/logging.h" @@ -83,8 +85,7 @@ static RankedTensorType GetRankedTensorTypeForOperand(Value operand) { // Returns true if the given `value` is of ranked float tensor type with the // given `rank`. -static inline bool isOfRankedFloatTensorType(Value value, int rank) { - RankedTensorType type = GetRankedTensorTypeForOperand(value); +static inline bool IsOfRankedFloatTensorType(RankedTensorType type, int rank) { return type && type.getRank() == rank && type.getElementType().isa(); } @@ -110,48 +111,6 @@ static inline bool HasRankAtMost(Value value, int64_t rank) { return !type || type.getRank() <= rank; } -// Returns true if the given pair of TensorFlow types can be cast to one -// another. In other words, a single run-time value is legal for both the types. -// For example, tensor<*xf32> and tensor<3xf32> are cast compatible. -static bool AreCastCompatible(Type a, Type b) { - if (TensorCastOp::areCastCompatible(a, b)) return true; - - // Resource types may optionally contain subtypes information that does not - // match. Check subtypes compatibility when possible, otherwise treat them as - // compatible. 
- auto a_or_element_type = getElementTypeOrSelf(a); - auto b_or_element_type = getElementTypeOrSelf(b); - - auto a_kind = a_or_element_type.getKind(); - auto b_kind = b_or_element_type.getKind(); - - if (a_kind == TensorFlowTypes::RESOURCE && - b_kind == TensorFlowTypes::RESOURCE) { - auto a_resource_type = a_or_element_type.dyn_cast(); - auto b_resource_type = b_or_element_type.dyn_cast(); - bool a_has_subtype = !a_resource_type.getSubtypes().empty(); - bool b_has_subtype = !b_resource_type.getSubtypes().empty(); - - if (!a_has_subtype || !b_has_subtype) return true; - - assert(a_resource_type.getSubtypes().size() <= 1 && - "Resource type must have at most one subtype"); - assert(b_resource_type.getSubtypes().size() <= 1 && - "Resource type must have at most one subtype"); - - return TensorCastOp::areCastCompatible( - a_resource_type.getSubtypes().front(), - b_resource_type.getSubtypes().front()); - } - - // Variant types may optionally contain subtypes information that need not - // match. It is also not possible to compare subtypes for compatibility as - // their interpretation depends on the ops operating on them. So, accept all - // pairs of variant types. - return a_kind == TensorFlowTypes::VARIANT && - b_kind == TensorFlowTypes::VARIANT; -} - static bool IsUnknownDimOrRank(int64_t dim_or_rank) { return dim_or_rank == -1; } @@ -293,6 +252,39 @@ static LogicalResult VerifyTypesCompatibility( return success(); } +// This is a helper for the Select to SelectV2 canonicalization. The `data` rank +// refers to the rank of `t`/`e` (these two inputs have equal rank; this is +// checked in the verifier). +// +// In most cases, the predicate for Select can be used directly as the predicate +// for SelectV2. However, there is one case that varies, which is when the +// predicate is a tensor and the data is multidimensional. In this case, Select +// op semantics dictate that the predicate tensor length must match the size of +// the first data dimension. This varies from normal broadcasting semantics +// (which are used in SelectV2), so we must reshape the tensor in this case to +// be compatible. +static Value ReshapeSelectPredIfNecessary(OpBuilder *builder, Location loc, + Value cond, int data_rank) { + auto cond_tensor = cond.getType().cast(); + // Reshape is only needed in the case that the cond rank is 1 (i.e. it is + // a vector) AND t/e rank is > 1. + if (cond_tensor.getRank() != 1 || data_rank <= 1) { + // No reshape necessary. Leave cond as it is. + return cond; + } + + // This is the case where a reshape is needed. We want to construct the + // shape [x,1,...1], where x is the value in the pred tensor and the + // length of the shape is equal to data_rank. + SmallVector shape(data_rank, 1); + shape[0] = cond_tensor.getShape().front(); + auto new_shape_type = + RankedTensorType::get({data_rank}, builder->getIntegerType(64)); + auto shape_attr = DenseIntElementsAttr::get(new_shape_type, shape); + auto new_shape = builder->create(loc, shape_attr); + return builder->create(loc, cond, new_shape); +} + //===----------------------------------------------------------------------===// // Helper functions detect device capabilities from RuntimeDevices. //===----------------------------------------------------------------------===// @@ -496,6 +488,65 @@ LogicalResult FoldOperandsPermutation( return success(); } +//===----------------------------------------------------------------------===// +// Rewrite Pattern for removing trivial Arithmetic op. 
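The ReshapeSelectPredIfNecessary helper above only fires when the predicate is a vector and the data is multidimensional; the new shape it builds is [len(pred), 1, ..., 1] with data_rank entries so that ordinary broadcasting reproduces Select's "predicate indexes the first dimension" semantics. A minimal standalone sketch of that shape computation, in plain C++ with a hypothetical helper name (not code from this patch):

#include <cstdint>
#include <vector>

// Builds the shape that Select's vector predicate is reshaped to so that it
// broadcasts like a SelectV2 predicate: [pred_len, 1, ..., 1] of length
// data_rank. Returns an empty vector when no reshape is needed.
std::vector<int64_t> SelectPredReshapeShape(int64_t pred_len, int data_rank) {
  if (data_rank <= 1) return {};           // 1-D data: predicate is used as-is.
  std::vector<int64_t> shape(data_rank, 1);
  shape[0] = pred_len;                     // e.g. pred of length 3, data rank 4
  return shape;                            //      -> [3, 1, 1, 1]
}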
+//===----------------------------------------------------------------------===// + +namespace { +// Folder that returns LHS of an Arithmetic Op if the RHS is a constant +// known to be Identity (e.g X+0) +template < + typename OpT, + typename std::enable_if::value>::type * = nullptr> +OpFoldResult IdentityArithmeticOpFolder(OpT arithmetic_op, + ArrayRef operands) { + auto result_op_type = arithmetic_op.getResult().getType(); + auto lhs_type = arithmetic_op.x().getType().template cast(); + if (!result_op_type.template cast().hasStaticShape()) return {}; + + // We only handle non-broadcastable case. + if (result_op_type != lhs_type) { + return {}; + } + + // Mul and Div ops have identity value one while AddV2 and SubOp have identity + // value zero. + int identity = + (std::is_same::value || std::is_same::value || + std::is_same::value); + + Type element_ty = lhs_type.getElementType(); + Attribute identity_attr; + if (auto ty = element_ty.template dyn_cast()) { + identity_attr = FloatAttr::get(ty, static_cast(identity)); + } else if (auto ty = element_ty.template dyn_cast()) { + identity_attr = IntegerAttr::get(ty, static_cast(identity)); + } else { + return {}; + } + + if (auto attr = operands[1].dyn_cast_or_null()) { + if (attr.isSplat() && attr.getSplatValue() == identity_attr) + return arithmetic_op.x(); + } + + auto rhs_type = arithmetic_op.y().getType().template cast(); + // TODO(chhe): we could fold and add an identity to force the broadcast. + if (result_op_type != rhs_type) { + return {}; + } + + bool is_symmetric = + (std::is_same::value || std::is_same::value); + if (auto attr = operands[0].dyn_cast_or_null()) { + if (is_symmetric && attr.isSplat() && attr.getSplatValue() == identity_attr) + return arithmetic_op.y(); + } + return {}; +} +} // namespace + namespace { #include "tensorflow/compiler/mlir/tensorflow/transforms/generated_canonicalize.inc" } // namespace @@ -527,6 +578,10 @@ void AddV2Op::getCanonicalizationPatterns(OwningRewritePatternList &results, results.insert(context); } +OpFoldResult AddV2Op::fold(ArrayRef operands) { + return IdentityArithmeticOpFolder(*this, operands); +} + //===----------------------------------------------------------------------===// // AllOp //===----------------------------------------------------------------------===// @@ -893,10 +948,13 @@ OpFoldResult ConstOp::fold(ArrayRef operands) { // Builds a constant op with the specified attribute `value`. The result // op's type is deduced from `value`; if `value` is of scalar type, // wraps it up with a tensor type of empty shape. -void ConstOp::build(Builder *builder, OperationState &result, Attribute value) { +// TODO(jpienaar): This one differs from the autogenerated one as it takes an +// attribute but always creates an ElementsAttr internally. +void ConstOp::build(OpBuilder &builder, OperationState &result, + Attribute value) { ShapedType type; - if (auto elemAttr = value.dyn_cast()) { - type = elemAttr.getType(); + if (auto elem_attr = value.dyn_cast()) { + return ConstOp::build(builder, result, elem_attr); } else if (value.isa() || value.isa() || value.isa()) { // All TensorFlow types must be tensor types. In the build() method, @@ -904,15 +962,13 @@ void ConstOp::build(Builder *builder, OperationState &result, Attribute value) { // types. But we need to wrap it up with ElementsAttr to construct // valid TensorFlow constants. 
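The identity folder above keys off the identity element of each binary op (1 for Mul/Div/RealDiv, 0 for AddV2/Sub) and only strips a constant identity from the right-hand side for the non-commutative ops. A small standalone sketch of that decision logic, using a hypothetical enum rather than the MLIR op classes (not part of this patch):

enum class BinOpKind { kAddV2, kSub, kMul, kDiv, kRealDiv };

// Identity element of the op: x op identity == x.
double IdentityElement(BinOpKind kind) {
  switch (kind) {
    case BinOpKind::kMul:
    case BinOpKind::kDiv:
    case BinOpKind::kRealDiv:
      return 1.0;
    case BinOpKind::kAddV2:
    case BinOpKind::kSub:
      return 0.0;
  }
  return 0.0;
}

// Whether identity op x == x also holds, i.e. whether a constant identity on
// the left-hand side may be stripped as well (only the commutative ops).
bool IsSymmetric(BinOpKind kind) {
  return kind == BinOpKind::kAddV2 || kind == BinOpKind::kMul;
}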
type = RankedTensorType::get(/*shape=*/{}, value.getType()); - value = DenseElementsAttr::get(type, value); + return ConstOp::build(builder, result, DenseElementsAttr::get(type, value)); } - // TODO: support other TensorFlow specific types. - assert(type && "unsupported attribute type for building tf.Const"); - result.types.push_back(type); - result.addAttribute("value", value); + // TODO(jpienaar): support other TensorFlow specific types. + llvm_unreachable("unsupported attribute type for building tf.Const"); } -void ConstOp::build(Builder *builder, OperationState &result, Type type, +void ConstOp::build(OpBuilder &builder, OperationState &result, Type type, Attribute value) { // Handle the case where the type and value are already tensors. if (type.isa() && value.isa()) { @@ -926,6 +982,21 @@ void ConstOp::build(Builder *builder, OperationState &result, Type type, assert(type == result.types[0] && "type mismatch in construction"); } +LogicalResult ConstOp::inferReturnTypes( + MLIRContext *context, Optional location, ValueRange operands, + DictionaryAttr attributes, RegionRange regions, + SmallVectorImpl &inferredReturnTypes) { + auto value = attributes.get("value"); + if (!value) return emitOptionalError(location, "missing attribute 'value'"); + if (auto elem_attr = value.dyn_cast()) { + inferredReturnTypes.assign({elem_attr.getType()}); + return success(); + } + return emitOptionalError(location, + "attribute 'value' failed to satisfy constraint: " + "constant vector/tensor"); +} + //===----------------------------------------------------------------------===// // Conv2DOp and Conv3DOp //===----------------------------------------------------------------------===// @@ -1254,6 +1325,10 @@ void DivOp::getCanonicalizationPatterns(OwningRewritePatternList &results, results.insert(context); } +OpFoldResult DivOp::fold(ArrayRef operands) { + return IdentityArithmeticOpFolder(*this, operands); +} + //===----------------------------------------------------------------------===// // DynamicStitchOp //===----------------------------------------------------------------------===// @@ -1338,7 +1413,7 @@ static LogicalResult Verify(DynamicStitchOp op) { auto expected_out_ty = RankedTensorType::get(expected_shape, out_ty.getElementType()); - if (!AreCastCompatible(out_ty, expected_out_ty)) { + if (!AreCastCompatible({out_ty, expected_out_ty})) { return op.emitOpError() << "has invalid output type; should be " "compatible with inferred type " << expected_out_ty; @@ -1364,6 +1439,43 @@ static LogicalResult Verify(EinsumOp op) { return success(); } +//===----------------------------------------------------------------------===// +// EmptyOp +//===----------------------------------------------------------------------===// + +OpFoldResult EmptyOp::fold(ArrayRef operands) { + assert(operands.size() == 1 && "empty op has one operand"); + + Attribute attr = operands.front(); + if (!attr) return {}; + + auto int_attr = attr.cast(); + SmallVector out_shape; + for (const auto val : int_attr.getValues()) { + out_shape.push_back(val); + } + + auto type = getResult().getType().cast(); + auto etype = type.getElementType(); + + // We can not fold if the result is not static. 
+ if (!type.hasStaticShape()) return {}; + + if (auto float_type = etype.dyn_cast()) { + auto out_type = RankedTensorType::get(out_shape, float_type); + return DenseElementsAttr::get(out_type, + {APFloat(float_type.getFloatSemantics())}); + } + + if (auto int_type = etype.dyn_cast()) { + auto out_type = RankedTensorType::get(out_shape, etype); + APInt val(int_type.getWidth(), 0, int_type.getSignedness()); + return DenseElementsAttr::get(out_type, val); + } + + return {}; +} + //===----------------------------------------------------------------------===// // EmptyTensorListOp //===----------------------------------------------------------------------===// @@ -1393,9 +1505,9 @@ static LogicalResult Verify(EqualOp op) { op.getOperation()); } -void EqualOp::build(Builder *builder, OperationState &result, Value x, Value y, - BoolAttr incompatible_shape_error) { - auto result_type = DeduceEqualCmpOpType(builder, result.location, x, y, +void EqualOp::build(OpBuilder &builder, OperationState &result, Value x, + Value y, BoolAttr incompatible_shape_error) { + auto result_type = DeduceEqualCmpOpType(&builder, result.location, x, y, incompatible_shape_error); return build(builder, result, result_type, x, y, incompatible_shape_error); } @@ -1426,8 +1538,8 @@ Type InferExpandDimsOpType(Value input, Value dim) { return RankedTensorType::get(shape, element_ty); } -void ExpandDimsOp::build(Builder *builder, OperationState &result, Value input, - Value dim) { +void ExpandDimsOp::build(OpBuilder &builder, OperationState &result, + Value input, Value dim) { return build(builder, result, InferExpandDimsOpType(input, dim), input, dim); } @@ -1462,10 +1574,12 @@ static LogicalResult Verify(FakeQuantWithMinMaxArgsOp op) { // FakeQuantWithMinMaxVarsOp //===----------------------------------------------------------------------===// static LogicalResult Verify(FakeQuantWithMinMaxVarsOp op) { - if (!isOfRankedFloatTensorType(op.min(), 0)) + auto min = GetRankedTensorTypeForOperand(op.min()); + if (min && !IsOfRankedFloatTensorType(min, 0)) return op.emitOpError("requires min to be a 0d float tensor"); - if (!isOfRankedFloatTensorType(op.max(), 0)) + auto max = GetRankedTensorTypeForOperand(op.max()); + if (max && !IsOfRankedFloatTensorType(max, 0)) return op.emitOpError("requires max to be a 0d float tensor"); int64_t num_bits = op.num_bits().getSExtValue(); @@ -1480,30 +1594,33 @@ static LogicalResult Verify(FakeQuantWithMinMaxVarsOp op) { // FakeQuantWithMinMaxVarsPerChannelOp //===----------------------------------------------------------------------===// static LogicalResult Verify(FakeQuantWithMinMaxVarsPerChannelOp op) { - if (!isOfRankedFloatTensorType(op.min(), 1)) + auto min = GetRankedTensorTypeForOperand(op.min()); + if (min && !IsOfRankedFloatTensorType(min, 1)) return op.emitOpError("requires min to be a 1d float tensor"); - if (!isOfRankedFloatTensorType(op.max(), 1)) + auto max = GetRankedTensorTypeForOperand(op.max()); + if (max && !IsOfRankedFloatTensorType(max, 1)) return op.emitOpError("requires max to be a 1d float tensor"); Value inputs = op.inputs(); - if (!HasRankAtLeast(inputs, 1) || - inputs.getType().isa()) { + if (!HasRankAtLeast(inputs, 1)) return op.emitError("requires inputs to be at least 1d float tensor"); - } - auto inputsType = inputs.getType().cast(); - int depth = inputsType.getDimSize(inputsType.getRank() - 1); - if (op.min().getType().cast().getDimSize(0) != depth || - op.max().getType().cast().getDimSize(0) != depth) { - return op.emitOpError( - "requires min and max to have 
same size as last dimension of inputs"); - } int64_t num_bits = op.num_bits().getSExtValue(); if (num_bits < 2 || num_bits > 16) { return op.emitOpError( "requires num_bits to be between 2 and 16, inclusive"); } + + auto inputs_type = inputs.getType().dyn_cast(); + if (!inputs_type) return success(); + int depth = inputs_type.getDimSize(inputs_type.getRank() - 1); + if ((min && min.getDimSize(0) != depth) || + (max && max.getDimSize(0) != depth)) { + return op.emitOpError( + "requires min and max to have same size as last dimension of inputs"); + } + return success(); } @@ -1520,6 +1637,50 @@ static LogicalResult Verify(FillOp op) { return success(); } +static ShapedType InferFillOpType(Value dims, Value value) { + Type etype = value.getType().cast().getElementType(); + + DenseIntElementsAttr dims_attr; + if (!matchPattern(dims, m_Constant(&dims_attr))) { + return UnrankedTensorType::get(etype); + } + + llvm::SmallVector shape; + shape.reserve(dims_attr.getNumElements()); + for (const APInt dim : dims_attr.getValues()) { + shape.push_back(dim.getSExtValue()); + } + return RankedTensorType::get(shape, etype); +} + +void FillOp::build(OpBuilder &builder, OperationState &result, Value dims, + Value value) { + FillOp::build(builder, result, InferFillOpType(dims, value), dims, value); +} + +OpFoldResult FillOp::fold(ArrayRef operands) { + assert(operands.size() == 2 && "fill op has two operand"); + + auto value = operands[1].dyn_cast_or_null(); + if (!value) return {}; + + auto type = getType().cast(); + if (type.hasStaticShape()) + return DenseElementsAttr::get(type, value.getValue({})); + + auto dims = operands[0].dyn_cast_or_null(); + if (!dims) return {}; + + llvm::SmallVector shape; + shape.reserve(dims.getNumElements()); + for (const APInt dim : dims.getValues()) { + shape.push_back(dim.getSExtValue()); + } + type = RankedTensorType::get(shape, type.getElementType()); + + return DenseElementsAttr::get(type, value.getValue({})); +} + //===----------------------------------------------------------------------===// // FusedBatchNormGradOp //===----------------------------------------------------------------------===// @@ -1553,19 +1714,24 @@ StringRef FusedBatchNormGradV3Op::GetOptimalLayout( //===----------------------------------------------------------------------===// static LogicalResult Verify(FusedBatchNormOp op) { - if (!isOfRankedFloatTensorType(op.x(), 4)) + auto x = GetRankedTensorTypeForOperand(op.x()); + if (x && !IsOfRankedFloatTensorType(x, 4)) return op.emitOpError("requires x to be a 4D float tensor"); - if (!isOfRankedFloatTensorType(op.scale(), 1)) + auto scale = GetRankedTensorTypeForOperand(op.scale()); + if (scale && !IsOfRankedFloatTensorType(scale, 1)) return op.emitOpError("requires scale to be a 1D float tensor"); - if (!isOfRankedFloatTensorType(op.offset(), 1)) + auto offset = GetRankedTensorTypeForOperand(op.offset()); + if (offset && !IsOfRankedFloatTensorType(offset, 1)) return op.emitOpError("requires offset to be a 1D float tensor"); - if (!isOfRankedFloatTensorType(op.mean(), 1)) + auto mean = GetRankedTensorTypeForOperand(op.mean()); + if (mean && !IsOfRankedFloatTensorType(mean, 1)) return op.emitOpError("requires mean to be a 1D float tensor"); - if (!isOfRankedFloatTensorType(op.variance(), 1)) + auto variance = GetRankedTensorTypeForOperand(op.variance()); + if (variance && !IsOfRankedFloatTensorType(variance, 1)) return op.emitOpError("requires variance to be a 1D float tensor"); // TODO(antiagainst): check attributes @@ -1671,14 +1837,14 @@ static 
LogicalResult Verify(IfOp op) { for (unsigned i = 0; i < expectedNumInputs; ++i) { auto operandType = op.getOperand(i + 1).getType().cast(); auto thenInputType = thenFuncType.getInput(i).cast(); - if (!AreCastCompatible(operandType, thenInputType)) + if (!AreCastCompatible({operandType, thenInputType})) return op.emitError( llvm::formatv("then branch input type {0} is incompatible with " "operand type {1} at index {2}", thenInputType, operandType, i)); auto elseInputType = elseFuncType.getInput(i).cast(); - if (!AreCastCompatible(operandType, elseInputType)) + if (!AreCastCompatible({operandType, elseInputType})) return op.emitError( llvm::formatv("else branch input type {0} is incompatible with " "operand type {1} at index {2}", @@ -1686,7 +1852,7 @@ static LogicalResult Verify(IfOp op) { // If branches have incompatible input types that means that no tensor can // serve as input to both the functions. Hence, the op is invalid. - if (!AreCastCompatible(thenInputType, elseInputType)) + if (!AreCastCompatible({thenInputType, elseInputType})) return op.emitError(llvm::formatv( "branches inputs have incompatible types {0} and {1} at index {2}", thenInputType, elseInputType, i)); @@ -1702,14 +1868,14 @@ static LogicalResult Verify(IfOp op) { for (unsigned i = 0; i < expectedNumResults; ++i) { auto resultType = op.getResult(i).getType().cast(); auto thenResultType = thenFuncType.getResult(i).cast(); - if (!AreCastCompatible(thenResultType, resultType)) + if (!AreCastCompatible({thenResultType, resultType})) return op.emitError( llvm::formatv("then branch result type {0} is incompatible with op " "result type {1} at index {2}", thenResultType, resultType, i)); auto elseResultType = elseFuncType.getResult(i).cast(); - if (!AreCastCompatible(elseResultType, resultType)) + if (!AreCastCompatible({elseResultType, resultType})) return op.emitError( llvm::formatv("else branch result type {0} is incompatible with op " "result type {1} at index {2}", @@ -1822,10 +1988,10 @@ static LogicalResult Verify(MatrixBandPartOp op) { // MaxOp //===----------------------------------------------------------------------===// -void MaxOp::build(Builder *builder, OperationState &result, Value input, +void MaxOp::build(OpBuilder &builder, OperationState &result, Value input, Value reduction_indices, BoolAttr keep_dims) { Type out_ty = - InferReductionOpType(input, reduction_indices, keep_dims, builder); + InferReductionOpType(input, reduction_indices, keep_dims, &builder); build(builder, result, out_ty, input, reduction_indices, keep_dims); } @@ -1888,6 +2054,14 @@ LogicalResult MeanOp::FoldOperandsPermutation(ArrayRef permutation) { return success(); } +//===----------------------------------------------------------------------===// +// MulOp +//===----------------------------------------------------------------------===// + +OpFoldResult MulOp::fold(ArrayRef operands) { + return IdentityArithmeticOpFolder(*this, operands); +} + //===----------------------------------------------------------------------===// // NegOp //===----------------------------------------------------------------------===// @@ -1910,9 +2084,9 @@ static LogicalResult Verify(NotEqualOp op) { op.getOperation()); } -void NotEqualOp::build(Builder *builder, OperationState &result, Value x, +void NotEqualOp::build(OpBuilder &builder, OperationState &result, Value x, Value y, BoolAttr incompatible_shape_error) { - auto result_type = DeduceEqualCmpOpType(builder, result.location, x, y, + auto result_type = DeduceEqualCmpOpType(&builder, result.location, 
x, y, incompatible_shape_error); return build(builder, result, result_type, x, y, incompatible_shape_error); } @@ -1982,7 +2156,7 @@ static TensorType InferOneHotOpType(Value indices, Value depth, Value on_value, return RankedTensorType::get(shape, element_ty); } -void OneHotOp::build(Builder *builder, OperationState &result, Value indices, +void OneHotOp::build(OpBuilder &builder, OperationState &result, Value indices, Value depth, Value on_value, Value off_value, IntegerAttr axis) { build(builder, result, @@ -2174,6 +2348,28 @@ OpFoldResult PowOp::fold(ArrayRef operands) { return {}; } +//===----------------------------------------------------------------------===// +// QrOp +//===----------------------------------------------------------------------===// + +// Verifies that, +// +// * Input type, if ranked, must have at least 2 dimensions and at most +// INT32_MAX dimensions. +// +static LogicalResult Verify(QrOp op) { + auto ttype = op.input().getType().cast(); + if (!ttype.hasRank()) return success(); + if (!HasRankAtLeast(op.input(), 2)) + return op.emitOpError( + "requires ranked input tensor to be of rank 2 or more"); + if (!HasRankAtMost(op.input(), std::numeric_limits::max())) + return op.emitOpError( + "requires ranked input tensor to be of rank INT32_MAX or less"); + + return success(); +} + //===----------------------------------------------------------------------===// // ReciprocalOp //===----------------------------------------------------------------------===// @@ -2197,7 +2393,7 @@ static LogicalResult Verify(RandomUniformOp op) { // RangeOp //===----------------------------------------------------------------------===// -void RangeOp::build(Builder *builder, OperationState &result, Value start, +void RangeOp::build(OpBuilder &builder, OperationState &result, Value start, Value limit, Value delta) { assert(start.getType() == limit.getType()); assert(start.getType() == delta.getType()); @@ -2227,12 +2423,23 @@ void RangeOp::build(Builder *builder, OperationState &result, Value start, // RankOp //===----------------------------------------------------------------------===// -void RankOp::build(Builder *builder, OperationState &result, Value input) { +void RankOp::build(OpBuilder &builder, OperationState &result, Value input) { return RankOp::build(builder, result, - RankedTensorType::get({}, builder->getIntegerType(32)), + RankedTensorType::get({}, builder.getIntegerType(32)), input); } +// This will create a constant value for RankOp of a ranked tensor. 
+OpFoldResult RankOp::fold(ArrayRef operands) { + auto type = input().getType(); + auto ranked_type = type.dyn_cast(); + if (!ranked_type) return {}; + + auto output_type = getType().cast(); + int32_t rank = ranked_type.getRank(); + return DenseIntElementsAttr::get(output_type, rank); +} + //===----------------------------------------------------------------------===// // RealDivOp //===----------------------------------------------------------------------===// @@ -2242,6 +2449,10 @@ void RealDivOp::getCanonicalizationPatterns(OwningRewritePatternList &results, results.insert(context); } +OpFoldResult RealDivOp::fold(ArrayRef operands) { + return IdentityArithmeticOpFolder(*this, operands); +} + //===----------------------------------------------------------------------===// // ReshapeOp //===----------------------------------------------------------------------===// @@ -2258,7 +2469,7 @@ static LogicalResult Verify(ReshapeOp op) { if (rank_by_shape == -1 || !type_of_tensor.hasStaticShape()) return success(); int64_t num_by_tensor = type_of_tensor.getNumElements(); - auto out_ty = op.getType().cast(); + auto out_ty = op.getType().dyn_cast(); if (out_ty && out_ty.hasStaticShape()) { int64_t num_output_elements = out_ty.getNumElements(); if (num_by_tensor != num_output_elements) @@ -2315,12 +2526,12 @@ static LogicalResult Verify(ReshapeOp op) { return success(); } -void ReshapeOp::build(Builder *builder, OperationState &result, Value tensor, +void ReshapeOp::build(OpBuilder &builder, OperationState &result, Value tensor, Value shape) { auto ttype = tensor.getType().cast(); auto etype = ttype.getElementType(); - auto unranked = [builder, etype, &result, shape, tensor]() { + auto unranked = [&builder, etype, &result, shape, tensor]() { return ReshapeOp::build(builder, result, UnrankedTensorType::get(etype), tensor, shape); }; @@ -2373,6 +2584,81 @@ void ReshapeOp::build(Builder *builder, OperationState &result, Value tensor, return unranked(); } +//===----------------------------------------------------------------------===// +// SelectOp +//===----------------------------------------------------------------------===// + +void SelectOp::getCanonicalizationPatterns(OwningRewritePatternList &results, + MLIRContext *context) { + results.insert(context); +} + +// Verifies a few extra requirements on SelectOp: +// (1) `then` and `else` must have same shape +// (2) At least one of the following must be true: +// (a) `cond` has the same rank as `then` and `else` +// (b) `cond` is a scalar +// (c) `cond` is a vector AND `then` and `else` are non-scalar with their +// first dimension equal to `cond`. +static LogicalResult Verify(SelectOp op) { + auto then_tensor = op.t().getType().cast(); + auto else_tensor = op.e().getType().cast(); + // Check (1). + if (!AreCastCompatible({then_tensor, else_tensor})) + return op.emitOpError() << "requires t and e have compatible shapes"; + + // Get data rank (if exists). + int data_rank; + // If data is unranked or data_rank is 0, this will remain -2. Otherwise + // refers to first dimension of then and/or else. 
+ int data_first_dim = -2; + bool then_has_rank = then_tensor.hasRank(); + bool else_has_rank = else_tensor.hasRank(); + if (then_has_rank && else_has_rank) { + data_rank = then_tensor.getRank(); + if (then_tensor.getRank() > 0) + data_first_dim = then_tensor.getShape().front(); + if (else_tensor.getRank() > 0) + data_first_dim = std::max( + static_cast(else_tensor.getShape().front()), data_first_dim); + } else if (then_has_rank) { + data_rank = then_tensor.getRank(); + if (then_tensor.getRank() > 0) + data_first_dim = then_tensor.getShape().front(); + } else if (else_has_rank) { + data_rank = else_tensor.getRank(); + if (else_tensor.getRank() > 0) + data_first_dim = else_tensor.getShape().front(); + } else { + // Neither has a rank. + return success(); + } + + auto cond_tensor = op.condition().getType().dyn_cast(); + if (!cond_tensor) return success(); + auto cond_rank = cond_tensor.getRank(); + // Check (2a) and (2b). + if (cond_rank == 0 || cond_rank == data_rank) return success(); + // Check (2c). + if (cond_rank == 1) { + auto cond_shape = cond_tensor.getShape().front(); + if (data_rank == 0) { + return op.emitOpError() + << "requires that t and e are nonscalar when pred is a vector"; + } + // We know `data` tensor has a rank of at least 1. + if (data_first_dim != -1 && cond_shape != -1 && + data_first_dim != cond_shape) { + return op.emitOpError() << "requires that, when pred is a vector, the " + "shape matches the first dimension of t and e"; + } + return success(); + } + // None of (2a,b,c) were true; fail. + return op.emitOpError() << "requires that pred is a scalar OR has the same " + "rank as t and e OR is a vector"; +} + //===----------------------------------------------------------------------===// // SelectV2Op //===----------------------------------------------------------------------===// @@ -2399,7 +2685,7 @@ static Type InferSelectV2OpType(Value condition, Value e, Value t) { return RankedTensorType::get(result_shape, element_ty); } -void SelectV2Op::build(Builder *builder, OperationState &result, +void SelectV2Op::build(OpBuilder &builder, OperationState &result, Value condition, Value e, Value t) { build(builder, result, InferSelectV2OpType(condition, e, t), condition, e, t); } @@ -2417,7 +2703,8 @@ LogicalResult VerifyShapeOperandAndResult(Operation *op, Type operand_type, variadic_idx < 0 ? "" : llvm::formatv(" #{0}", variadic_idx).str(); auto result_ranked_type = result_type.dyn_cast(); - if (!result_ranked_type || result_ranked_type.getShape().size() != 1) + if (!result_ranked_type) return success(); + if (result_ranked_type.getShape().size() != 1) return op->emitOpError("requires 1D type for result") << variadic_idx_str; auto operand_ranked_type = operand_type.dyn_cast_or_null(); @@ -2431,9 +2718,12 @@ LogicalResult VerifyShapeOperandAndResult(Operation *op, Type operand_type, << variadic_idx_str << " to match rank of operand" << variadic_idx_str; } else if (result_ranked_type.hasStaticShape()) { - // The operand is an unranked tensor, verify that the result is dynamic. - return op->emitOpError("requires dynamic shape result") - << variadic_idx_str << " for unranked operand" << variadic_idx_str; + // The operand is an unranked tensor, print a warning if the result + // is static. + // Note: We do not handle this situation as an error, this would be too + // restrictive due to incompleteness of shape inference at this point. 
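The SelectOp verifier above accepts exactly three predicate shapes: a scalar, a tensor of the same rank as t/e, or a vector whose length matches the first dimension of t/e. A compact standalone sketch of that check over plain integers (hypothetical helper, not code from this patch; -1 stands for an unknown size):

#include <cstdint>

// Returns true if a Select predicate with rank `cond_rank` (and, for vectors,
// length `cond_len`) is legal for data of rank `data_rank` whose first
// dimension is `data_first_dim`. A size of -1 means the size is unknown.
bool SelectPredShapeOk(int cond_rank, int64_t cond_len, int data_rank,
                       int64_t data_first_dim) {
  if (cond_rank == 0 || cond_rank == data_rank) return true;  // (2a), (2b)
  if (cond_rank != 1) return false;
  if (data_rank == 0) return false;  // a vector pred needs non-scalar data
  // (2c): sizes must match whenever both are statically known.
  return cond_len == -1 || data_first_dim == -1 || cond_len == data_first_dim;
}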
+ op->emitWarning("has static shape result") + << variadic_idx_str << " for unranked operand" << variadic_idx_str; } Type element_type = result_ranked_type.getElementType(); @@ -2475,12 +2765,12 @@ OpFoldResult ShapeOp::fold(ArrayRef operands) { return ConvertShapeToAttr(getOperand().getType(), width); } -void ShapeOp::build(Builder *builder, OperationState &result, Value input, +void ShapeOp::build(OpBuilder &builder, OperationState &result, Value input, BoolAttr use32Bit) { auto rankedTensorType = input.getType().dyn_cast(); int64_t rank = rankedTensorType ? rankedTensorType.getRank() : -1; - auto out_type = use32Bit.getValue() ? builder->getIntegerType(32) - : builder->getIntegerType(64); + auto out_type = use32Bit.getValue() ? builder.getIntegerType(32) + : builder.getIntegerType(64); return ShapeOp::build(builder, result, RankedTensorType::get({rank}, out_type), input); } @@ -2822,14 +3112,18 @@ void SubOp::getCanonicalizationPatterns(OwningRewritePatternList &results, results.insert(context); } +OpFoldResult SubOp::fold(ArrayRef operands) { + return IdentityArithmeticOpFolder(*this, operands); +} + //===----------------------------------------------------------------------===// // SumOp //===----------------------------------------------------------------------===// -void SumOp::build(Builder *builder, OperationState &result, Value input, +void SumOp::build(OpBuilder &builder, OperationState &result, Value input, Value reduction_indices, BoolAttr keep_dims) { Type out_ty = - InferReductionOpType(input, reduction_indices, keep_dims, builder); + InferReductionOpType(input, reduction_indices, keep_dims, &builder); build(builder, result, out_ty, input, reduction_indices, keep_dims); } @@ -2837,6 +3131,12 @@ void SumOp::build(Builder *builder, OperationState &result, Value input, // StridedSliceOp //===----------------------------------------------------------------------===// +// TODO(b/154160827): Add a canonicalization pattern from tf.StridedSliceOp to +// tf.SliceOp if both of the following are true: +// - All strides have a known value equal to 1 +// - No masks are set (or masks can be applied by transforming the inputs to +// Slice) + // Verifies that, // // - begin, end and strides operands are 1D and they have the same number of @@ -3335,7 +3635,7 @@ static LogicalResult Verify(TransposeOp op) { } // TODO(jpienaar): perm could be optional too. -void TransposeOp::build(Builder *builder, OperationState &result, Value x, +void TransposeOp::build(OpBuilder &builder, OperationState &result, Value x, Value perm) { auto x_type = x.getType().cast(); // If value is unranked, then so is results. @@ -3594,7 +3894,7 @@ static LogicalResult Verify(WhileOp op) { auto aType = a.second[idx]; auto bType = b.second[idx]; - if (!AreCastCompatible(aType, bType)) + if (!AreCastCompatible({aType, bType})) return op.emitError(llvm::formatv( "{0} type {1} is incompatible with {2} type {3} at index {4}", a.first, aType, b.first, bType, idx)); @@ -3679,12 +3979,132 @@ TensorFlowDialect::TensorFlowDialect(MLIRContext *context) #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.def" >(); addInterfaces(); + addAttributes(); // Support unknown operations because not all TensorFlow operations are // registered. 
allowUnknownOperations(); } +namespace { + +ShapeAttr ParseShapeAttr(MLIRContext *context, StringRef spec, Location loc) { + auto emit_error = [&, spec]() { + emitError(loc, "invalid TensorFlow shape attribute: ") << spec; + return nullptr; + }; + + if (!spec.consume_front("shape<")) return emit_error(); + + if (spec.consume_front("*>")) + return mlir::TF::ShapeAttr::get(context, llvm::None); + + SmallVector shape; + while (!spec.consume_front(">")) { + int64_t dim; + + if (spec.consume_front("?")) + dim = -1; + else if (spec.consumeInteger(10, dim) || dim < 0) + return emit_error(); + + spec.consume_front("x"); + + shape.push_back(dim); + } + + return mlir::TF::ShapeAttr::get(context, llvm::makeArrayRef(shape)); +} + +void PrintShapeAttr(ShapeAttr attr, DialectAsmPrinter &os) { // NOLINT + os << "shape"; + + os << "<"; + if (attr.hasRank()) { + auto print_dim = [&](int64_t dim) { + if (dim > -1) + os << dim; + else + os << "?"; + }; + llvm::interleave(attr.getShape(), os, print_dim, "x"); + } else { + os << "*"; + } + os << ">"; +} + +// Parses a #tf.func attribute of the following format: +// +// #tf.func<@symbol, {attr = "value"}> +// +// where the first element is a SymbolRefAttr and the second element is a +// DictionaryAttr. +FuncAttr ParseFuncAttr(MLIRContext *context, StringRef spec, Location loc) { + auto emit_error = [&, spec]() { + emitError(loc, "invalid TensorFlow func attribute: ") << spec; + return nullptr; + }; + + if (!spec.consume_front("func<")) return emit_error(); + + size_t func_name_num_read = 0; + Attribute func_name_attr = + mlir::parseAttribute(spec, context, func_name_num_read); + if (!func_name_attr || !func_name_attr.isa()) + return emit_error(); + spec = spec.drop_front(func_name_num_read); + + if (!spec.consume_front(", ")) return emit_error(); + + size_t func_attrs_num_read = 0; + Attribute func_attrs_attr = + mlir::parseAttribute(spec, context, func_attrs_num_read); + if (!func_attrs_attr || !func_attrs_attr.isa()) + return emit_error(); + spec = spec.drop_front(func_attrs_num_read); + + if (!spec.consume_front(">")) return emit_error(); + + return mlir::TF::FuncAttr::get(context, func_name_attr.cast(), + func_attrs_attr.cast()); +} + +// Prints a #tf.func attribute of the following format: +// +// #tf.func<@symbol, {attr = "value"}> +void PrintFuncAttr(FuncAttr attr, DialectAsmPrinter &os) { + os << "func<" << attr.GetName() << ", " << attr.GetAttrs() << ">"; +} + +} // namespace + +Attribute TensorFlowDialect::parseAttribute(DialectAsmParser &parser, + Type type) const { + auto spec = parser.getFullSymbolSpec(); + Location loc = parser.getEncodedSourceLoc(parser.getNameLoc()); + + if (spec.startswith("shape")) return ParseShapeAttr(getContext(), spec, loc); + + if (spec.startswith("func")) return ParseFuncAttr(getContext(), spec, loc); + + return (emitError(loc, "unknown TensorFlow attribute: " + spec), nullptr); +} + +void TensorFlowDialect::printAttribute(Attribute attr, + DialectAsmPrinter &os) const { + switch (attr.getKind()) { + case AttrKind::SHAPE: + PrintShapeAttr(attr.cast(), os); + break; + case AttrKind::FUNC: + PrintFuncAttr(attr.cast(), os); + break; + default: + llvm_unreachable("unexpected tensorflow attribute kind"); + } +} + // Parses a type registered to this dialect. 
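The shape attribute round-tripped above uses a compact textual form: shape<*> for an unranked shape, otherwise the dimensions joined by x with ? for dynamic sizes (the #tf. prefix is supplied by the dialect attribute machinery). A standalone sketch of the printer side over a plain vector, with -1 as the dynamic marker (hypothetical helper, not part of this patch):

#include <cstdint>
#include <optional>
#include <string>
#include <vector>

// Renders the body of a #tf.shape attribute: "shape<*>" when unranked,
// otherwise dims joined with 'x' and '?' for dynamic (-1) dimensions,
// e.g. {10, -1, 4} -> "shape<10x?x4>".
std::string ShapeAttrSpec(const std::optional<std::vector<int64_t>> &shape) {
  if (!shape) return "shape<*>";
  std::string spec = "shape<";
  for (size_t i = 0; i < shape->size(); ++i) {
    if (i > 0) spec += "x";
    spec += (*shape)[i] < 0 ? "?" : std::to_string((*shape)[i]);
  }
  return spec + ">";
}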
Type TensorFlowDialect::parseType(DialectAsmParser &parser) const { StringRef data; diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h index 8dc8fb351f2..88307267ab4 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h @@ -30,7 +30,9 @@ limitations under the License. #include "mlir/IR/TypeUtilities.h" // from @llvm-project #include "mlir/Interfaces/CallInterfaces.h" // from @llvm-project #include "mlir/Interfaces/DerivedAttributeOpInterface.h" // from @llvm-project -#include "mlir/Interfaces/SideEffects.h" // from @llvm-project +#include "mlir/Interfaces/InferTypeOpInterface.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_structs.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" @@ -55,6 +57,10 @@ class TensorFlowDialect : public Dialect { // Returns the string description of stateful attribute. static StringRef GetStatefulAttrName() { return "tf.signature.is_stateful"; } + Attribute parseAttribute(DialectAsmParser &parser, Type type) const override; + + void printAttribute(Attribute attr, DialectAsmPrinter &os) const override; + // Parse a type registered to this dialect. Type parseType(DialectAsmParser &parser) const override; diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td index fc60a76e092..94b0c5f5e19 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td @@ -28,7 +28,9 @@ limitations under the License. #define TF_OPS include "tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td" +include "tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td" include "mlir/Interfaces/CallInterfaces.td" +include "mlir/Interfaces/InferTypeOpInterface.td" include "mlir/IR/OpBase.td" class TF_TensorListInitOp : TF_Op { @@ -39,6 +41,9 @@ class TF_TensorListInitOp : TF_Op { TF_DerivedOperandTypeAttr shape_type = TF_DerivedOperandTypeAttr<0>; let verifier = [{ + // This is required to populate derived attributes during export in a + // meaningful way. Else during export to GraphDef element_type() query + // will result in out of bounds access/assert. if (handle_dtype().getSubtypes().size() != 1) { return emitOpError( "must have exactly one subtype in the result variant type"); @@ -64,7 +69,8 @@ class TF_TensorListInitOp : TF_Op { // In MLIR, the TensorFlow tensor value is represented as an ElementsAttr, with // its type encoding the tensor's shape and data type. 
-def TF_ConstOp : TF_Op<"Const", [ConstantLike, NoSideEffect]> { +def TF_ConstOp : TF_Op<"Const", [ConstantLike, NoSideEffect, + DeclareOpInterfaceMethods]> { let summary = "Constant tensor op"; let arguments = (ins @@ -79,12 +85,18 @@ def TF_ConstOp : TF_Op<"Const", [ConstantLike, NoSideEffect]> { let builders = [ OpBuilder< - "Builder *builder, OperationState &result, Attribute value">, + "OpBuilder &builder, OperationState &result, Attribute value">, OpBuilder< - "Builder *builder, OperationState &result, Type type, Attribute value">, + "OpBuilder &builder, OperationState &result, Type type, Attribute value">, ]; let hasFolder = 1; + + let extraClassDeclaration = [{ + static bool isCompatibleReturnTypes(ArrayRef l, ArrayRef r) { + return BroadcastCompatible(l, r); + } + }]; } def TF_DataFormatVecPermuteOp : TF_Op<"DataFormatVecPermute", [NoSideEffect, SameOperandsAndResultType]> { @@ -176,7 +188,7 @@ else_branch: A function that takes 'inputs' and returns a list of FlatSymbolRefAttr:$then_branch, FlatSymbolRefAttr:$else_branch, - DefaultValuedAttr:$output_shapes, + DefaultValuedAttr:$output_shapes, // Used to map StatelessIf and If op defined in TensorFlow to a common op. BoolAttr:$is_stateless @@ -279,6 +291,7 @@ def TF_ParseExampleV2Op : TF_Op<"ParseExampleV2", Variadic>:$dense_defaults, Confined]>:$num_sparse, + TF_ShapeAttrArray:$dense_shapes, I32ElementsAttr:$result_segment_sizes ); @@ -479,7 +492,7 @@ body: A function that takes a list of tensors and returns another FlatSymbolRefAttr:$cond, FlatSymbolRefAttr:$body, - DefaultValuedAttr:$output_shapes, + DefaultValuedAttr:$output_shapes, DefaultValuedAttr:$parallel_iterations, // Used to map StatelessWhile and While op defined in TensorFlow to a common @@ -613,29 +626,6 @@ def TF_FusedBatchNormExOp : TF_Op<"_FusedBatchNormEx", [NoSideEffect]> { TF_DerivedOperandSizeAttr num_side_inputs = TF_DerivedOperandSizeAttr<5>; } -def TF_RecvTPUEmbeddingActivationsOp : TF_Op<"RecvTPUEmbeddingActivations", []> { - let summary = "An op that receives embedding activations on the TPU."; - - let description = [{ -The TPU system performs the embedding lookups and aggregations specified by -the arguments to TPUEmbeddingEnqueue(Integer/Sparse/SparseTensor)Batch. The -results of these aggregations are visible to the Tensorflow Graph as the -outputs of a RecvTPUEmbeddingActivations op. This op returns a list containing -one Tensor of activations per table specified in the model. There can be at -most one RecvTPUEmbeddingActivations op in the TPU graph. - }]; - - let arguments = (ins - StrAttr:$config - ); - - let results = (outs - Variadic:$outputs - ); - - TF_DerivedResultSizeAttr num_outputs = TF_DerivedResultSizeAttr<0>; -} - // Multiple variadic operands with different sizes are not supported by the // dialect generator, so we manually added the op. def TF_SendTPUEmbeddingGradientsOp : TF_Op<"SendTPUEmbeddingGradients", [AttrSizedOperandSegments]> { @@ -667,4 +657,277 @@ config: Serialized TPUEmbeddingConfiguration proto. TF_DerivedOperandSizeAttr NN = TF_DerivedOperandSizeAttr<1>; } +// Multiple variadic operands with different sizes are not supported by the +// dialect generator, so we manually added the op. 
+def TF__SendTPUEmbeddingGradientsOp : TF_Op<"_SendTPUEmbeddingGradients", [AttrSizedOperandSegments]> { + let summary = "Performs gradient updates of embedding tables."; + + let description = [{ +The gradients argument is a TensorList having the same length and shapes as the +return value of _RecvTPUEmbeddingActivations, but contains gradients of the +model's loss with respect to the embedding activations. The embedding tables are +updated from these gradients via the optimizer specified in the +TPUEmbeddingConfiguration proto given to tpu.initialize_system. + +gradients: A TensorList of gradients with which to update embedding tables. +learning_rates: A TensorList of learning rates used for updating the embedding + tables via the optimizer. The length of the TensorList must be equal to the + number of dynamic learning rate tags specified in the + TPUEmbeddingConfiguration proto. +deduplication_data: A Tensor with type=DT_VARIANT containing the deduplication + data. The tensor is an XLA nested tuple containing N elements. Each + element of the nested tuple is a tuple of rank 1 tensors. Each tensor either + contains indices (DT_INT32) for embedding lookup or weights (DT_FLOAT) to + apply to the output of the embedding lookup operation. +config: Serialized TPUEmbeddingConfiguration proto. + }]; + + let arguments = (ins + Variadic:$gradients, + Variadic:$learning_rates, + TF_VariantTensor:$deduplication_data, + StrAttr:$config + ); + + TF_DerivedOperandSizeAttr NumTables = TF_DerivedOperandSizeAttr<0>; + TF_DerivedOperandSizeAttr NumLearningRateTags = TF_DerivedOperandSizeAttr<1>; +} + +// Updated the op description text from the auto-generated op definition. +def TF__RecvTPUEmbeddingDeduplicationDataOp : TF_Op<"_RecvTPUEmbeddingDeduplicationData", []> { + let summary = [{ +Receives deduplication data (indices and weights). + }]; + + let description = [{ +The deduplication data is a Tensor with type=DT_VARIANT. The tensor itself is an +XLA nested tuple containing N elements. Each element of the nested tuple is a +tuple of rank 1 tensors. Each tensor either contains indices (DT_INT32) for +embedding lookup or weights (DT_FLOAT) to apply to the output of the embedding +lookup operation. + }]; + + let arguments = (ins + StrAttr:$config + ); + + let results = (outs + TF_VariantTensor:$output + ); +} + +def TF_XlaShardingOp : TF_Op<"XlaSharding", [NoSideEffect]> { + let summary = [{ +An op which shards the input based on the given sharding attribute. + }]; + + let description = [{ + }]; + + let arguments = (ins + TF_Tensor:$input, + + OptionalAttr:$_XlaSharding + ); + + let results = (outs + TF_Tensor:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + +def TF_InfeedDequeueTupleOp : TF_Op<"InfeedDequeueTuple", []> { + let summary = "Fetches multiple values from infeed as an XLA tuple."; + + let description = [{ + }]; + + let arguments = (ins + OptionalAttr:$_XlaSharding + ); + + let results = (outs + Variadic:$outputs + ); + + TF_DerivedResultShapeListAttr shapes = TF_DerivedResultShapeListAttr<0>; + TF_DerivedResultTypeListAttr dtypes = TF_DerivedResultTypeListAttr<0>; +} + +def TF_StringFormatOp : TF_Op<"StringFormat", [NoSideEffect]> { + let summary = "Formats a string template using a list of tensors."; + + let description = [{ +Formats a string template using a list of tensors, pretty-printing tensor summaries. 
+ }]; + + let arguments = (ins + Variadic:$inputs, + + DefaultValuedAttr:$strtemplate, + DefaultValuedAttr:$placeholder, + DefaultValuedAttr:$summarize + ); + + let results = (outs + TF_StrTensor:$output + ); + + TF_DerivedOperandTypeListAttr T = TF_DerivedOperandTypeListAttr<0>; +} + +//===----------------------------------------------------------------------===// +// tf.data ops +//===----------------------------------------------------------------------===// + +def TF_BatchDatasetV2Op : TF_Op<"BatchDatasetV2", [NoSideEffect]> { + let summary = [{ +Creates a dataset that batches `batch_size` elements from `input_dataset`. + }]; + + let description = [{ + }]; + + let arguments = (ins + TF_VariantTensor:$input_dataset, + I64Tensor:$batch_size, + I1Tensor:$drop_remainder, + + DefaultValuedAttr:$parallel_copy, + Confined]>:$output_types, + Confined]>:$output_shapes + ); + + let results = (outs + TF_VariantTensor:$handle + ); +} + +def TF_MapDatasetOp : TF_Op<"MapDataset", [NoSideEffect]> { + let summary = [{ + Creates a dataset that applies `f` to the outputs of `input_dataset`. + }]; + + let arguments = (ins + TF_VariantTensor:$input_dataset, + Variadic:$other_arguments, + + SymbolRefAttr:$f, + Confined]>:$output_types, + Confined]>:$output_shapes, + DefaultValuedAttr:$use_inter_op_parallelism, + DefaultValuedAttr:$preserve_cardinality + ); + + let results = (outs + TF_VariantTensor:$handle + ); + + TF_DerivedOperandTypeListAttr Targuments = TF_DerivedOperandTypeListAttr<1>; +} + +def TF_MapAndBatchDatasetOp : TF_Op<"MapAndBatchDataset", [NoSideEffect]> { + let summary = "Creates a dataset that fuses mapping with batching."; + + let description = [{ +Creates a dataset that applies `f` to the outputs of `input_dataset` and then +batches `batch_size` of them. + +Unlike a "MapDataset", which applies `f` sequentially, this dataset invokes up +to `batch_size * num_parallel_batches` copies of `f` in parallel. + }]; + + let arguments = (ins + TF_VariantTensor:$input_dataset, + Variadic:$other_arguments, + I64Tensor:$batch_size, + I64Tensor:$num_parallel_calls, + I1Tensor:$drop_remainder, + + SymbolRefAttr:$f, + Confined]>:$output_types, + Confined]>:$output_shapes, + DefaultValuedAttr:$preserve_cardinality + ); + + let results = (outs + TF_VariantTensor:$handle + ); + + TF_DerivedOperandTypeListAttr Targuments = TF_DerivedOperandTypeListAttr<1>; +} + +def TF_ParallelMapDatasetOp : TF_Op<"ParallelMapDataset", [NoSideEffect]> { + let summary = [{ + Creates a dataset that applies `f` to the outputs of `input_dataset`. + }]; + + let description = [{ + Unlike a "MapDataset", which applies `f` sequentially, this dataset invokes + up to `num_parallel_calls` copies of `f` in parallel. + }]; + + let arguments = (ins + TF_VariantTensor:$input_dataset, + Variadic:$other_arguments, + I32Tensor:$num_parallel_calls, + + SymbolRefAttr:$f, + Confined]>:$output_types, + Confined]>:$output_shapes, + DefaultValuedAttr:$use_inter_op_parallelism, + DefaultValuedAttr:$sloppy, + DefaultValuedAttr:$preserve_cardinality + ); + + let results = (outs + TF_VariantTensor:$handle + ); + + TF_DerivedOperandTypeListAttr Targuments = TF_DerivedOperandTypeListAttr<1>; +} + +def TF_TensorSliceDatasetOp : TF_Op<"TensorSliceDataset", []> { + let summary = [{ + Creates a dataset that emits each dim-0 slice of `components` once. 
+ }]; + + let arguments = (ins + Variadic:$components, + Confined]>:$output_shapes + ); + + let results = (outs + TF_VariantTensor:$handle + ); + + TF_DerivedOperandTypeListAttr Toutput_types = TF_DerivedOperandTypeListAttr<0>; +} + +// TODO(b/156507832): Move tf.InplaceUpdate to tf_generated_ops.td once +// autogenerated op def matches. +def TF_InplaceUpdateOp : TF_Op<"InplaceUpdate", [NoSideEffect]> { + let summary = "Updates specified rows 'i' with values 'v'."; + + let description = [{ +Computes `x[i, :] = v; return x`. + +Originally this function is mutative however for compilation we make this +operation create / operate on a copy of `x`. + }]; + + let arguments = (ins + TF_Tensor:$x, + I32Tensor:$i, + TF_Tensor:$v + ); + + let results = (outs + TF_Tensor:$y + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + #endif // TF_OPS diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h index 85c6819a8b4..f488171d1e1 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h @@ -39,6 +39,7 @@ static inline LogicalResult VerifyRefTypeMatch(mlir::Type type, // This class provides verification for ops that are known to have the same // result types and all operands are either of the same type as result or a REF // type corresponding to the result type. +// TODO(jpienaar): Update the name and the description. template class OperandsSameAsResultsTypeOrRef : public TraitBase { @@ -46,23 +47,19 @@ class OperandsSameAsResultsTypeOrRef static LogicalResult verifyTrait(Operation* op) { LogicalResult shapeMatch = impl::verifySameOperandsAndResultShape(op); if (failed(shapeMatch)) return shapeMatch; - - auto type = getElementTypeOrSelf(op->getResult(0).getType()); - + Type type = op->getResult(0).getType(); // Verify that the first result type is same as the rest of the results. // We skip the comparison against itself. - for (auto resultType : llvm::drop_begin(op->getResultTypes(), 1)) { - resultType = getElementTypeOrSelf(resultType); - if (resultType != type) - return op->emitOpError() << "requires the same type for all results"; + for (auto result_type : llvm::drop_begin(op->getResultTypes(), 1)) { + if (!mlir::TF::HasCompatibleElementTypes(type, result_type)) + return op->emitOpError() + << "requires all return types to have compatible element types"; } - - for (auto opType : op->getOperandTypes()) { - opType = getElementTypeOrSelf(opType); - if (opType != type && failed(VerifyRefTypeMatch(type, opType))) { - return op->emitError() << "requires all operands to be either same " - "as or ref type of results"; - } + for (auto operand_type : op->getOperandTypes()) { + if (!mlir::TF::HasCompatibleElementTypes( + operand_type, type, /*may_ignore_ref_type_lhs=*/true)) + return op->emitError() << "requires all operands and results to have " + "compatible element types"; } return success(); } diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_types.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_types.cc index 188bc67f70e..d312e5e409b 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_types.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_types.cc @@ -16,6 +16,7 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" #include "llvm/Support/ErrorHandling.h" +#include "mlir/Dialect/Traits.h" // from @llvm-project #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/IR/TypeUtilities.h" // from @llvm-project @@ -27,6 +28,134 @@ llvm::Optional> GetShape(mlir::Value value) { if (shaped_type.hasRank()) return shaped_type.getShape(); return llvm::None; } + +// Merges cast compatible shapes and returns a more refined shape. The two +// shapes are cast compatible if they have the same rank and at each dimension, +// either both have same size or one of them is dynamic. Returns false if the +// given shapes are not cast compatible. The refined shape is same or more +// precise than the two input shapes. +bool GetCastCompatibleShape(llvm::ArrayRef a_shape, + llvm::ArrayRef b_shape, + llvm::SmallVectorImpl* refined_shape) { + if (a_shape.size() != b_shape.size()) return false; + int64_t rank = a_shape.size(); + refined_shape->reserve(rank); + for (auto dims : llvm::zip(a_shape, b_shape)) { + int64_t dim1 = std::get<0>(dims); + int64_t dim2 = std::get<1>(dims); + + if (mlir::ShapedType::isDynamic(dim1)) { + refined_shape->push_back(dim2); + continue; + } + if (mlir::ShapedType::isDynamic(dim2)) { + refined_shape->push_back(dim1); + continue; + } + if (dim1 == dim2) { + refined_shape->push_back(dim1); + continue; + } + return false; + } + return true; +} + +// Given two types `a` and `b`, returns a refined type which is cast compatible +// with both `a` and `b` and is equal to or more precise than both of them. It +// returns empty Type if the input types are not cast compatible. +// +// The two types are considered cast compatible if they have dynamically equal +// shapes and element type. For element types that do not have subtypes, they +// must be equal. However for TensorFlow types such as Resource and Variant, +// that also have subtypes, we recursively check for subtype compatibilty for +// Resource types and assume all variant types are cast compatible. If either +// one of `a` or `b` have empty subtypes, they are considered cast compatible. +// +// The returned type is same or more precise than the input types. For example, +// if `a` and `b` are cast compatible types tensor<2x?x?xf32> and +// tensor respectively, the returned type is tensor<2x4x?xf32>. +// +// Provides option to ignore ref types on 'a'. This is useful for TF ops that +// might allow operands to either be same as result type or be a ref type +// corresponding to it. +mlir::Type GetCastCompatibleType(mlir::Type a, mlir::Type b, + bool may_ignore_ref_type_a) { + // Fast path if everything is equal. + if (a == b) return b; + + auto a_tt = a.dyn_cast(); + auto b_tt = b.dyn_cast(); + + // If only one of a or b is a tensor type, they are incompatible. + if (static_cast(a_tt) ^ static_cast(b_tt)) return nullptr; + + // For non-tensor types, we do not need to worry about shape and can return + // early. + if (!a_tt && !b_tt) { + // Remove ref types. + if (may_ignore_ref_type_a) { + if (auto ref_type = a.dyn_cast()) { + a = ref_type.RemoveRef(); + if (a == b) return a; + } + } + if (a.getKind() != b.getKind()) return nullptr; + + // If either is not a type that contain subtypes then the types are not cast + // compatible. + auto a_wst = a.dyn_cast(); + auto b_wst = b.dyn_cast(); + if (!a_wst || !b_wst) return nullptr; + + // For Variant types we are more permissive right now and accept all pairs + // of Variant types. 
If we are more constrained and check compatibility of
+ subtypes, we might reject valid graphs.
+ // TODO(prakalps): Variant doesn't have a subtype; we assign it
+ // one, so we should only assign it one when we know the subtype. Then we
+ // can be more constrained and check subtypes for cast compatibility as
+ // well.
+ if (a.isa()) return a;
+
+ // For Resource types, we recursively check the subtypes for cast
+ // compatibility, if possible. Otherwise treat them as compatible.
+ auto a_wst_st = a_wst.GetSubtypes();
+ auto b_wst_st = b_wst.GetSubtypes();
+ if (a_wst_st.empty() || b_wst_st.empty()) return a;
+ if (a_wst_st.size() != b_wst_st.size()) return nullptr;
+ llvm::SmallVector refined_subtypes;
+ for (auto subtypes : llvm::zip(a_wst_st, b_wst_st)) {
+ mlir::Type refined_st =
+ GetCastCompatibleType(std::get<0>(subtypes), std::get<1>(subtypes),
+ /*may_ignore_ref_type_a=*/false);
+ if (!refined_st) return nullptr;
+ refined_subtypes.push_back(refined_st.cast());
+ }
+
+ return mlir::TF::ResourceType::get(refined_subtypes, a.getContext());
+ }
+
+ // For tensor types, check compatibility of both element type and shape.
+ mlir::Type refined_element_ty = GetCastCompatibleType(
+ a_tt.getElementType(), b_tt.getElementType(), may_ignore_ref_type_a);
+ if (!refined_element_ty) return nullptr;
+
+ if (!a_tt.hasRank() && !b_tt.hasRank()) {
+ return mlir::UnrankedTensorType::get(refined_element_ty);
+ }
+ if (!a_tt.hasRank()) {
+ return mlir::RankedTensorType::get(b_tt.getShape(), refined_element_ty);
+ }
+ if (!b_tt.hasRank()) {
+ return mlir::RankedTensorType::get(a_tt.getShape(), refined_element_ty);
+ }
+
+ llvm::SmallVector refined_shape;
+ if (!GetCastCompatibleShape(a_tt.getShape(), b_tt.getShape(), &refined_shape))
+ return nullptr;
+
+ return mlir::RankedTensorType::get(refined_shape, refined_element_ty);
+}
 } // namespace
 namespace mlir {
@@ -161,5 +290,81 @@ Type TensorFlowTypeWithSubtype::RemoveSubtypes() {
 }
 }
+ArrayRef TensorFlowTypeWithSubtype::GetSubtypes() {
+ switch (getKind()) {
+ case TensorFlowTypes::VARIANT:
+ return this->cast().getSubtypes();
+ case TensorFlowTypes::RESOURCE:
+ return this->cast().getSubtypes();
+ default:
+ llvm_unreachable("unexpected tensorflow type with subtypes kind");
+ }
+}
+
+// TODO(jpienaar): BroadcastCompatible and HasCompatibleElementTypes have a
+// similar structure that could be extracted into a helper method.
+bool BroadcastCompatible(ArrayRef lhs, ArrayRef rhs) {
+ if (lhs.size() != rhs.size()) return false;
+ for (auto types : llvm::zip(lhs, rhs)) {
+ auto lhs_type = std::get<0>(types);
+ auto rhs_type = std::get<1>(types);
+
+ // This should be true for all TF ops:
+ auto lhs_tt = lhs_type.dyn_cast();
+ auto rhs_tt = rhs_type.dyn_cast();
+ if (!lhs_tt || !rhs_tt) {
+ if (lhs_type != rhs_type) return false;
+ continue;
+ }
+
+ // Verify matching element types. These should be identical, except for
+ // variant types, where an unknown subtype is considered compatible with
+ // all subtypes.
+ auto lhs_et = lhs_tt.getElementType();
+ auto rhs_et = rhs_tt.getElementType();
+ if (lhs_et != rhs_et) {
+ // If either does not have subtypes, then the element types don't match.
+ auto lhs_wst = lhs_et.dyn_cast();
+ auto rhs_wst = rhs_et.dyn_cast();
+ if (!lhs_wst || !rhs_wst) return false;
+
+ // Consider the subtypes of variant types.
+ auto lhs_wst_st = lhs_wst.GetSubtypes();
+ auto rhs_wst_st = rhs_wst.GetSubtypes();
+ if (!lhs_wst_st.empty() && !rhs_wst_st.empty()) {
+ for (auto subtypes : llvm::zip(lhs_wst_st, rhs_wst_st)) {
+ if (!BroadcastCompatible(std::get<0>(subtypes),
+ std::get<1>(subtypes)))
+ return false;
+ }
+ }
+ }
+
+ auto lhs_rt = lhs_type.dyn_cast();
+ auto rhs_rt = rhs_type.dyn_cast();
+ if (!lhs_rt || !rhs_rt) return true;
+ SmallVector shape;
+ return OpTrait::util::getBroadcastedShape(lhs_rt.getShape(),
+ rhs_rt.getShape(), shape);
+ }
+ return true;
+}
+
+bool HasCompatibleElementTypes(Type lhs, Type rhs,
+ bool may_ignore_ref_type_lhs) {
+ return GetCastCompatibleType(lhs, rhs, may_ignore_ref_type_lhs) != nullptr;
+}
+
+bool AreCastCompatible(ArrayRef types) {
+ Type common = types.front();
+ for (auto type : types.drop_front()) {
+ Type refined_type =
+ GetCastCompatibleType(common, type, /*may_ignore_ref_type_a=*/false);
+ if (!refined_type) return false;
+ common = refined_type;
+ }
+ return true;
+}
+
 } // namespace TF
 } // namespace mlir
diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_types.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_types.h
index c5225a34fb4..4c99aae4706 100644
--- a/tensorflow/compiler/mlir/tensorflow/ir/tf_types.h
+++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_types.h
@@ -264,6 +264,9 @@ class TensorFlowTypeWithSubtype : public TensorFlowType {
 // Converts a TypeWithSubtype type to the same type but without its subtypes.
 Type RemoveSubtypes();
+
+ // Returns the subtypes.
+ ArrayRef GetSubtypes();
 };
 // Returns the corresponding TensorFlow type with subtypes but without its
@@ -295,6 +298,27 @@ class VariantType : public detail::TypeWithSubtypeImpl {
 static std::string getTypeName() { return "VariantType"; }
 };
+// Returns whether two arrays of Type are broadcast compatible.
+bool BroadcastCompatible(ArrayRef lhs, ArrayRef rhs);
+
+// Returns whether the two types have compatible element types. The types are
+// compatible if:
+// - they are statically equal, or
+// - they could be dynamically equal:
+//   - dynamic shapes are considered equal unless contradictory information
+//     is known;
+//   - element types are equivalent, modulo subtypes possibly being less
+//     exact (e.g., a resource type without a subtype is considered
+//     compatible with a resource type with a known subtype).
+// Provides an option to ignore ref types on 'lhs'.
+bool HasCompatibleElementTypes(Type lhs, Type rhs,
+ bool may_ignore_ref_type_lhs = false);
+
+// Returns true if all TensorFlow types can be cast to one another. In other
+// words, a single run-time value is legal for all of the types. For example,
+// tensor<*xf32>, tensor and tensor<3xf32> are cast
+// compatible.
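// --- Editorial sketch (not part of this patch) -------------------------------
// A minimal, hypothetical illustration of the compatibility helpers declared
// just below, assuming an mlir::MLIRContext is available and that the
// declarations below are in scope at the call site. Only BroadcastCompatible,
// HasCompatibleElementTypes, and AreCastCompatible come from the patch; every
// other name here is illustrative.
inline bool IllustrateTypeCompatibility(mlir::MLIRContext &context) {
  mlir::Type f32 = mlir::FloatType::getF32(&context);
  mlir::Type unranked = mlir::UnrankedTensorType::get(f32);      // tensor<*xf32>
  mlir::Type partial = mlir::RankedTensorType::get(
      {2, mlir::ShapedType::kDynamicSize}, f32);                 // tensor<2x?xf32>
  mlir::Type ranked = mlir::RankedTensorType::get({2, 3}, f32);  // tensor<2x3xf32>

  // All three types admit the same run-time values, so they are cast
  // compatible; the refined type computed internally is tensor<2x3xf32>.
  bool casts = mlir::TF::AreCastCompatible({unranked, partial, ranked});

  // Both are f32 tensors with cast-compatible shapes.
  bool elements = mlir::TF::HasCompatibleElementTypes(partial, ranked);

  // tensor<2x1xf32> broadcasts against tensor<2x3xf32>.
  mlir::Type column = mlir::RankedTensorType::get({2, 1}, f32);
  bool broadcasts = mlir::TF::BroadcastCompatible({column}, {ranked});

  return casts && elements && broadcasts;  // Expected to be true.
}
// --- End editorial sketch -----------------------------------------------------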
+bool AreCastCompatible(ArrayRef types); + } // end namespace TF } // end namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/tests/annotate-parameter-replication.mlir b/tensorflow/compiler/mlir/tensorflow/tests/annotate-parameter-replication.mlir index 0111d4e4a89..743f0b43b69 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/annotate-parameter-replication.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/annotate-parameter-replication.mlir @@ -10,18 +10,18 @@ module attributes {tf.versions = {producer = 888 : i32}} { %5:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { %2 = "tf._F"(%arg0) : (tensor) -> tensor %3 = "tf.Identity"(%1) : (tensor) -> tensor - %4 = "tf_device.launch_func"(%ri_0, %3, %2) {func = @tpu0_func, device = ""} : (tensor, tensor, tensor) -> tensor + %4 = "tf_device.cluster_func"(%ri_0, %3, %2) {func = @_func, device = ""} : (tensor, tensor, tensor) -> tensor tf_device.return %4 : tensor } %6 = "tf._C"(%5#1) : (tensor) -> tensor return %6 : tensor } - // CHECK-LABEL: func @tpu0_func + // CHECK-LABEL: func @_func // CHECK-SAME: %[[ARG0:.*]]: tensor, // CHECK-SAME: %[[ARG1:.*]]: tensor {tf_device.is_same_data_across_replicas = true} // CHECK-SAME: %[[ARG2:.*]]: tensor) - func @tpu0_func(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor { + func @_func(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor { %0 = "tf._D"(%arg0, %arg1) : (tensor, tensor) -> tensor return %0 : tensor } @@ -46,18 +46,18 @@ module attributes {tf.versions = {producer = 888 : i32}} { [%arg4, %arg5] as %ri_2: tensor>>) {_mirrored_variable_indices = [0, 2], n = 2 : i32} { %0 = "tf.ReadVariableOp"(%ri_0): (tensor>>) -> tensor %1 = "tf.ReadVariableOp"(%ri_1): (tensor>>) -> tensor - %2 = "tf_device.launch_func"(%0, %1, %ri_2) {func = @tpu0_func, device = ""} : (tensor, tensor, tensor>>) -> tensor + %2 = "tf_device.cluster_func"(%0, %1, %ri_2) {func = @_func, device = ""} : (tensor, tensor, tensor>>) -> tensor tf_device.return %2 : tensor } %4 = "tf._C"(%3#1) : (tensor) -> tensor return %4 : tensor } - // CHECK-LABEL: func @tpu0_func + // CHECK-LABEL: func @_func // CHECK-SAME: %[[ARG0:.*]]: tensor {tf_device.is_same_data_across_replicas = true}, // CHECK-SAME: %[[ARG1:.*]]: tensor, // CHECK-SAME: %[[ARG2:.*]]: tensor>> {tf_device.is_same_data_across_replicas = true} - func @tpu0_func(%arg0: tensor, %arg1: tensor, %arg2: tensor>>) -> tensor { + func @_func(%arg0: tensor, %arg1: tensor, %arg2: tensor>>) -> tensor { %0 = "tf._D"(%arg0, %arg1) : (tensor, tensor) -> tensor return %0 : tensor } @@ -65,21 +65,21 @@ module attributes {tf.versions = {producer = 888 : i32}} { // ----- -// Tests that a non-replicated LaunchFuncOp is not annotated. +// Tests that a non-replicated ClusterFuncOp is not annotated. 
module attributes {tf.versions = {producer = 888 : i32}} { // CHECK-LABEL: func @do_not_annotate_without_replicate func @do_not_annotate_without_replicate(%arg0: tensor) -> tensor { %0 = "tf._A"(%arg0) : (tensor) -> tensor %1 = "tf._B"(%arg0) : (tensor) -> tensor - %2 = "tf_device.launch_func"(%0, %1) {func = @tpu0_func, device = ""} : (tensor, tensor) -> tensor + %2 = "tf_device.cluster_func"(%0, %1) {func = @_func, device = ""} : (tensor, tensor) -> tensor %3 = "tf._C"(%2) : (tensor) -> tensor return %3 : tensor } - // CHECK-LABEL: func @tpu0_func + // CHECK-LABEL: func @_func // CHECK-NOT: tf_device.is_same_data_across_replicas - func @tpu0_func(%arg0: tensor, %arg1: tensor) -> tensor { + func @_func(%arg0: tensor, %arg1: tensor) -> tensor { %0 = "tf._D"(%arg0, %arg1) : (tensor, tensor) -> tensor return %0 : tensor } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir b/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir index 7f362a19e04..20f4dd79715 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir @@ -258,6 +258,59 @@ func @testDoubleReciprocal(%arg0: tensor<8x16x32x64xi32>) -> tensor<8x16x32x64xi // CHECK: return %arg0 } +// CHECK-LABEL: testSelectScalarPred +func @testSelectScalarPred(%arg0: tensor, %arg1: tensor<4x2xf16>, %arg2: tensor<4x2xf16>) -> tensor<4x2xf16> { + // CHECK-NEXT: "tf.SelectV2"(%arg0, %arg1, %arg2) : (tensor, tensor<4x2xf16>, tensor<4x2xf16>) -> tensor<4x2xf16> + %0 = "tf.Select"(%arg0, %arg1, %arg2) : (tensor, tensor<4x2xf16>, tensor<4x2xf16>) -> tensor<4x2xf16> + return %0: tensor<4x2xf16> +} + +// CHECK-LABEL: testSelectVectorPred +func @testSelectVectorPred(%arg0: tensor<2xi1>, %arg1: tensor<2x3xf16>, %arg2: tensor<2x3xf16>) -> tensor<2x3xf16> { + // CHECK-NEXT: %[[SHAPE:.*]] = "tf.Const" + // CHECK-NEXT: %[[PRED:.*]] = "tf.Reshape"(%arg0, %[[SHAPE]]) : (tensor<2xi1>, tensor<2xi64>) -> tensor<2x1xi1> + // CHECK-NEXT: "tf.SelectV2"(%[[PRED]], %arg1, %arg2) : (tensor<2x1xi1>, tensor<2x3xf16>, tensor<2x3xf16>) -> tensor<2x3xf16> + %0 = "tf.Select"(%arg0, %arg1, %arg2) : (tensor<2xi1>, tensor<2x3xf16>, tensor<2x3xf16>) -> tensor<2x3xf16> + return %0: tensor<2x3xf16> +} + +// CHECK-LABEL: testSelectAllSameShape +func @testSelectAllSameShape(%arg0: tensor<2x3xi1>, %arg1: tensor<2x3xf16>, %arg2: tensor<2x3xf16>) -> tensor<2x3xf16> { + // CHECK-NEXT: "tf.SelectV2"(%arg0, %arg1, %arg2) : (tensor<2x3xi1>, tensor<2x3xf16>, tensor<2x3xf16>) -> tensor<2x3xf16> + %0 = "tf.Select"(%arg0, %arg1, %arg2) : (tensor<2x3xi1>, tensor<2x3xf16>, tensor<2x3xf16>) -> tensor<2x3xf16> + return %0: tensor<2x3xf16> +} + +// If we don't have guarantees on input shapes, we can't support canonicalizing +// to SelectV2. Test these cases. 
+// CHECK-LABEL: testSelectInvalid +func @testSelectInvalid(%arg0: tensor, %arg1: tensor<2x3xf16>, %arg2: tensor<2x3xf16>) -> tensor<2x3xf16> { + // CHECK-NEXT: tf.Select + %0 = "tf.Select"(%arg0, %arg1, %arg2) : (tensor, tensor<2x3xf16>, tensor<2x3xf16>) -> tensor<2x3xf16> + return %0: tensor<2x3xf16> +} + +// CHECK-LABEL: testSelectInvalidUnranked +func @testSelectInvalidUnranked(%arg0: tensor<6x7xi1>, %arg1: tensor<*xf16>, %arg2: tensor<*xf16>) -> tensor<*xf16> { + // CHECK-NEXT: tf.Select + %0 = "tf.Select"(%arg0, %arg1, %arg2) : (tensor<6x7xi1>, tensor<*xf16>, tensor<*xf16>) -> tensor<*xf16> + return %0: tensor<*xf16> +} + +// CHECK-LABEL: testSelectThenUnranked +func @testSelectThenUnranked(%arg0: tensor<3xi1>, %arg1: tensor<*xf16>, %arg2: tensor<3x2xf16>) -> tensor<*xf16> { + // CHECK-NEXT: tf.Select + %0 = "tf.Select"(%arg0, %arg1, %arg2) : (tensor<3xi1>, tensor<*xf16>, tensor<3x2xf16>) -> tensor<*xf16> + return %0: tensor<*xf16> +} + +// CHECK-LABEL: testSelectElseUnranked +func @testSelectElseUnranked(%arg0: tensor<3xi1>, %arg1: tensor<3x2xf16>, %arg2: tensor<*xf16>) -> tensor<*xf16> { + // CHECK-NEXT: tf.Select + %0 = "tf.Select"(%arg0, %arg1, %arg2) : (tensor<3xi1>, tensor<3x2xf16>, tensor<*xf16>) -> tensor<*xf16> + return %0: tensor<*xf16> +} + // CHECK-LABEL: testLogicalNotOfEqual func @testLogicalNotOfEqual(%arg0: tensor<8x16xf32>, %arg1: tensor<8x16xf32>) -> tensor<8x16xi1> { %0 = "tf.Equal"(%arg0, %arg1) : (tensor<8x16xf32>, tensor<8x16xf32>) -> tensor<8x16xi1> @@ -462,3 +515,23 @@ func @testMultiReadVariableOpsOfCast(%arg0: tensor>>) - // CHECK: %1 = "tf.ReadVariableOp"(%arg0) : (tensor>>) -> tensor // CHECK: return %1 } + +// CHECK-LABEL: testRankOfRankedTensor +func @testRankOfRankedTensor(%arg0 : tensor<4x3x2xf32>) -> tensor { + // CHECK:[[VAL0:%.+]] = "tf.Const"() {value = dense<3> : tensor} + %0 = "tf.Rank"(%arg0) : (tensor<4x3x2xf32>) -> tensor + + // CHECK: return [[VAL0]] + return %0 : tensor +} + +// CHECK-LABEL: @foldFill +func @foldFill() -> (tensor<3x2x1xf32>, tensor<*xf32>) { + %0 = "tf.Const"() {value = dense<[3, 2, 1]> : tensor<3xi32>} : () -> tensor<3xi32> + %1 = "tf.Const"() {value = dense<23.0> : tensor} : () -> tensor + // CHECK: "tf.Const"() {value = dense<2.300000e+01> : tensor<3x2x1xf32>} + %2 = "tf.Fill"(%0, %1) : (tensor<3xi32>, tensor) -> tensor<3x2x1xf32> + // CHECK: "tf.Const"() {value = dense<2.300000e+01> : tensor<3x2x1xf32>} + %3 = "tf.Fill"(%0, %1) : (tensor<3xi32>, tensor) -> tensor<*xf32> + return %2, %3 : tensor<3x2x1xf32>, tensor<*xf32> +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/cluster_outlining.mlir b/tensorflow/compiler/mlir/tensorflow/tests/cluster_outlining.mlir index 1866879c465..42ed55deeda 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/cluster_outlining.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/cluster_outlining.mlir @@ -1,127 +1,120 @@ -// RUN: tf-opt %s -split-input-file -tf-device-cluster-outlining | FileCheck %s +// RUN: tf-opt %s -split-input-file -tf-device-cluster-outlining | FileCheck %s -dump-input-on-failure -// Tests simple case of a single `tf_device.launch`. +// Tests simple case of a single `tf_device.cluster`. 
-module { - // CHECK-LABEL: func @multiplelaunches - // CHECK-SAME: (%[[ARG_0:[a-z0-9]*]]: tensor) - func @multiplelaunches(%arg0: tensor) -> tensor { - %0 = tf_executor.graph { - %1:2 = tf_executor.island { - // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A"(%[[ARG_0]]) - %2 = "tf.A"(%arg0) : (tensor) -> tensor +// CHECK-LABEL: func @single_cluster +// CHECK-SAME: (%[[ARG_0:[a-z0-9]*]]: tensor) +func @single_cluster(%arg0: tensor) -> tensor { + %0 = tf_executor.graph { + %1:2 = tf_executor.island { + // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A"(%[[ARG_0]]) + %2 = "tf.A"(%arg0) : (tensor) -> tensor - // CHECK: %[[C_OUTPUT:[0-9]*]] = "tf_device.launch_func"(%[[A_OUTPUT]]) {device = "tpu0", func = @tpu0_func} - %3 = "tf_device.launch"() ( { - %4 = "tf.B"(%2) : (tensor) -> tensor - tf_device.return %4 : tensor - }) {device = "tpu0"} : () -> tensor + // CHECK: %[[CLUSTER_OUTPUT:[0-9]*]] = "tf_device.cluster_func"(%[[A_OUTPUT]]) {func = @[[CLUSTER:.*]]} + %3 = "tf_device.cluster"() ( { + %4 = "tf.B"(%2) : (tensor) -> tensor + tf_device.return %4 : tensor + }) {} : () -> tensor - // CHECK: tf_executor.yield %[[C_OUTPUT]] - tf_executor.yield %3 : tensor - } - tf_executor.fetch %1#0 : tensor + // CHECK: tf_executor.yield %[[CLUSTER_OUTPUT]] + tf_executor.yield %3 : tensor } - return %0 : tensor + tf_executor.fetch %1#0 : tensor } - -// CHECK-LABEL: func @tpu0_func -// CHECK-SAME: (%[[TPU0_FUNC_ARG_0:[a-z0-9]*]]: tensor) -> tensor -// CHECK-SAME: sym_visibility = "private" -// CHECK: %[[TPU0_FUNC_B_OUTPUT:[0-9]*]] = "tf.B"(%[[TPU0_FUNC_ARG_0]]) -// CHECK: return %[[TPU0_FUNC_B_OUTPUT]] + return %0 : tensor } +// CHECK: func @[[CLUSTER]] +// CHECK-SAME: (%[[CLUSTER_ARG_0:[a-z0-9]*]]: tensor) -> tensor +// CHECK-SAME: sym_visibility = "private" +// CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B"(%[[CLUSTER_ARG_0]]) +// CHECK: return %[[B_OUTPUT]] + // ----- -// Tests that multiple `tf_device.launch` that depend on each other are +// Tests that multiple `tf_device.cluster` that depend on each other are // correctly handled. 
-module { - // CHECK-LABEL: func @multiplelaunches - // CHECK-SAME: (%[[ARG_0:[a-z0-9]*]]: tensor) - func @multiplelaunches(%arg0: tensor) -> tensor { - %0 = tf_executor.graph { - %1:2 = tf_executor.island { - // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A"(%[[ARG_0]]) - %2 = "tf.A"(%arg0) : (tensor) -> tensor +// CHECK-LABEL: func @multiple_clusters +// CHECK-SAME: (%[[ARG_0:[a-z0-9]*]]: tensor) +func @multiple_clusters(%arg0: tensor) -> tensor { + %0 = tf_executor.graph { + %1:2 = tf_executor.island { + // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A"(%[[ARG_0]]) + %2 = "tf.A"(%arg0) : (tensor) -> tensor - // CHECK: %[[C_OUTPUT:[0-9]*]] = "tf_device.launch_func"(%[[A_OUTPUT]]) {device = "tpu0", func = @tpu0_func} - %3 = "tf_device.launch"() ( { - %6 = "tf.B"(%2) : (tensor) -> tensor - tf_device.return %6 : tensor - }) {device = "tpu0"} : () -> tensor + // CHECK: %[[CLUSTER_0_OUTPUT:[0-9]*]] = "tf_device.cluster_func"(%[[A_OUTPUT]]) {func = @[[CLUSTER_0:.*]]} + %3 = "tf_device.cluster"() ( { + %6 = "tf.B"(%2) : (tensor) -> tensor + tf_device.return %6 : tensor + }) {} : () -> tensor - // CHECK: %[[D_OUTPUT:[0-9]*]] = "tf.D"(%[[C_OUTPUT]]) - %4 = "tf.D"(%3) : (tensor) -> tensor + // CHECK: %[[D_OUTPUT:[0-9]*]] = "tf.D"(%[[CLUSTER_0_OUTPUT]]) + %4 = "tf.D"(%3) : (tensor) -> tensor - // CHECK: %[[E_OUTPUT:[0-9]*]] = "tf_device.launch_func"(%[[C_OUTPUT]], %[[D_OUTPUT]]) {device = "gpu0", func = @gpu0_func} - %5 = "tf_device.launch"() ( { - %6 = "tf.E"(%3) : (tensor) -> tensor - %7 = "tf.F"(%4, %6) : (tensor, tensor) -> tensor - tf_device.return %7 : tensor - }) {device = "gpu0"} : () -> tensor + // CHECK: %[[CLUSTER_1_OUTPUT:[0-9]*]] = "tf_device.cluster_func"(%[[CLUSTER_0_OUTPUT]], %[[D_OUTPUT]]) {func = @[[CLUSTER_1:.*]]} + %5 = "tf_device.cluster"() ( { + %6 = "tf.E"(%3) : (tensor) -> tensor + %7 = "tf.F"(%4, %6) : (tensor, tensor) -> tensor + tf_device.return %7 : tensor + }) {} : () -> tensor - // CHECK: tf_executor.yield %[[E_OUTPUT]] - tf_executor.yield %5 : tensor - } - tf_executor.fetch %1#0 : tensor + // CHECK: tf_executor.yield %[[CLUSTER_1_OUTPUT]] + tf_executor.yield %5 : tensor } - return %0 : tensor + tf_executor.fetch %1#0 : tensor } - -// CHECK-LABEL: func @tpu0_func -// CHECK-SAME: (%[[TPU0_FUNC_ARG_0:[a-z0-9]*]]: tensor) -> tensor -// CHECK: %[[TPU0_FUNC_B_OUTPUT:[0-9]*]] = "tf.B"(%[[TPU0_FUNC_ARG_0]]) -// CHECK: return %[[TPU0_FUNC_B_OUTPUT]] - -// CHECK-LABEL: func @gpu0_func -// CHECK-SAME: (%[[GPU0_FUNC_ARG_0:[a-z0-9]*]]: tensor, %[[GPU0_FUNC_ARG_1:[a-z0-9]*]]: tensor) -> tensor -// CHECK: %[[GPU0_FUNC_E_OUTPUT:[0-9]*]] = "tf.E"(%[[GPU0_FUNC_ARG_0]]) -// CHECK: %[[GPU0_FUNC_F_OUTPUT:[0-9]*]] = "tf.F"(%[[GPU0_FUNC_ARG_1]], %[[GPU0_FUNC_E_OUTPUT]]) -// CHECK: return %[[GPU0_FUNC_F_OUTPUT]] + return %0 : tensor } +// CHECK: func @[[CLUSTER_0]] +// CHECK-SAME: (%[[CLUSTER_0_ARG_0:[a-z0-9]*]]: tensor) -> tensor +// CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B"(%[[CLUSTER_0_ARG_0]]) +// CHECK: return %[[B_OUTPUT]] + +// CHECK: func @[[CLUSTER_1]] +// CHECK-SAME: (%[[CLUSTER_1_ARG_0:[a-z0-9]*]]: tensor, %[[CLUSTER_1_ARG_1:[a-z0-9]*]]: tensor) -> tensor +// CHECK: %[[E_OUTPUT:[0-9]*]] = "tf.E"(%[[CLUSTER_1_ARG_0]]) +// CHECK: %[[F_OUTPUT:[0-9]*]] = "tf.F"(%[[CLUSTER_1_ARG_1]], %[[E_OUTPUT]]) +// CHECK: return %[[F_OUTPUT]] + // ----- -// Tests outlining launches with no live-in values. +// Tests outlining clusters with no live-in values. 
-module { - // CHECK-LABEL: func @multiplelaunches - // CHECK-SAME: (%[[ARG_0:[a-z0-9]*]]: tensor) - func @multiplelaunches(%arg0: tensor) -> tensor { - %0 = tf_executor.graph { - %1:2 = tf_executor.island wraps - // CHECK: %[[A_OUTPUT:[a-z0-9]*]], %{{.*}} = {{.*}} "tf_device.launch_func"() {device = "tpu0", func = @tpu0_func} - "tf_device.launch"() ( { - %3 = "tf.A"() : () -> tensor - tf_device.return %3 : tensor - }) {device = "tpu0"} : () -> tensor - // CHECK: tf_executor.fetch %[[A_OUTPUT]] - tf_executor.fetch %1#0 : tensor - } - return %0 : tensor +// CHECK-LABEL: func @cluster_operands +// CHECK-SAME: (%[[ARG_0:[a-z0-9]*]]: tensor) +func @cluster_operands(%arg0: tensor) -> tensor { + %0 = tf_executor.graph { + %1:2 = tf_executor.island wraps + // CHECK: %[[CLUSTER_OUTPUT:[a-z0-9]*]], %{{.*}} = {{.*}} "tf_device.cluster_func"() {func = @[[CLUSTER:.*]]} + "tf_device.cluster"() ( { + %3 = "tf.A"() : () -> tensor + tf_device.return %3 : tensor + }) {} : () -> tensor + // CHECK: tf_executor.fetch %[[CLUSTER_OUTPUT]] + tf_executor.fetch %1#0 : tensor } + return %0 : tensor +} -// CHECK-LABEL: func @tpu0_func +// CHECK: func @[[CLUSTER]] // CHECK-SAME: () -> tensor -// CHECK: %[[TPU0_FUNC_A_OUTPUT:[0-9]*]] = "tf.A"() -// CHECK: return %[[TPU0_FUNC_A_OUTPUT]] -} +// CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A"() +// CHECK: return %[[A_OUTPUT]] // ----- -// Tests launch attributes are copied over to launch_func. +// Tests cluster attributes are copied over to cluster_func. -module { - // CHECK-LABEL: func @launch_attrs - func @launch_attrs() -> tensor { - %0 = "tf_device.launch"() ( { - %1 = "tf.A"() : () -> tensor - tf_device.return %1 : tensor - }) {device = "tpu0", launch_attr = "launch_attr"} : () -> tensor - return %0 : tensor - } - -// CHECK: launch_attr = "launch_attr" +// CHECK-LABEL: func @cluster_attrs +func @cluster_attrs() -> tensor { + %0 = "tf_device.cluster"() ( { + %1 = "tf.A"() : () -> tensor + tf_device.return %1 : tensor + }) {cluster_attr = "cluster_attr"} : () -> tensor + return %0 : tensor } + +// CHECK: "tf_device.cluster_func" +// CHECK-SAME: cluster_attr = "cluster_attr" diff --git a/tensorflow/compiler/mlir/tensorflow/tests/constant-fold.mlir b/tensorflow/compiler/mlir/tensorflow/tests/constant-fold.mlir index 2a34bbfacdc..3ae6023400c 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/constant-fold.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/constant-fold.mlir @@ -38,6 +38,56 @@ func @testPow(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> (tensor<4xf32>, ten return %0, %1, %2 : tensor<4xf32>, tensor<4xf32>, tensor<4xf32> } +// CHECK-LABEL: func @testEmpty32 +func @testEmpty32() -> (tensor<5xi32>) { + %0 = "tf.Const"() { value = dense<5> : tensor } : () -> tensor + + // CHECK: [[VAL:%.+]] = "tf.Const"() {value = dense<0> : tensor<5xi32>} + // CHECK: return [[VAL]] + %1 = "tf.Empty"(%0) : (tensor) -> (tensor<5xi32>) + return %1 : tensor<5xi32> +} + +// CHECK-LABEL: func @testEmpty64 +func @testEmpty64() -> (tensor<5xi64>) { + %0 = "tf.Const"() { value = dense<5> : tensor } : () -> tensor + + // CHECK: [[VAL:%.+]] = "tf.Const"() {value = dense<0> : tensor<5xi64>} + // CHECK: return [[VAL]] : tensor<5xi64> + %1 = "tf.Empty"(%0) : (tensor) -> (tensor<5xi64>) + return %1 : tensor<5xi64> +} + +// CHECK-LABEL: func @testEmptyFloat +func @testEmptyFloat() -> (tensor<5xf64>) { + %0 = "tf.Const"() { value = dense<5> : tensor } : () -> tensor + + // CHECK: [[VAL:%.+]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<5xf64>} + // CHECK: return [[VAL]] + %1 = 
"tf.Empty"(%0) : (tensor) -> (tensor<5xf64>) + return %1 : tensor<5xf64> +} + +// CHECK-LABEL: func @testEmptyf16 +func @testEmptyf16() -> (tensor<5xf16>) { + %0 = "tf.Const"() { value = dense<5> : tensor } : () -> tensor + + // CHECK: [[VAL:%.+]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<5xf16>} + // CHECK: return [[VAL]] + %1 = "tf.Empty"(%0) : (tensor) -> (tensor<5xf16>) + return %1 : tensor<5xf16> +} + +// CHECK-LABEL: func @testEmptybf16 +func @testEmptybf16() -> (tensor<5xbf16>) { + %0 = "tf.Const"() { value = dense<5> : tensor } : () -> tensor + + // CHECK: [[VAL:%.+]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<5xbf16>} + // CHECK: return [[VAL]] + %1 = "tf.Empty"(%0) : (tensor) -> (tensor<5xbf16>) + return %1 : tensor<5xbf16> +} + // CHECK-LABEL: func @testShapeN func @testShapeN(%arg0: tensor, %arg1: tensor<1x32x32x16xf32>, %arg2: tensor<*xf32>) -> (tensor<0xi64>, tensor<4xi64>, tensor<4xi64>, tensor) { @@ -251,3 +301,144 @@ func @testTensorListElementShape(%arg0: tensor>>) -> // CHECK-NEXT: return [[cst]] : tensor<2xi32> return %0: tensor<2xi32> } + +func @RemoveTrivialAdd(%arg0: tensor<2x2xf32>) -> tensor<2x2xf32> { + %cst = constant dense<0.0> : tensor<2x2xf32> + %0 = "tf.Add"(%arg0, %cst) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> + return %0 : tensor<2x2xf32> + + // CHECK-LABEL: RemoveTrivialAdd + // CHECK-NEXT: return %arg0 : tensor<2x2xf32> +} + +func @RemoveTrivialAddBf16RHS(%arg0: tensor<2x2xbf16>) -> tensor<2x2xbf16> { + %cst = constant dense<0.0> : tensor<2x2xbf16> + %0 = "tf.Add"(%arg0, %cst) : (tensor<2x2xbf16>, tensor<2x2xbf16>) -> tensor<2x2xbf16> + return %0 : tensor<2x2xbf16> + + // CHECK-LABEL: RemoveTrivialAdd + // CHECK-NEXT: return %arg0 : tensor<2x2xbf16> +} + +func @RemoveTrivialAddBf16LHS(%arg0: tensor<2x2xbf16>) -> tensor<2x2xbf16> { + %cst = constant dense<0.0> : tensor<2x2xbf16> + %0 = "tf.Add"(%cst, %arg0) : (tensor<2x2xbf16>, tensor<2x2xbf16>) -> tensor<2x2xbf16> + return %0 : tensor<2x2xbf16> + + // CHECK-LABEL: RemoveTrivialAdd + // CHECK-NEXT: return %arg0 : tensor<2x2xbf16> +} + +func @RemoveTrivialAddV2(%arg0: tensor<2x2xf32>) -> tensor<2x2xf32> { + %cst = constant dense<0.0> : tensor<2x2xf32> + %0 = "tf.AddV2"(%arg0, %cst) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> + return %0 : tensor<2x2xf32> + + // CHECK-LABEL: RemoveTrivialAddV2 + // CHECK-NEXT: return %arg0 : tensor<2x2xf32> +} + +func @RemoveTrivialSub(%arg0: tensor<2x2xf32>) -> tensor<2x2xf32> { + %cst = constant dense<0.0> : tensor<2x2xf32> + %0 = "tf.Sub"(%arg0, %cst) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> + return %0 : tensor<2x2xf32> + + // CHECK-LABEL: RemoveTrivialSub + // CHECK-NEXT: return %arg0 : tensor<2x2xf32> +} + +func @RemoveTrivialSubInt8(%arg0: tensor<2x2xi8>) -> tensor<2x2xi8> { + %cst = constant dense<0> : tensor<2x2xi8> + %0 = "tf.Sub"(%arg0, %cst) : (tensor<2x2xi8>, tensor<2x2xi8>) -> tensor<2x2xi8> + return %0 : tensor<2x2xi8> + + // CHECK-LABEL: RemoveTrivialSubInt8 + // CHECK-NEXT: return %arg0 : tensor<2x2xi8> +} + +func @RemoveTrivialMul(%arg0: tensor<2x2xf32>) -> tensor<2x2xf32> { + %cst = constant dense<1.0> : tensor<2x2xf32> + %0 = "tf.Mul"(%arg0, %cst) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> + return %0 : tensor<2x2xf32> + + // CHECK-LABEL: RemoveTrivialMul + // CHECK-NEXT: return %arg0 : tensor<2x2xf32> +} + +func @RemoveTrivialDiv(%arg0: tensor<2x2xf32>) -> tensor<2x2xf32> { + %cst = constant dense<1.0> : tensor<2x2xf32> + %0 = "tf.Div"(%arg0, %cst) : (tensor<2x2xf32>, 
tensor<2x2xf32>) -> tensor<2x2xf32> + return %0 : tensor<2x2xf32> + + // CHECK-LABEL: RemoveTrivialDiv + // CHECK-NEXT: return %arg0 : tensor<2x2xf32> +} + +func @RemoveTrivialRealDiv(%arg0: tensor<2x2xf32>, %arg1: tensor<2x2xf32>) -> tensor<2x2xf32> { + %cst = constant dense<1.0> : tensor<2x2xf32> + %0 = "tf.RealDiv"(%arg0, %cst) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> + return %0 : tensor<2x2xf32> + + // CHECK-LABEL: RemoveTrivialRealDiv + // CHECK-NEXT: return %arg0 : tensor<2x2xf32> +} + +func @RemoveTrivialDivBf16RHS(%arg0: tensor<2x2xbf16>) -> tensor<2x2xbf16> { + %cst = constant dense<1.0> : tensor<2x2xbf16> + %0 = "tf.Div"(%arg0, %cst) : (tensor<2x2xbf16>, tensor<2x2xbf16>) -> tensor<2x2xbf16> + return %0 : tensor<2x2xbf16> + + // CHECK-LABEL: RemoveTrivialDiv + // CHECK-NEXT: return %arg0 : tensor<2x2xbf16> +} + +func @RemoveTrivialMulInt8(%arg0: tensor<2x2xi8>) -> tensor<2x2xi8> { + %cst = constant dense<1> : tensor<2x2xi8> + %0 = "tf.Mul"(%cst, %arg0) : (tensor<2x2xi8>, tensor<2x2xi8>) -> tensor<2x2xi8> + return %0 : tensor<2x2xi8> + + // CHECK-LABEL: RemoveTrivialMulInt8 + // CHECK-NEXT: return %arg0 : tensor<2x2xi8> +} + +func @DivBf16LHS(%arg0: tensor<2x2xbf16>) -> tensor<2x2xbf16> { + %cst = constant dense<1.0> : tensor<2x2xbf16> + %0 = "tf.Div"(%cst, %arg0) : (tensor<2x2xbf16>, tensor<2x2xbf16>) -> tensor<2x2xbf16> + return %0 : tensor<2x2xbf16> + + // CHECK-LABEL: DivBf16LHS + // CHECK: tf.Div +} + +func @DontRemoveTrivialAdd(%arg0: tensor<1x2xf32>) -> tensor<2x2xf32> { + %cst = constant dense<0.0> : tensor<2x2xf32> + %0 = "tf.AddV2"(%arg0, %cst) : (tensor<1x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> + return %0 : tensor<2x2xf32> + + // CHECK-LABEL: DontRemoveTrivialAdd + // CHECK: %[[CONST:.*]] = constant dense<0.000000e+00> : tensor<2x2xf32> + // CHECK: %[[RESULT:.*]] = "tf.AddV2"(%arg0, %[[CONST]]) : (tensor<1x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> + // CHECK: return %[[RESULT]] : tensor<2x2xf32> +} + +func @DontRemoveTrivialAdd2(%arg0: tensor) -> tensor { + %cst = constant dense<0.0> : tensor<2x2xf32> + %0 = "tf.AddV2"(%arg0, %cst) : (tensor , tensor<2x2xf32>) -> tensor + return %0 :tensor + + // CHECK-LABEL: DontRemoveTrivialAdd2 + // CHECK: %[[CONST:.*]] = constant dense<0.000000e+00> : tensor<2x2xf32> + // CHECK: %[[RESULT:.*]] = "tf.AddV2"(%arg0, %[[CONST]]) : (tensor, tensor<2x2xf32>) -> tensor + // CHECK: return %[[RESULT]] : tensor +} + +// Test no fold because of the broadcast. +func @DontRemoveTrivialMul(%arg0: tensor<1x6x8x1xf32>) -> tensor<1x6x8x1xf32> { + %0 = "tf.Const"() {value = dense<2.000000e+00> : tensor} : () -> tensor + %1 = "tf.Mul"(%arg0, %0) : (tensor<1x6x8x1xf32>, tensor) -> tensor<1x6x8x1xf32> + return %1 : tensor<1x6x8x1xf32> + // CHECK-LABEL: DontRemoveTrivialMul + // CHECK: %[[CONST:.*]] = "tf.Const"() {value = dense<2.000000e+00> : tensor} : () -> tensor + // CHECK: %[[RESULT:.*]] = "tf.Mul"(%arg0, %[[CONST]]) : (tensor<1x6x8x1xf32>, tensor) -> tensor<1x6x8x1xf32> + // CHECK: return %[[RESULT]] : tensor<1x6x8x1xf32> +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/func-attr-invalid.mlir b/tensorflow/compiler/mlir/tensorflow/tests/func-attr-invalid.mlir new file mode 100644 index 00000000000..cd3b8b55032 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/func-attr-invalid.mlir @@ -0,0 +1,50 @@ +// RUN: tf-opt %s -split-input-file -verify-diagnostics + +// Tests invalid #tf.func attributes. 
+ +// expected-error@+1 {{invalid TensorFlow func attribute: func}} +func @main() attributes {tf._implements = #tf.func} { + return +} + +// ----- + +// expected-error@+1 {{invalid TensorFlow func attribute: func<>}} +func @main() attributes {tf._implements = #tf.func<>} { + return +} + +// ----- + +// expected-error@+1 {{invalid TensorFlow func attribute: func<@symbol>}} +func @main() attributes {tf._implements = #tf.func<@symbol>} { + return +} + +// ----- + +// expected-error@+1 {{invalid TensorFlow func attribute: func<{}>}} +func @main() attributes {tf._implements = #tf.func<{}>} { + return +} + +// ----- + +// expected-error@+1 {{invalid TensorFlow func attribute: func<"test", {}>}} +func @main() attributes {tf._implements = #tf.func<"test", {}>} { + return +} + +// ----- + +// expected-error@+1 {{invalid TensorFlow func attribute: func<@symbol, "">}} +func @main() attributes {tf._implements = #tf.func<@symbol, "">} { + return +} + +// ----- + +// expected-error@+1 {{invalid TensorFlow func attribute: func<@symbol, {}, "">}} +func @main() attributes {tf._implements = #tf.func<@symbol, {}, "">} { + return +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/func-attr.mlir b/tensorflow/compiler/mlir/tensorflow/tests/func-attr.mlir new file mode 100644 index 00000000000..de17778c105 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/func-attr.mlir @@ -0,0 +1,13 @@ +// RUN: tf-opt %s | tf-opt | FileCheck %s --dump-input=fail + +// CHECK-LABEL: func @func_attr +// CHECK-SAME: tf._implements = #tf.func<@symbol_a, {attr0 = 1 : i32, attr1 = "random"}> +func @func_attr() attributes {tf._implements = #tf.func<@symbol_a, {attr0 = 1 : i32, attr1 = "random"}>} { + return +} + +// CHECK-LABEL: func @nested_func_attr +// CHECK-SAME: tf._implements = #tf.func<@symbol_a, {attr0 = 1 : i32, attr1 = "random", nested = #tf.func<@symbol_b, {attr2 = true, attr3 = 8.000000e+00 : f32}>}> +func @nested_func_attr() attributes {tf._implements = #tf.func<@symbol_a, {attr0 = 1 : i32, attr1 = "random", nested = #tf.func<@symbol_b, {attr2 = true, attr3 = 8.0 : f32}>}>} { + return +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/add.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/add.pbtxt index 4d6550b4a2e..660a0dec8ad 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/add.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/add.pbtxt @@ -1,6 +1,6 @@ -# RUN: tf-mlir-translate -graphdef-to-mlir %s -tf-input-arrays=input0,input1 -tf-input-data-types=DT_INT32,DT_INT32 -tf-input-shapes=10:10 -tf-output-arrays=Add -o - | FileCheck %s -# RUN: tf-mlir-translate -graphdef-to-mlir %s -tf-input-arrays=input0,input1 -tf-input-shapes=10:10 -tf-output-arrays=Add -o - | FileCheck --check-prefix=NONE %s -# RUN: tf-mlir-translate -graphdef-to-mlir %s -tf-input-arrays=input0,input1 -tf-input-shapes=10:10 -tf-input-data-types=',DT_INT32' -tf-output-arrays=Add -o - | FileCheck --check-prefix=SOME %s +# RUN: tf-mlir-translate -graphdef-to-mlir -tf-enable-shape-inference-on-import=false %s -tf-input-arrays=input0,input1 -tf-input-data-types=DT_INT32,DT_INT32 -tf-input-shapes=10:10 -tf-output-arrays=Add -o - | FileCheck %s +# RUN: tf-mlir-translate -graphdef-to-mlir -tf-enable-shape-inference-on-import=false %s -tf-input-arrays=input0,input1 -tf-input-shapes=10:10 -tf-output-arrays=Add -o - | FileCheck --check-prefix=NONE %s +# RUN: tf-mlir-translate -graphdef-to-mlir -tf-enable-shape-inference-on-import=false %s -tf-input-arrays=input0,input1 
-tf-input-shapes=10:10 -tf-input-data-types=',DT_INT32' -tf-output-arrays=Add -o - | FileCheck --check-prefix=SOME %s node { name: "Add" @@ -39,7 +39,7 @@ versions { } # CHECK-LABEL: func @main -# CHECK-SAME: (%[[ARG_0:[a-z0-9]+]]: tensor<10xi32>, %[[ARG_1:[a-z0-9]+]]: tensor<10xi32>) -> tensor<10xi32> +# CHECK-SAME: (%[[ARG_0:[a-z0-9]+]]: tensor<10xi32>, %[[ARG_1:[a-z0-9]+]]: tensor<10xi32>) -> tensor<*xi32> # CHECK-SAME: control_outputs = "" # CHECK-SAME: inputs = "input0,input1" # CHECK-SAME: outputs = "Add" @@ -47,7 +47,7 @@ versions { # CHECK: fetch %[[add]] # SOME-LABEL: func @main -# SOME-SAME: (%[[ARG_0:[a-z0-9]+]]: tensor<10xi32>, %[[ARG_1:[a-z0-9]+]]: tensor<10xi32>) -> tensor<10xi32> +# SOME-SAME: (%[[ARG_0:[a-z0-9]+]]: tensor<10xi32>, %[[ARG_1:[a-z0-9]+]]: tensor<10xi32>) -> tensor<*xi32> # SOME-SAME: control_outputs = "" # SOME-SAME: inputs = "input0,input1" # SOME-SAME: outputs = "Add" @@ -55,7 +55,7 @@ versions { # SOME: fetch %[[add]] # NONE-LABEL: func @main -# NONE-SAME: (%[[ARG_0:[a-z0-9]+]]: tensor<10xi32>, %[[ARG_1:[a-z0-9]+]]: tensor<10xi32>) -> tensor<10xi32> +# NONE-SAME: (%[[ARG_0:[a-z0-9]+]]: tensor<10xi32>, %[[ARG_1:[a-z0-9]+]]: tensor<10xi32>) -> tensor<*xi32> # NONE-SAME: control_outputs = "" # NONE-SAME: inputs = "input0,input1" # NONE-SAME: outputs = "Add" diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/arg-as-fetch.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/arg-as-fetch.pbtxt index 524b90b0cc1..50973ea899c 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/arg-as-fetch.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/arg-as-fetch.pbtxt @@ -1,4 +1,4 @@ -# RUN: tf-mlir-translate -graphdef-to-mlir %s -tf-input-arrays=arg -tf-input-data-types=DT_INT32 -tf-input-shapes=8 -tf-output-arrays=arg -o - | FileCheck %s --dump-input=fail +# RUN: tf-mlir-translate -graphdef-to-mlir -tf-enable-shape-inference-on-import=false %s --mlir-print-debuginfo --print-after-all -tf-input-arrays=arg -tf-input-data-types=DT_INT32 -tf-input-shapes=8 -tf-output-arrays=arg -o - | FileCheck %s --dump-input=fail node { name: "arg" diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/arg-control-dep.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/arg-control-dep.pbtxt index 7b3462f37cd..5578b45716b 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/arg-control-dep.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/arg-control-dep.pbtxt @@ -1,4 +1,4 @@ -# RUN: tf-mlir-translate -graphdef-to-mlir %s -o - | FileCheck %s +# RUN: tf-mlir-translate -graphdef-to-mlir -tf-enable-shape-inference-on-import=false %s -o - | FileCheck %s node { name: "Constant" diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/arg-data-type.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/arg-data-type.pbtxt index ceacc344887..c57529cebb1 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/arg-data-type.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/arg-data-type.pbtxt @@ -1,5 +1,5 @@ -# RUN: tf-mlir-translate -graphdef-to-mlir %s -tf-input-arrays=p,x -tf-input-shapes=:1 -tf-output-arrays=p,x -o - | FileCheck %s --check-prefix=NONE --dump-input-on-failure -# RUN: tf-mlir-translate -graphdef-to-mlir %s -tf-input-arrays=p,x -tf-input-shapes=:1 -tf-input-data-types=DT_INT32,DT_BOOL -tf-output-arrays=p,x -o - | FileCheck %s --dump-input-on-failure +# RUN: tf-mlir-translate -graphdef-to-mlir 
-tf-enable-shape-inference-on-import=false %s -tf-input-arrays=p,x -tf-input-shapes=:1 -tf-output-arrays=p,x -o - | FileCheck %s --check-prefix=NONE --dump-input-on-failure +# RUN: tf-mlir-translate -graphdef-to-mlir -tf-enable-shape-inference-on-import=false %s -tf-input-arrays=p,x -tf-input-shapes=:1 -tf-input-data-types=DT_INT32,DT_BOOL -tf-output-arrays=p,x -o - | FileCheck %s --dump-input-on-failure # Test the handling of the input data types. In particular, if the data type # for an input graph node is specified via command line options, use it. diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/const-values.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/const-values.pbtxt index 515e1cf36e5..4bc9df09893 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/const-values.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/const-values.pbtxt @@ -1,4 +1,4 @@ -# RUN: tf-mlir-translate -graphdef-to-mlir %s -o - | FileCheck %s +# RUN: tf-mlir-translate -graphdef-to-mlir -tf-enable-shape-inference-on-import=false %s -o - | FileCheck %s node { name: "bf16_scalar" diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/device-arg-attr.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/device-arg-attr.pbtxt index 0b87a826305..3c5be84124e 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/device-arg-attr.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/device-arg-attr.pbtxt @@ -1,9 +1,9 @@ -# RUN: tf-mlir-translate -graphdef-to-mlir %s -tf-graph-as-function -o - | FileCheck %s --dump-input=fail +# RUN: tf-mlir-translate -graphdef-to-mlir -tf-enable-shape-inference-on-import=false %s -tf-graph-as-function -o - | FileCheck %s --dump-input=fail # Verify arg devices are added as arg attributes. 
# CHECK-LABEL: func @main -# CHECK-SAME: (%[[ARG_0:[a-z0-9]+]]: tensor<*xf32> {tf.device = "/CPU:0"}, %[[ARG_1:[a-z0-9]+]]: tensor<2x4x6x8xi32>) -> (tensor<*xf32>, tensor<2x4x6x8xi32>) +# CHECK-SAME: (%[[ARG_0:[a-z0-9]+]]: tensor<*xf32> {tf.device = "/CPU:0"}, %[[ARG_1:[a-z0-9]+]]: tensor<2x4x6x8xi32>) -> (tensor<*xf32>, tensor<*xi32>) node { name: "args_0" diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/empty-value-attr.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/empty-value-attr.pbtxt index 93a2f602c65..6c385bd219f 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/empty-value-attr.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/empty-value-attr.pbtxt @@ -1,4 +1,4 @@ -# RUN: tf-mlir-translate -graphdef-to-mlir %s -o - | FileCheck %s +# RUN: tf-mlir-translate -graphdef-to-mlir -tf-enable-shape-inference-on-import=false %s -o - | FileCheck %s node { name: "_tf.PartitionedCall" diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/error-message-with-source-info.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/error-message-with-source-info.pbtxt index 650cc9c41d8..b67c88ab77d 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/error-message-with-source-info.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/error-message-with-source-info.pbtxt @@ -1,7 +1,7 @@ -# RUN: not tf-mlir-translate -graphdef-to-mlir -tf-input-arrays=x,y -tf-input-data-types=DT_INT32,DT_INT32 -tf-input-shapes=2:3 -tf-output-arrays=x_y_sum %s --tf-debug-info=%s.debug -o - 2>&1 | FileCheck %s +# RUN: not tf-mlir-translate -graphdef-to-mlir -tf-enable-shape-inference-on-import=false -tf-input-arrays=x,y -tf-input-data-types=DT_INT32,DT_INT32 -tf-input-shapes=2:3 -tf-output-arrays=x_y_sum %s --tf-debug-info=%s.debug -o - 2>&1 | FileCheck %s --dump-input-on-failure # Checks that source debug information is used in the output error message. 
-# CHECK: Graph import failed: Invalid argument: Dimensions must be equal +# CHECK: error: 'tf.Add' op operands don't have broadcast-compatible shapes # CHECK: math_ops.add(x, y, name='x_y_sum') # CHECK: build_graph(out_dir) node: { diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/feed-as-fetch.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/feed-as-fetch.pbtxt index b75ac6868a3..b639d316dfc 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/feed-as-fetch.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/feed-as-fetch.pbtxt @@ -1,4 +1,4 @@ -# RUN: tf-mlir-translate -graphdef-to-mlir %s -tf-input-arrays=input -tf-input-data-types=DT_INT32 -tf-input-shapes=8 -tf-output-arrays=input -o - | FileCheck %s --dump-input=fail +# RUN: tf-mlir-translate -graphdef-to-mlir -tf-enable-shape-inference-on-import=false %s -tf-input-arrays=input -tf-input-data-types=DT_INT32 -tf-input-shapes=8 -tf-output-arrays=input -o - | FileCheck %s --dump-input=fail node { name: "input" diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/feed-control-dep.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/feed-control-dep.pbtxt index 3a3274bf89a..fb2d73779b2 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/feed-control-dep.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/feed-control-dep.pbtxt @@ -1,4 +1,4 @@ -# RUN: tf-mlir-translate -graphdef-to-mlir %s -tf-input-arrays=input -tf-input-data-types=DT_FLOAT -tf-output-arrays=output_node -o - | FileCheck %s --dump-input=fail +# RUN: tf-mlir-translate -graphdef-to-mlir -tf-enable-shape-inference-on-import=false %s -tf-input-arrays=input -tf-input-data-types=DT_FLOAT -tf-output-arrays=output_node -o - | FileCheck %s --dump-input=fail node { name: "input" @@ -60,7 +60,7 @@ versions { } # CHECK-LABEL: func @main -# CHECK-SAME: (%[[ARG_0:[a-z0-9]+]]: tensor) -> tensor +# CHECK-SAME: (%[[ARG_0:[a-z0-9]+]]: tensor) -> tensor<*xf32> # CHECK-SAME: control_outputs = "" # CHECK-SAME: inputs = "input" # CHECK-SAME: outputs = "output_node" diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/function-func-attr.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/function-func-attr.pbtxt new file mode 100644 index 00000000000..9f044c62736 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/function-func-attr.pbtxt @@ -0,0 +1,53 @@ +# RUN: tf-mlir-translate -graphdef-to-mlir -tf-enable-shape-inference-on-import=false %s -o - | FileCheck %s --dump-input-on-failure + +node { + name: "custom_relu_func_call" + op: "custom_relu" +} +node { + name: "custom_embedding_matmul_func_call" + op: "custom_embedding_matmul" +} +library { + function { + signature { + name: "custom_relu" + } + attr { + key: "_implements" + value { + func { + name: "tensorflow.relu" + } + } + } + } + function { + signature { + name: "custom_embedding_matmul" + } + attr { + key: "_implements" + value { + func { + name: "tensorflow.embedding_matmul" + attr { + key: "key1" + value { + i: 2 + } + } + attr { + key: "key2" + value { + b: false + } + } + } + } + } + } +} + +# CHECK-DAG: func @custom_relu{{[0-9]*}}() attributes {tf._implements = #tf.func<@tensorflow.relu, {}>} +# CHECK-DAG: func @custom_embedding_matmul{{[0-9]*}}() attributes {tf._implements = #tf.func<@tensorflow.embedding_matmul, {key1 = 2 : i64, key2 = false}>} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/functional-if-ops.pbtxt 
b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/functional-if-ops.pbtxt index 8eca30802ef..0f9e49088f2 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/functional-if-ops.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/functional-if-ops.pbtxt @@ -1,4 +1,4 @@ -# RUN: tf-mlir-translate -graphdef-to-mlir %s -tf-input-arrays=a,b -tf-input-data-types=DT_FLOAT,DT_FLOAT -tf-input-shapes=':' -tf-output-arrays=StatefulIf,StatelessIf -o - -mlir-print-debuginfo | FileCheck %s +# RUN: tf-mlir-translate -graphdef-to-mlir -tf-enable-shape-inference-on-import=false %s -tf-input-arrays=a,b -tf-input-data-types=DT_FLOAT,DT_FLOAT -tf-input-shapes=':' -tf-output-arrays=StatefulIf,StatelessIf -o - -mlir-print-debuginfo | FileCheck %s # Verify that TensorFlow If and StatelessIf ops are mapped to the # composite If op in MLIR with is_stateless attribute set accordingly to diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/functional-while-ops.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/functional-while-ops.pbtxt index ede01ebf62b..5295688d1b2 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/functional-while-ops.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/functional-while-ops.pbtxt @@ -1,4 +1,4 @@ -# RUN: tf-mlir-translate -graphdef-to-mlir %s -tf-input-arrays=iter,val -tf-input-data-types=DT_INT32,DT_FLOAT -tf-input-shapes=':' -tf-output-arrays=StatefulWhile:1,StatelessWhile:1 -o - -mlir-print-debuginfo | FileCheck %s +# RUN: tf-mlir-translate -graphdef-to-mlir -tf-enable-shape-inference-on-import=false %s -tf-input-arrays=iter,val -tf-input-data-types=DT_INT32,DT_FLOAT -tf-input-shapes=':' -tf-output-arrays=StatefulWhile:1,StatelessWhile:1 -o - -mlir-print-debuginfo | FileCheck %s # Verify that TensorFlow While and StatelessWhile ops are mapped to the # composite While op in MLIR with is_stateless attribute set accordingly to diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-as-function-control-ret.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-as-function-control-ret.pbtxt index dd8aa91e8c7..92b85a7b9c8 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-as-function-control-ret.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-as-function-control-ret.pbtxt @@ -1,6 +1,6 @@ -# RUN: tf-mlir-translate -graphdef-to-mlir %s -tf-graph-as-function -tf-control-output-arrays=var1_add,var2_add -o - | FileCheck %s --dump-input=fail -# RUN: not tf-mlir-translate -graphdef-to-mlir %s -tf-graph-as-function -tf-control-output-arrays=var1_add,var1_add -o - 2>&1 | FileCheck %s --check-prefix=UNIQUE --dump-input=fail -# RUN: not tf-mlir-translate -graphdef-to-mlir %s -tf-graph-as-function -tf-control-output-arrays=var3_add -o - 2>&1 | FileCheck %s --check-prefix=MISSING --dump-input=fail +# RUN: tf-mlir-translate -graphdef-to-mlir -tf-enable-shape-inference-on-import=false %s -tf-graph-as-function -tf-control-output-arrays=var1_add,var2_add -o - | FileCheck %s --dump-input=fail +# RUN: not tf-mlir-translate -graphdef-to-mlir -tf-enable-shape-inference-on-import=false %s -tf-graph-as-function -tf-control-output-arrays=var1_add,var1_add -o - 2>&1 | FileCheck %s --check-prefix=UNIQUE --dump-input=fail +# RUN: not tf-mlir-translate -graphdef-to-mlir -tf-enable-shape-inference-on-import=false %s -tf-graph-as-function -tf-control-output-arrays=var3_add -o - 2>&1 | FileCheck %s --check-prefix=MISSING 
--dump-input=fail node { name: "arg0" @@ -194,12 +194,10 @@ versions { # CHECK-DAG: %[[VAR_ADD_2:.*]] = tf_executor.island wraps "tf.AssignAddVariableOp"(%[[ARG_2]], %{{.*}}) # CHECK: tf_executor.fetch %{{.*}}, %[[VAR_ADD_1]], %[[VAR_ADD_2]] - # Test duplicate control ret node names. # UNIQUE: Control outputs must be unique - # Test missing control ret node name. # MISSING: Control output 'var3_add' is missing diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-as-function-retval-of-arg.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-as-function-retval-of-arg.pbtxt index e4340c5cda0..82a3ba97d71 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-as-function-retval-of-arg.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-as-function-retval-of-arg.pbtxt @@ -1,4 +1,4 @@ -# RUN: tf-mlir-translate -graphdef-to-mlir %s -tf-graph-as-function -o - | FileCheck %s --dump-input=fail +# RUN: tf-mlir-translate -graphdef-to-mlir -tf-enable-shape-inference-on-import=false %s -tf-graph-as-function -o - | FileCheck %s --dump-input=fail node { name: "arg" diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-as-function.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-as-function.pbtxt index 3052db812b8..d26585edb03 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-as-function.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-as-function.pbtxt @@ -1,10 +1,10 @@ -# RUN: tf-mlir-translate -graphdef-to-mlir %s -tf-graph-as-function -o - | FileCheck %s --dump-input=fail +# RUN: tf-mlir-translate -graphdef-to-mlir -tf-enable-shape-inference-on-import=false %s -tf-graph-as-function -o - | FileCheck %s --dump-input=fail # Verify main graph was converted to a function, args/rets are mapped correctly, # and ops in the main graph are retained. In addition, check if subsequent # functions are converted. 
-# CHECK: func @main(%arg0: tensor<*x!tf.resource>, %arg1: tensor<*x!tf.resource>>, %arg2: tensor<*xf32>, %arg3: tensor<2x4x6x8xi32>) -> (tensor, tensor) +# CHECK: func @main(%arg0: tensor<*x!tf.resource>, %arg1: tensor<*x!tf.resource>>, %arg2: tensor<*xf32>, %arg3: tensor<2x4x6x8xi32>) -> (tensor<*xf32>, tensor<*xf32>) # CHECK-SAME: control_outputs = "" # CHECK-SAME: inputs = "args_0,args_1,args_2,args_3" # CHECK-SAME: outputs = "rets_0,rets_1" @@ -12,7 +12,7 @@ # CHECK: %[[ISLAND_1:.*]], %[[ISLAND_1_control:.*]] = tf_executor.island wraps "tf.Identity"(%[[ISLAND_0]]) # CHECK: %[[ISLAND_2:.*]], %[[ISLAND_2_control:.*]] = tf_executor.island wraps "tf.StatefulPartitionedCall" # CHECK-SAME: f = @[[FUNC:[a-z0-9]*]] -# CHECK: tf_executor.fetch %[[ISLAND_1]], %[[ISLAND_2]] : tensor, tensor +# CHECK: tf_executor.fetch %[[ISLAND_1]], %[[ISLAND_2]] : tensor<*xf32>, tensor<*xf32> # CHECK: func @[[FUNC]](%arg0: tensor<*xf32>, %arg1: tensor<*x!tf.resource>) -> tensor<*xf32> node { diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-custom-operation.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-custom-operation.pbtxt index 207d6676f61..cf08d55b3cb 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-custom-operation.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-custom-operation.pbtxt @@ -1,4 +1,4 @@ -# RUN: tf-mlir-translate -graphdef-to-mlir %s -o - | FileCheck %s +# RUN: tf-mlir-translate -graphdef-to-mlir -tf-enable-shape-inference-on-import=false %s -o - | FileCheck %s node { name: "Constant" diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-default-attr.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-default-attr.pbtxt index 75002f538d6..aa47f811ab0 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-default-attr.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-default-attr.pbtxt @@ -1,4 +1,4 @@ -# RUN: tf-mlir-translate -graphdef-to-mlir %s -o - | FileCheck %s --dump-input-on-failure +# RUN: tf-mlir-translate -graphdef-to-mlir -tf-enable-shape-inference-on-import=false %s -o - | FileCheck %s --dump-input-on-failure # Verify that the data_format attributes is pulled from the default value in the # registry when not present in the GraphDef @@ -9,7 +9,7 @@ # export. 
# CHECK: tf.MaxPool # CHECK-NOT: T = f32 -# CHECK-SAME: : (tensor) -> tensor +# CHECK-SAME: : (tensor<*xf32>) -> tensor<*xf32> node { name: "input" diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-device-retval.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-device-retval.pbtxt index 157db7d5331..327260e2860 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-device-retval.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-device-retval.pbtxt @@ -1,4 +1,4 @@ -# RUN: tf-mlir-translate -graphdef-to-mlir %s -o - | FileCheck %s +# RUN: tf-mlir-translate -graphdef-to-mlir -tf-enable-shape-inference-on-import=false %s -o - | FileCheck %s node { name: "PartitionedCall" @@ -76,7 +76,7 @@ library { # ensure that kDeviceRetOp is used instead of kRetOp # CHECK-LABEL: func @foo # CHECK: tf.experimental_ints_on_device = true - # CHECK: return %{{.*}} tensor + # CHECK: return %{{.*}} tensor<{{.*}}i32> attr { key: "experimental_ints_on_device" value { diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-empty-tensor-content.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-empty-tensor-content.pbtxt index 12d05c1195f..f41089f27e6 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-empty-tensor-content.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-empty-tensor-content.pbtxt @@ -1,4 +1,4 @@ -# RUN: tf-mlir-translate -graphdef-to-mlir %s -o - | FileCheck %s +# RUN: tf-mlir-translate -graphdef-to-mlir -tf-enable-shape-inference-on-import=false %s -o - | FileCheck %s # This test is intended to verify the tensor_content field on import of an empty # tensor. diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-func-attr.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-func-attr.pbtxt index 0176edb4b21..eb909834357 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-func-attr.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-func-attr.pbtxt @@ -1,4 +1,4 @@ -# RUN: tf-mlir-translate -graphdef-to-mlir %s -o - | FileCheck %s +# RUN: tf-mlir-translate -graphdef-to-mlir -tf-enable-shape-inference-on-import=false %s -o - | FileCheck %s # CHECK-LABEL: func @main() { diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-call.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-call.pbtxt index f0a7a574ae3..fa6f63e27a5 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-call.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-call.pbtxt @@ -1,4 +1,4 @@ -# RUN: tf-mlir-translate -graphdef-to-mlir %s -tf-input-arrays=x -tf-input-data-types=DT_INT32 -tf-input-shapes=10 -tf-output-arrays=func_call -o - | FileCheck %s +# RUN: tf-mlir-translate -graphdef-to-mlir -tf-enable-shape-inference-on-import=false %s -tf-input-arrays=x -tf-input-data-types=DT_INT32 -tf-input-shapes=10 -tf-output-arrays=func_call -o - | FileCheck %s node { name: "x" diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-control-ret-diff-island.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-control-ret-diff-island.pbtxt index 2c6523700e5..e85f1078d43 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-control-ret-diff-island.pbtxt +++ 
b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-control-ret-diff-island.pbtxt @@ -1,4 +1,4 @@ -# RUN: tf-mlir-translate -graphdef-to-mlir %s -o - | FileCheck %s --dump-input=fail +# RUN: tf-mlir-translate -graphdef-to-mlir -tf-enable-shape-inference-on-import=false %s -o - | FileCheck %s --dump-input=fail # Verify for functions with control return values, the island with only a # consumed control return value has its control output added to the GraphOps diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-control-ret-same-island.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-control-ret-same-island.pbtxt index 7b4804cc801..ab97f6f9c32 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-control-ret-same-island.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-control-ret-same-island.pbtxt @@ -1,4 +1,4 @@ -# RUN: tf-mlir-translate -graphdef-to-mlir %s -o - | FileCheck %s --dump-input=fail +# RUN: tf-mlir-translate -graphdef-to-mlir -tf-enable-shape-inference-on-import=false %s -o - | FileCheck %s --dump-input=fail # Verify for functions with control return values, the island with a consumed # data output and a consumed control has both its outputs added to the GraphOps diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-defs.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-defs.pbtxt index 6a2a411d115..10c4d35b5eb 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-defs.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-defs.pbtxt @@ -1,4 +1,4 @@ -# RUN: tf-mlir-translate -graphdef-to-mlir %s -o - | FileCheck %s +# RUN: tf-mlir-translate -graphdef-to-mlir -tf-enable-shape-inference-on-import=false %s -o - | FileCheck %s # Verify that we properly import call site function attributes. # CHECK: tf.If diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-input-shapes.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-input-shapes.pbtxt index fc27e82d20e..9d47292f806 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-input-shapes.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-input-shapes.pbtxt @@ -1,4 +1,4 @@ -# RUN: tf-mlir-translate -graphdef-to-mlir %s -o - | FileCheck %s +# RUN: tf-mlir-translate -graphdef-to-mlir -tf-enable-shape-inference-on-import=false %s -o - | FileCheck %s # Verify that the _input_shapes attribute of the FunctionDef is respected. # This also checks that the output type is correctly inferred based on diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-name-bug.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-name-bug.pbtxt index 563007f4305..9737325a499 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-name-bug.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-name-bug.pbtxt @@ -1,4 +1,4 @@ -# RUN: tf-mlir-translate -graphdef-to-mlir %s -o - | FileCheck %s +# RUN: tf-mlir-translate -graphdef-to-mlir -tf-enable-shape-inference-on-import=false %s -o - | FileCheck %s # This test is tailored to reproduce b/141617294. 
In particular, the function # library contains "foo1", "foo2", ..., "foo20", from which "foo1" and "foo11" diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-resource-args.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-resource-args.pbtxt index df740bc6ccd..0e6e561225d 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-resource-args.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-resource-args.pbtxt @@ -1,4 +1,4 @@ -# RUN: tf-mlir-translate -graphdef-to-mlir %s -tf-output-arrays=func_call -o - | FileCheck %s +# RUN: tf-mlir-translate -graphdef-to-mlir -tf-enable-shape-inference-on-import=false %s -tf-output-arrays=func_call -o - | FileCheck %s node { name: "x" diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-static-output.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-static-output.pbtxt deleted file mode 100644 index e0e60c04865..00000000000 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-static-output.pbtxt +++ /dev/null @@ -1,145 +0,0 @@ -# RUN: tf-mlir-translate -graphdef-to-mlir %s -o - | FileCheck %s - -# Verify that the return type of the functions is properly inferred -#CHECK: func @get_zeros0(%arg0: tensor<*xi32>) -> tensor<2xi32> -#CHECK: func @identity0(%arg0: tensor<*xi32>) -> tensor<*xi32> - -node { - name: "Placeholder" - op: "Placeholder" - attr { - key: "dtype" - value { - type: DT_BOOL - } - } - experimental_debug_info { - } -} -node { - name: "Placeholder_1" - op: "Placeholder" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - experimental_debug_info { - } -} -node { - name: "If" - op: "If" - input: "Placeholder" - input: "Placeholder_1" - attr { - key: "Tcond" - value { - type: DT_BOOL - } - } - attr { - key: "Tin" - value { - list { - type: DT_INT32 - } - } - } - attr { - key: "Tout" - value { - list { - type: DT_INT32 - } - } - } - attr { - key: "else_branch" - value { - func { - name: "get_zeros" - } - } - } - attr { - key: "then_branch" - value { - func { - name: "identity" - } - } - } - experimental_debug_info { - } -} -library { - function { - signature { - name: "get_zeros" - input_arg { - name: "get_zeros" - type: DT_INT32 - } - output_arg { - name: "get_zeros1" - type: DT_INT32 - } - } - node_def { - name: "const" - op: "Const" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - dim { - size: 2 - } - } - int_val: 1 - int_val: 2 - } - } - } - experimental_debug_info { - original_node_names: "const" - } - } - ret { - key: "get_zeros1" - value: "const:output:0" - } - } - function { - signature { - name: "identity" - input_arg { - name: "identity" - type: DT_INT32 - } - output_arg { - name: "identity1" - type: DT_INT32 - } - } - ret { - key: "identity1" - value: "identity" - } - } -} -versions { - producer: 29 - min_consumer: 12 -} - diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-variable-shapes.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-variable-shapes.pbtxt deleted file mode 100644 index e75fe8c9d67..00000000000 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-variable-shapes.pbtxt +++ /dev/null @@ -1,177 +0,0 @@ -# RUN: tf-mlir-translate -graphdef-to-mlir %s -o - | FileCheck %s - -# Verify that the _output_shapes attribute of 
ReadVariableOp's are used to get -# variable types. -# This also checks that the output type is correctly inferred based on -# that. -# CHECK: func @__inference_some_function_130(%arg0: tensor<*x!tf.resource>) -> tensor -# CHECK: tf.ReadVariableOp"(%arg0) {{.*}} : (tensor<*x!tf.resource>) -> tensor - - -node { - name : "Variable" - op : "VarHandleOp" - attr { - key : "shape" - value { - shape { - } - } - } - attr { - key : "dtype" - value { - type : DT_FLOAT - } - } - attr { - key : "shared_name" - value { - s: "Variable" - } - } - attr { - key : "_output_shapes" - value { - list { - shape { - } - } - } - } -} -node { - name : "StatefulPartitionedCall" - op : "StatefulPartitionedCall" - input : [ "Variable" ] - attr { - key : "f" - value { - func { - name: "__inference_some_function_13" - } - } - } - attr { - key : "config_proto" - value { - s: "\n\x07\n\x03GPU\x10\x00\n\x07\n\x03\x43PU\x10\x01\x32\x02J\x00\x38\x01" - } - } - attr { - key : "Tout" - value { - list { - type : [ DT_FLOAT ] - } - } - } - attr { - key : "_gradient_op_type" - value { - s: "PartitionedCall-29" - } - } - attr { - key : "_output_shapes" - value { - list { - shape { - } - } - } - } - attr { - key : "Tin" - value { - list { - type : [ DT_RESOURCE ] - } - } - } -} -library { - function { - signature { - name: "__inference_some_function_13" - input_arg { - name : "readvariableop_resource" - type : DT_RESOURCE - } - output_arg { - name : "identity" - type : DT_FLOAT - } - is_stateful : true - control_output: [ "ReadVariableOp" ] - } - node_def { - name : "ReadVariableOp" - op : "ReadVariableOp" - input : [ "readvariableop_resource" ] - device: "/job:localhost/replica:0/task:0/device:CPU:0" - attr { - key : "dtype" - value { - type : DT_FLOAT - } - } - attr { - key : "_output_shapes" - value { - list { - shape { - } - } - } - } - } - node_def { - name : "Identity" - op : "Identity" - input : [ "ReadVariableOp:value:0", "^ReadVariableOp" ] - attr { - key : "T" - value { - type : DT_FLOAT - } - } - attr { - key : "_output_shapes" - value { - list { - shape { - } - } - } - } - } - ret { - key : "identity" - value: "Identity:output:0" - } - attr { - key : "_input_shapes" - value { - list { - shape { - unknown_rank: true - } - } - } - } - control_ret { - key : "ReadVariableOp" - value: "ReadVariableOp" - } - arg_attr { - key : 0x00000000 - value { - } - } - } -} -versions { - producer : 148 - min_consumer : 12 -} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-gradient-def.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-gradient-def.pbtxt index 5ab948eba37..e7f7a59a343 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-gradient-def.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-gradient-def.pbtxt @@ -1,4 +1,4 @@ -# RUN: tf-mlir-translate -graphdef-to-mlir %s -o - | FileCheck %s +# RUN: tf-mlir-translate -graphdef-to-mlir -tf-enable-shape-inference-on-import=false %s -o - | FileCheck %s # In GraphDef custom gradient functions are modeled using GradientDef which # links the function and its gradient. 
In MLIR a TF ops gradient function is diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-input-func-arg-name-collision.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-input-func-arg-name-collision.pbtxt index ba94c600cf2..bf210e51288 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-input-func-arg-name-collision.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-input-func-arg-name-collision.pbtxt @@ -1,9 +1,9 @@ -# RUN: tf-mlir-translate -graphdef-to-mlir %s -tf-input-arrays=input -tf-input-data-types=DT_INT32 -tf-input-shapes='' -tf-output-arrays=while:2 -o - | FileCheck %s +# RUN: tf-mlir-translate -graphdef-to-mlir -tf-enable-shape-inference-on-import=false %s -tf-input-arrays=input -tf-input-data-types=DT_INT32 -tf-input-shapes='' -tf-output-arrays=while:2 -o - | FileCheck %s # This check that we don't error out when importing GraphDef containing # functions with arg name that are the same as the graph input name -# CHECK: func @main(%arg0: tensor) -> tensor +# CHECK: func @main(%arg0: tensor<{{.*}}i32>) -> tensor<{{.*}}i32> # CHECK: func @while_body # CHECK: func @while_cond diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-library.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-library.pbtxt index d147106579d..b28df8e2c69 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-library.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-library.pbtxt @@ -1,4 +1,4 @@ -# RUN: tf-mlir-translate -graphdef-to-mlir %s -o - | FileCheck %s +# RUN: tf-mlir-translate -graphdef-to-mlir -tf-enable-shape-inference-on-import=false %s -o - | FileCheck %s node { name: "unnamed" diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-malformed.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-malformed.pbtxt index a201ccee1fa..af884fe9634 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-malformed.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-malformed.pbtxt @@ -1,4 +1,4 @@ -# RUN: not tf-mlir-translate -graphdef-to-mlir %s -o - 2>&1 | FileCheck %s +# RUN: not tf-mlir-translate -graphdef-to-mlir -tf-enable-shape-inference-on-import=false %s -o - 2>&1 | FileCheck %s this is not a valid graph def diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-scalar-input.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-scalar-input.pbtxt index 6ffe4bfbed2..568188f040e 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-scalar-input.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-scalar-input.pbtxt @@ -1,16 +1,16 @@ -# RUN: tf-mlir-translate -graphdef-to-mlir %s -tf-input-arrays=input -tf-input-data-types=DT_FLOAT -tf-input-shapes='' -tf-output-arrays=out:1,out -o - | FileCheck %s +# RUN: tf-mlir-translate -graphdef-to-mlir -tf-enable-shape-inference-on-import=false %s -tf-input-arrays=input -tf-input-data-types=DT_FLOAT -tf-input-shapes='' -tf-output-arrays=out:1,out -o - | FileCheck %s # Verify that we match correctly the input / output when they are scalar. 
# CHECK-LABEL: func @main -# CHECK-SAME: (%{{[a-z0-9]+}}: tensor {tf.device = "/device:CPU:0"}) -> (tensor, tensor) +# CHECK-SAME: (%{{[a-z0-9]+}}: tensor {tf.device = "/device:CPU:0"}) # CHECK-SAME: control_outputs = "" # CHECK-SAME: inputs = "input" # CHECK-SAME: outputs = "out:1,out" # CHECK: tf.Relu # CHECK: %[[IDENTITY:[a-z_0-9]+]]:2, {{.*}} = tf_executor.island wraps "tf.IdentityN" -# CHECK: etch %[[IDENTITY]]#1, %[[IDENTITY]]#0 : tensor, tensor +# CHECK: etch %[[IDENTITY]]#1, %[[IDENTITY]]#0 node { name: "input" diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-uint8-return.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-uint8-return.pbtxt index 191ff5878ee..366e78d0834 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-uint8-return.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-uint8-return.pbtxt @@ -1,4 +1,4 @@ -# RUN: tf-mlir-translate -graphdef-to-mlir -mlir-print-debuginfo %s -o - | FileCheck %s +# RUN: tf-mlir-translate -graphdef-to-mlir -tf-enable-shape-inference-on-import=false -mlir-print-debuginfo %s -o - | FileCheck %s --dump-input-on-failure node { name: "PartitionedCall" @@ -106,5 +106,5 @@ versions { # CHECK: func @main # CHECK: "tf.PartitionedCall"() # CHECK-SAME: f = @[[FUNCTION:[A-Za-z0-9_]*]] -# CHECK: func @[[FUNCTION]]() -> tensor -# CHECK: return {{.*}} : tensor +# CHECK: func @[[FUNCTION]]() -> tensor<*xui8> +# CHECK: return {{.*}} : tensor<*xui8> diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-undefined-output.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-undefined-output.pbtxt index 4a778f1945e..3ac8804ce47 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-undefined-output.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-undefined-output.pbtxt @@ -1,4 +1,4 @@ -# RUN: not tf-mlir-translate -graphdef-to-mlir %s -tf-input-arrays=input -tf-input-data-types=DT_FLOAT -tf-input-shapes='' -tf-output-arrays=NotANodeInTheGraph -o - 2>&1 | FileCheck %s +# RUN: not tf-mlir-translate -graphdef-to-mlir -tf-enable-shape-inference-on-import=false %s -tf-input-arrays=input -tf-input-data-types=DT_FLOAT -tf-input-shapes='' -tf-output-arrays=NotANodeInTheGraph -o - 2>&1 | FileCheck %s # CHECK: Graph import failed: Invalid argument: Output NotANodeInTheGraph was not found in graph diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-version-info.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-version-info.pbtxt index 20bf33d7fb2..926de91e76d 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-version-info.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-version-info.pbtxt @@ -1,4 +1,4 @@ -# RUN: tf-mlir-translate -graphdef-to-mlir -mlir-print-debuginfo %s -o - | FileCheck %s +# RUN: tf-mlir-translate -graphdef-to-mlir -tf-enable-shape-inference-on-import=false -mlir-print-debuginfo %s -o - | FileCheck %s node { name: "x" diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-while-loop.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-while-loop.pbtxt index 16cdde94712..e21fd901a9e 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-while-loop.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-while-loop.pbtxt @@ -1,4 +1,4 @@ -# RUN: tf-mlir-translate -graphdef-to-mlir -mlir-print-debuginfo %s -o - | 
FileCheck %s +# RUN: tf-mlir-translate -graphdef-to-mlir -tf-enable-shape-inference-on-import=false -mlir-print-debuginfo %s -o - | FileCheck %s # Verify that importing a Graph with a backedge leads to two NextIteration nodes # to break the cycle. diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/invalid-output-index.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/invalid-output-index.pbtxt index 77107824319..1b14a733ba2 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/invalid-output-index.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/invalid-output-index.pbtxt @@ -1,4 +1,4 @@ -# RUN: not tf-mlir-translate -graphdef-to-mlir %s -tf-input-arrays=input -tf-input-data-types=DT_FLOAT -tf-input-shapes='' -tf-output-arrays=input:1 -o - 2>&1 | FileCheck %s +# RUN: not tf-mlir-translate -graphdef-to-mlir -tf-enable-shape-inference-on-import=false %s -tf-input-arrays=input -tf-input-data-types=DT_FLOAT -tf-input-shapes='' -tf-output-arrays=input:1 -o - 2>&1 | FileCheck %s # CHECK: Graph import failed: Invalid argument: Invalid output index 1 specified for node: input diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/legacy-fed-input-without-inputs.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/legacy-fed-input-without-inputs.pbtxt index 81fff4d64a8..5b3660e7bed 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/legacy-fed-input-without-inputs.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/legacy-fed-input-without-inputs.pbtxt @@ -1,5 +1,5 @@ -# RUN: tf-mlir-translate -graphdef-to-mlir %s -tf-input-arrays=input -tf-input-data-types=DT_FLOAT -tf-input-shapes='' -tf-output-arrays=input -tf-convert-legacy-fed-inputs -o - | FileCheck %s -# RUN: tf-mlir-translate -graphdef-to-mlir %s -tf-input-arrays=input -tf-input-shapes='' -tf-output-arrays=input -tf-convert-legacy-fed-inputs -o - | FileCheck --check-prefix=NODATATYPE %s +# RUN: tf-mlir-translate -graphdef-to-mlir -tf-enable-shape-inference-on-import=false %s -tf-input-arrays=input -tf-input-data-types=DT_FLOAT -tf-input-shapes='' -tf-output-arrays=input -tf-convert-legacy-fed-inputs -o - | FileCheck %s +# RUN: tf-mlir-translate -graphdef-to-mlir -tf-enable-shape-inference-on-import=false %s -tf-input-arrays=input -tf-input-shapes='' -tf-output-arrays=input -tf-convert-legacy-fed-inputs -o - | FileCheck --check-prefix=NODATATYPE %s # Verify that invalid LegacyFedInput ops without any inputs are replaced with # Placeholder ops. 
diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/mlir_passthrough_op.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/mlir_passthrough_op.pbtxt index da79023093c..fd33be7baaa 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/mlir_passthrough_op.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/mlir_passthrough_op.pbtxt @@ -1,7 +1,7 @@ -# RUN: tf-mlir-translate -graphdef-to-mlir %s | FileCheck %s +# RUN: tf-mlir-translate -graphdef-to-mlir -tf-enable-shape-inference-on-import=false %s | FileCheck %s # CHECK:"tf.MlirPassthroughOp" -# CHECK: mlir_module = "\0Afunc @main(%arg0 : tensor<10xf32>, %arg1 : tensor<10xf32>) -> tensor<10x10xf32> {\0A %add = \22tf.Add\22(%arg0, %arg1) : (tensor<10xf32>, tensor<10xf32>) -> tensor<10xf32>\0A %ret = \22magic.op\22(%add, %add) : (tensor<10xf32>, tensor<10xf32>) -> tensor<10x10xf32>\0A return %ret : tensor<10x10xf32>\0A}\0A"} : (tensor<10xf32>, tensor<10xf32>) -> tensor<*xf32> +# CHECK: mlir_module = "\0Afunc @main(%arg0 : tensor<10xf32>, %arg1 : tensor<10xf32>) -> tensor<10x10xf32> {\0A %add = \22tf.Add\22(%arg0, %arg1) : (tensor<10xf32>, tensor<10xf32>) -> tensor<10xf32>\0A %ret = \22magic.op\22(%add, %add) : (tensor<10xf32>, tensor<10xf32>) -> tensor<10x10xf32>\0A return %ret : tensor<10x10xf32>\0A}\0A"} : (tensor<10xf32>, tensor<10xf32>) -> tensor<*xf32> node { name: "x" diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/multi-output-feeds.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/multi-output-feeds.pbtxt index a755f1ff2b1..1ea045b9f77 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/multi-output-feeds.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/multi-output-feeds.pbtxt @@ -1,6 +1,6 @@ -# RUN: tf-mlir-translate -graphdef-to-mlir %s -tf-input-arrays=z:1,z:2 -tf-input-shapes=':' -tf-output-arrays=z:2,z:1,a:0 -o - | FileCheck %s --dump-input=fail -# RUN: tf-mlir-translate -graphdef-to-mlir %s -tf-prune-unused-nodes -tf-input-arrays=z:1,z:2 -tf-input-shapes=':' -tf-output-arrays=z:2,z:1,a:0 -o - | FileCheck --check-prefix=PRUNE %s --dump-input=fail -# RUN: tf-mlir-translate -graphdef-to-mlir %s -tf-prune-unused-nodes -tf-input-arrays=z:1,z:2 -tf-input-shapes=':' -tf-output-arrays=z:0,a:0 -o - | FileCheck --check-prefix=PRESERVE %s --dump-input=fail +# RUN: tf-mlir-translate -graphdef-to-mlir -tf-enable-shape-inference-on-import=false %s -tf-input-arrays=z:1,z:2 -tf-input-shapes=':' -tf-output-arrays=z:2,z:1,a:0 -o - | FileCheck %s --dump-input=fail +# RUN: tf-mlir-translate -graphdef-to-mlir -tf-enable-shape-inference-on-import=false %s -tf-prune-unused-nodes -tf-input-arrays=z:1,z:2 -tf-input-shapes=':' -tf-output-arrays=z:2,z:1,a:0 -o - | FileCheck --check-prefix=PRUNE %s --dump-input=fail +# RUN: tf-mlir-translate -graphdef-to-mlir -tf-enable-shape-inference-on-import=false %s -tf-prune-unused-nodes -tf-input-arrays=z:1,z:2 -tf-input-shapes=':' -tf-output-arrays=z:0,a:0 -o - | FileCheck --check-prefix=PRESERVE %s --dump-input=fail # Generated in Python via # ``` @@ -11,7 +11,7 @@ # x = tf.constant(3.0) # y = tf.constant(4.0) # var = tf.Variable(2.0) -# var_add = var.assign_add(3.0) +# var_add = var.assign_add(1.0) # with g.control_dependencies([var_add]): # z0, z1, z2 = tf.identity_n((w, x, y)) # @@ -198,7 +198,7 @@ node { dtype: DT_FLOAT tensor_shape { } - float_val: 3.0 + float_val: 1.0 } } } @@ -269,7 +269,7 @@ versions { # of the feed. 
# # CHECK-LABEL: func @main -# CHECK-SAME: (%[[ARG_0:.*]]: tensor, %[[ARG_1:.*]]: tensor) -> (tensor, tensor, tensor) +# CHECK-SAME: (%[[ARG_0:.*]]: tensor, %[[ARG_1:.*]]: tensor) -> (tensor, tensor, tensor<*xf32>) # CHECK-SAME: control_outputs = "" # CHECK-SAME: inputs = "z:1,z:2" # CHECK-SAME: outputs = "z:2,z:1,a:0" @@ -282,7 +282,7 @@ versions { # unreachable are pruned if pruning is enabled. # # PRUNE-LABEL: func @main -# PRUNE-SAME: (%[[ARG_0:.*]]: tensor, %[[ARG_1:.*]]: tensor) -> (tensor, tensor, tensor) +# PRUNE-SAME: (%[[ARG_0:.*]]: tensor, %[[ARG_1:.*]]: tensor) -> (tensor, tensor, tensor<*xf32>) # PRUNE-SAME: control_outputs = "" # PRUNE-SAME: inputs = "z:1,z:2" # PRUNE-SAME: outputs = "z:2,z:1,a:0" @@ -299,7 +299,7 @@ versions { # unreachable are preserved if pruning is not enabled. # # PRESERVE-LABEL: func @main -# PRESERVE-SAME: (%[[ARG_0:.*]]: tensor, %[[ARG_1:.*]]: tensor) -> (tensor, tensor) +# PRESERVE-SAME: (%[[ARG_0:.*]]: tensor, %[[ARG_1:.*]]: tensor) -> (tensor<*xf32>, tensor<*xf32>) # PRESERVE-SAME: control_outputs = "" # PRESERVE-SAME: inputs = "z:1,z:2" # PRESERVE-SAME: outputs = "z:0,a:0" diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/multiple-use-next-iteration.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/multiple-use-next-iteration.pbtxt index 8199484e25e..38b573f2437 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/multiple-use-next-iteration.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/multiple-use-next-iteration.pbtxt @@ -1,4 +1,4 @@ -# RUN: tf-mlir-translate -graphdef-to-mlir %s -o - | FileCheck %s +# RUN: tf-mlir-translate -graphdef-to-mlir -tf-enable-shape-inference-on-import=false %s -o - | FileCheck %s # Verify that a NextIteration node feeding two different merge nodes is properly # Imported. diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/node-locations.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/node-locations.pbtxt index fdf279f3887..82bd09130f9 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/node-locations.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/node-locations.pbtxt @@ -1,4 +1,4 @@ -# RUN: tf-mlir-translate -graphdef-to-mlir -mlir-print-debuginfo %s -o - | FileCheck %s +# RUN: tf-mlir-translate -graphdef-to-mlir -tf-enable-shape-inference-on-import=false -mlir-print-debuginfo %s -o - | FileCheck %s # Check that we correctly import the node locations. 
diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/output-shapes-attr.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/output-shapes-attr.pbtxt index 2c93fde5bf2..513e0b2ae59 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/output-shapes-attr.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/output-shapes-attr.pbtxt @@ -1,4 +1,4 @@ -# RUN: tf-mlir-translate -graphdef-to-mlir %s -o - | FileCheck %s +# RUN: tf-mlir-translate -graphdef-to-mlir -tf-enable-shape-inference-on-import=false %s -o - | FileCheck %s node { name: "input0" diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/parse_example.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/parse_example.pbtxt index 7411a5ea4d7..ec7f0117a8c 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/parse_example.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/parse_example.pbtxt @@ -1,8 +1,8 @@ -# RUN: tf-mlir-translate -graphdef-to-mlir %s -tf-input-arrays=input0 -tf-input-data-types=DT_STRING -tf-input-shapes=32 -tf-output-arrays=ParseExample/ParseExampleV2:0,ParseExample/ParseExampleV2:7 -o - | FileCheck %s +# RUN: tf-mlir-translate -graphdef-to-mlir -tf-enable-shape-inference-on-import=false %s -tf-input-arrays=input0 -tf-input-data-types=DT_STRING -tf-input-shapes=32 -tf-output-arrays=ParseExample/ParseExampleV2:0,ParseExample/ParseExampleV2:7 -o - | FileCheck %s # CHECK: %[[parse_example:.*]]:8, %[[parse_example_control:.*]] = tf_executor.island wraps "tf.ParseExampleV2"(%arg0, # CHECK: result_segment_sizes = dense<[2, 2, 2, 2, 0, 0]> : vector<6xi32> -# CHECK: tf_executor.fetch %[[parse_example]]#0, %[[parse_example]]#7 : tensor, tensor<32xf32> +# CHECK: tf_executor.fetch %[[parse_example]]#0, %[[parse_example]]#7 : tensor<*xi64>, tensor<*xf32> node { name: "input0" diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/prune_unused_nodes.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/prune_unused_nodes.pbtxt index 64054cd2152..50b59ad2afa 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/prune_unused_nodes.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/prune_unused_nodes.pbtxt @@ -1,4 +1,4 @@ -# RUN: tf-mlir-translate -graphdef-to-mlir %s -tf-prune-unused-nodes -tf-input-arrays=input0,input1 -tf-input-data-types=DT_INT32,DT_INT32 -tf-input-shapes=10:10 -tf-output-arrays=Add -o - | FileCheck %s +# RUN: tf-mlir-translate -graphdef-to-mlir -tf-enable-shape-inference-on-import=false %s -tf-prune-unused-nodes -tf-input-arrays=input0,input1 -tf-input-data-types=DT_INT32,DT_INT32 -tf-input-shapes=10:10 -tf-output-arrays=Add -o - | FileCheck %s # Verify that an unused Node (here named "Prune") isn't converted when we # request pruning on import. 
diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/quint8-const.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/quint8-const.pbtxt index cf8051f7aaa..0a8db4260fe 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/quint8-const.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/quint8-const.pbtxt @@ -1,4 +1,4 @@ -# RUN: tf-mlir-translate -graphdef-to-mlir %s -o - -mlir-print-debuginfo | FileCheck %s +# RUN: tf-mlir-translate -graphdef-to-mlir -tf-enable-shape-inference-on-import=false %s -o - -mlir-print-debuginfo | FileCheck %s node { name: "Quantized_Constant" diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/stateful-attribute.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/stateful-attribute.pbtxt index cb4b00f93be..7a395d2d345 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/stateful-attribute.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/stateful-attribute.pbtxt @@ -1,4 +1,4 @@ -# RUN: tf-mlir-translate -graphdef-to-mlir %s -o - | FileCheck %s +# RUN: tf-mlir-translate -graphdef-to-mlir -tf-enable-shape-inference-on-import=false %s -o - | FileCheck %s node { name: "Call_foo" diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/string-attr.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/string-attr.pbtxt index 051b88102be..e346ff6affe 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/string-attr.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/string-attr.pbtxt @@ -1,7 +1,7 @@ # RUN: tf-mlir-translate -graphdef-to-splatted-mlir %s -o - | FileCheck %s # CHECK: tf.Const -# CHECK-SAME: value = opaque<"tf", "0x746674656E736F722464747970653A2044545F535452494E472074656E736F725F7368617065207B2064696D207B2073697A653A2033207D207D2074656E736F725F636F6E74656E743A20225C3030305C3030305C30303022"> : tensor<3x!tf.string> +# CHECK-SAME: value = dense<""> : tensor<3x!tf.string> node { name: "save/SaveV2/shape_and_slices" diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/switch_n.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/switch_n.pbtxt index e819efcddd1..5ca72d1a854 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/switch_n.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/switch_n.pbtxt @@ -1,11 +1,11 @@ # RUN: tf-mlir-translate -graphdef-to-splatted-mlir %s -o - -mlir-print-debuginfo | FileCheck %s --dump-input-on-failure # CHECK: tf_executor.SwitchN -# CHECK-SAME: of 3 : tensor +# CHECK-SAME: of 3 : tensor<*xi32> # CHECK-SAME: T = i32 # CHECK-SAME: loc("Case/branch_index/_3") # CHECK: tf_executor.SwitchN -# CHECK-SAME: of 2 : tensor +# CHECK-SAME: of 2 : tensor<*xf32> # CHECK-SAME: T = f32 # CHECK-SAME: loc("Case/Case/input_0/_7") diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/target.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/target.pbtxt index fbb979c28a4..9f37aeed1d6 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/target.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/target.pbtxt @@ -1,6 +1,6 @@ -# RUN: tf-mlir-translate -graphdef-to-mlir %s -tf-control-output-arrays=AssignAdd -o - | FileCheck %s --dump-input=fail -# RUN: tf-mlir-translate -graphdef-to-mlir %s -tf-prune-unused-nodes -tf-control-output-arrays=AssignAdd -o - | FileCheck --check-prefix=PRUNE %s --dump-input=fail -# RUN: tf-mlir-translate -graphdef-to-mlir %s 
-tf-prune-unused-nodes -tf-control-output-arrays=Variable/Assign,AssignAdd -o - | FileCheck --check-prefix=PRESERVE %s --dump-input=fail +# RUN: tf-mlir-translate -graphdef-to-mlir -tf-enable-shape-inference-on-import=false %s -tf-control-output-arrays=AssignAdd -o - | FileCheck %s --dump-input=fail +# RUN: tf-mlir-translate -graphdef-to-mlir -tf-enable-shape-inference-on-import=false %s -tf-prune-unused-nodes -tf-control-output-arrays=AssignAdd -o - | FileCheck --check-prefix=PRUNE %s --dump-input=fail +# RUN: tf-mlir-translate -graphdef-to-mlir -tf-enable-shape-inference-on-import=false %s -tf-prune-unused-nodes -tf-control-output-arrays=Variable/Assign,AssignAdd -o - | FileCheck --check-prefix=PRESERVE %s --dump-input=fail # Generated in Python via # ``` diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/tensor-list.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/tensor-list.pbtxt index cc24caae6e8..88d9006cf26 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/tensor-list.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/tensor-list.pbtxt @@ -1,4 +1,4 @@ -# RUN: tf-mlir-translate -graphdef-to-mlir %s -o - | FileCheck %s +# RUN: tf-mlir-translate -graphdef-to-mlir %s -o - | FileCheck %s --dump-input-on-failure node { name: "TensorListReserve/num_elements" @@ -59,7 +59,7 @@ node { key: "value" value { tensor { - dtype: DT_INT32 + dtype: DT_FLOAT tensor_shape { dim { size: 2 @@ -68,10 +68,10 @@ node { size: 2 } } - int_val: 1 - int_val: 2 - int_val: 3 - int_val: 4 + float_val: 1 + float_val: 2 + float_val: 3 + float_val: 4 } } } @@ -209,10 +209,10 @@ versions { } # Verify that list element shape and dtype are expected. -# CHECK: tf.TensorListReserve{{.*}}(tensor<2xi32>, tensor) -> tensor>> +# CHECK: tf.TensorListReserve{{.*}}(tensor<2xi32>, tensor) -> tensor>> # Nested variant type. 
-# CHECK: tf.TensorListReserve{{.*}}(tensor<2xi32>, tensor) -> tensor>> +# CHECK: tf.TensorListReserve{{.*}}(tensor<2xi32>, tensor) -> tensor>> -# CHECK: tf.TensorListSetItem{{.*}}(tensor>>, tensor, tensor<2x2xf32>) -> tensor>> -# CHECK: tf.TensorListStack{{.*}}(tensor>>, tensor) -> tensor +# CHECK: tf.TensorListSetItem{{.*}}(tensor>>, tensor, tensor<2x2xf32>) -> tensor<*x!tf.variant> +# CHECK: tf.TensorListStack{{.*}}(tensor<*x!tf.variant>, tensor) -> tensor<*xf32> diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/tf-data-pipeline.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/tf-data-pipeline.pbtxt new file mode 100644 index 00000000000..1e640baa507 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/tf-data-pipeline.pbtxt @@ -0,0 +1,256 @@ +# RUN: tf-mlir-translate -graphdef-to-mlir %s -tf-output-arrays=BatchDatasetV2 -o - | FileCheck %s --dump-input-on-failure + +# CHECK-LABEL: func @main() -> tensor<*x!tf.variant> +# CHECK: %[[tensor_slice:.*]], %[[tensor_slice_control:.*]] = tf_executor.island wraps "tf.TensorSliceDataset" +# CHECK: %[[map_dataset:.*]], %[[map_dataset_control:.*]] = tf_executor.island wraps "tf.MapDataset"(%[[tensor_slice]] +# CHECK: %[[batch_dataset:.*]], %[[batch_dataset_control:.*]] = tf_executor.island wraps "tf.BatchDatasetV2"(%[[map_dataset]] + +node { + name: "tensors/normalize_tensors/component_0" + op: "Const" + attr { + key: "dtype" + value { + type: DT_INT32 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT32 + tensor_shape { + dim { + size: 3 + } + } + tensor_content: "\000\000\000\000\001\000\000\000\002\000\000\000" + } + } + } +} +node { + name: "TensorSliceDataset" + op: "TensorSliceDataset" + input: "tensors/normalize_tensors/component_0" + attr { + key: "Toutput_types" + value { + list { + type: DT_INT32 + } + } + } + attr { + key: "output_shapes" + value { + list { + shape { + } + } + } + } +} +node { + name: "MapDataset" + op: "MapDataset" + input: "TensorSliceDataset" + attr { + key: "Targuments" + value { + list { + } + } + } + attr { + key: "f" + value { + func { + name: "__inference_Dataset_map__8" + } + } + } + attr { + key: "output_shapes" + value { + list { + shape { + } + } + } + } + attr { + key: "output_types" + value { + list { + type: DT_INT32 + } + } + } + attr { + key: "preserve_cardinality" + value { + b: false + } + } + attr { + key: "use_inter_op_parallelism" + value { + b: true + } + } +} +node { + name: "batch_size" + op: "Const" + attr { + key: "dtype" + value { + type: DT_INT64 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT64 + tensor_shape { + } + int64_val: 5 + } + } + } +} +node { + name: "drop_remainder" + op: "Const" + attr { + key: "dtype" + value { + type: DT_BOOL + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_BOOL + tensor_shape { + } + bool_val: false + } + } + } +} +node { + name: "BatchDatasetV2" + op: "BatchDatasetV2" + input: "MapDataset" + input: "batch_size" + input: "drop_remainder" + attr { + key: "output_shapes" + value { + list { + shape { + dim { + size: -1 + } + } + } + } + } + attr { + key: "output_types" + value { + list { + type: DT_INT32 + } + } + } + attr { + key: "parallel_copy" + value { + b: false + } + } +} +library { + function { + signature { + name: "__inference_Dataset_map__8" + input_arg { + name: "args_0" + type: DT_INT32 + } + output_arg { + name: "identity" + type: DT_INT32 + } + } + node_def { + name: "mul/y" + op: "Const" + attr { + key: "dtype" + value { + type: 
DT_INT32 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT32 + tensor_shape { + } + int_val: 2 + } + } + } + } + node_def { + name: "mul" + op: "Mul" + input: "args_0" + input: "mul/y:output:0" + attr { + key: "T" + value { + type: DT_INT32 + } + } + } + node_def { + name: "Identity" + op: "Identity" + input: "mul:z:0" + attr { + key: "T" + value { + type: DT_INT32 + } + } + } + ret { + key: "identity" + value: "Identity:output:0" + } + arg_attr { + key: 0 + value { + attr { + key: "_user_specified_name" + value { + s: "args_0" + } + } + } + } + } +} +versions { + producer: 134 + min_consumer: 12 +} + diff --git a/tensorflow/compiler/mlir/tensorflow/tests/isolate-placer.mlir b/tensorflow/compiler/mlir/tensorflow/tests/isolate-placer.mlir index 83cfbbac4ab..1f4f03466f1 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/isolate-placer.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/isolate-placer.mlir @@ -1,5 +1,6 @@ -// RUN: tf-opt %s --run-tf-graph-optimization --graph-passes=IsolatePlacerInspectionRequiredOpsPass | FileCheck %s +// RUN: tf-opt %s --run-tf-graph-optimization --graph-passes=IsolatePlacerInspectionRequiredOpsPass | FileCheck %s +module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 130 : i32}} { func @main() { tf_executor.graph { %0:2 = tf_executor.island wraps "tf.VarHandleOp"() {container = "c", shared_name = "n"} : () -> tensor>> @@ -15,6 +16,7 @@ func @foo(%arg0: tensor) -> tensor { } return %graph : tensor } +} // The IsolatePlacerInspectionRequiredOpsPass adds Identities for each input/output of function-calling ops. diff --git a/tensorflow/compiler/mlir/tensorflow/tests/legalize_hlo.mlir b/tensorflow/compiler/mlir/tensorflow/tests/legalize_hlo.mlir index 0195b1b0d3e..7691a6bd6e8 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/legalize_hlo.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/legalize_hlo.mlir @@ -2,17 +2,17 @@ func @biasAdd_NHWC(%arg0: tensor<1x32x10x32xi32>, %arg1: tensor<32xi32>) -> tensor<1x32x10x32xi32> { - %0 = "xla_hlo.add"(%arg0, %arg1) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<1x32x10x32xi32>, tensor<32xi32>) -> tensor<1x32x10x32xi32> + %0 = "xla_chlo.broadcast_add"(%arg0, %arg1) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<1x32x10x32xi32>, tensor<32xi32>) -> tensor<1x32x10x32xi32> return %0 : tensor<1x32x10x32xi32> } func @biasAdd_NCHW(%arg0: tensor<1x32x10x32xi32>, %arg1: tensor<32xi32>) -> tensor<1x32x10x32xi32> { - %0 = "xla_hlo.add"(%arg0, %arg1) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<1x32x10x32xi32>, tensor<32xi32>) -> tensor<1x32x10x32xi32> + %0 = "xla_chlo.broadcast_add"(%arg0, %arg1) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<1x32x10x32xi32>, tensor<32xi32>) -> tensor<1x32x10x32xi32> return %0 : tensor<1x32x10x32xi32> } func @biasAdd_dynamic(%arg0: tensor, %arg1: tensor) -> tensor { - %0 = "xla_hlo.add"(%arg0, %arg1) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor, tensor) -> tensor + %0 = "xla_chlo.broadcast_add"(%arg0, %arg1) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor, tensor) -> tensor return %0 : tensor } @@ -23,12 +23,12 @@ func @add(%arg0: tensor<2xi32>) -> tensor<2xi32> { } func @broadcast_add(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi32> { - %0 = "xla_hlo.add"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> + %0 = "xla_chlo.broadcast_add"(%arg0, %arg1) {broadcast_dimensions 
= dense<1> : tensor<1xi64>} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> return %0 : tensor<1x2xi32> } func @broadcast_multi_dim_add(%arg0: tensor<4x1x1xi32>, %arg1: tensor<4x4x4x4xi32>) -> tensor<4x4x4x4xi32> { - %0 = "xla_hlo.add"(%arg0, %arg1) {broadcast_dimensions = dense<[1, 2, 3]> : tensor<3xi64>} : (tensor<4x1x1xi32>, tensor<4x4x4x4xi32>) -> tensor<4x4x4x4xi32> + %0 = "xla_chlo.broadcast_add"(%arg0, %arg1) {broadcast_dimensions = dense<[1, 2, 3]> : tensor<3xi64>} : (tensor<4x1x1xi32>, tensor<4x4x4x4xi32>) -> tensor<4x4x4x4xi32> return %0 : tensor<4x4x4x4xi32> } @@ -38,7 +38,7 @@ func @div(%arg0: tensor<2xi32>) -> tensor<2xi32> { } func @broadcast_div(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi32> { - %0 = "xla_hlo.divide"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> + %0 = "xla_chlo.broadcast_divide"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> return %0 : tensor<1x2xi32> } @@ -48,7 +48,7 @@ func @shift_left(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi32> { } func @div_dynamic(%arg0: tensor, %arg1: tensor) -> tensor { - %0 = "xla_hlo.divide"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor) -> tensor + %0 = "xla_chlo.broadcast_divide"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor) -> tensor return %0 : tensor } @@ -68,7 +68,7 @@ func @mul(%arg0: tensor<2xi32>) -> tensor<2xi32> { } func @broadcast_mul(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi32> { - %0 = "xla_hlo.multiply"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> + %0 = "xla_chlo.broadcast_multiply"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> return %0 : tensor<1x2xi32> } @@ -78,7 +78,7 @@ func @real_div(%arg0: tensor<2xi32>) -> tensor<2xi32> { } func @broadcast_real_div(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi32> { - %0 = "xla_hlo.divide"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> + %0 = "xla_chlo.broadcast_divide"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> return %0 : tensor<1x2xi32> } @@ -88,7 +88,7 @@ func @sub(%arg0: tensor<2xi32>) -> tensor<2xi32> { } func @broadcast_sub(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi32> { - %0 = "xla_hlo.subtract"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> + %0 = "xla_chlo.broadcast_subtract"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> return %0 : tensor<1x2xi32> } @@ -98,7 +98,7 @@ func @shift_right(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi32> { } func @broadcast_shift_right(%arg0: tensor<4xi32>, %arg1: tensor<2x4xi32>) -> tensor<2x4xi32> { - %0 = "xla_hlo.shift_right_arithmetic"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<4xi32>, tensor<2x4xi32>) -> tensor<2x4xi32> + %0 = "xla_chlo.broadcast_shift_right_arithmetic"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<4xi32>, tensor<2x4xi32>) -> tensor<2x4xi32> return %0 : tensor<2x4xi32> } @@ -108,12 +108,12 @@ func @and(%arg0: 
tensor<2xi1>) -> tensor<2xi1> { } func @and_broadcast(%arg0: tensor<1xi1>, %arg1: tensor<1x2xi1>) -> tensor<1x2xi1> { - %0 = "xla_hlo.and"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1xi1>, tensor<1x2xi1>) -> tensor<1x2xi1> + %0 = "xla_chlo.broadcast_and"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1xi1>, tensor<1x2xi1>) -> tensor<1x2xi1> return %0 : tensor<1x2xi1> } func @and_dynamic(%arg0: tensor, %arg1: tensor<1xi1>) -> tensor { - %0 = "xla_hlo.and"(%arg0, %arg1) : (tensor, tensor<1xi1>) -> tensor + %0 = "xla_chlo.broadcast_and"(%arg0, %arg1) : (tensor, tensor<1xi1>) -> tensor return %0 : tensor } @@ -123,12 +123,12 @@ func @or(%arg0: tensor<2xi1>) -> tensor<2xi1> { } func @or_broadcast(%arg0: tensor<1xi1>, %arg1: tensor<1x2xi1>) -> tensor<1x2xi1> { - %0 = "xla_hlo.or"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1xi1>, tensor<1x2xi1>) -> tensor<1x2xi1> + %0 = "xla_chlo.broadcast_or"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1xi1>, tensor<1x2xi1>) -> tensor<1x2xi1> return %0 : tensor<1x2xi1> } func @or_dynamic(%arg0: tensor, %arg1: tensor<1xi1>) -> tensor { - %0 = "xla_hlo.or"(%arg0, %arg1) : (tensor, tensor<1xi1>) -> tensor + %0 = "xla_chlo.broadcast_or"(%arg0, %arg1) : (tensor, tensor<1xi1>) -> tensor return %0 : tensor } @@ -138,12 +138,12 @@ func @bitwise_or(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi32> { } func @bitwise_or_broadcast(%arg0: tensor<1xi8>, %arg1: tensor<1x4xi8>) -> tensor<1x4xi8> { - %0 = "xla_hlo.or"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1xi8>, tensor<1x4xi8>) -> tensor<1x4xi8> + %0 = "xla_chlo.broadcast_or"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1xi8>, tensor<1x4xi8>) -> tensor<1x4xi8> return %0 : tensor<1x4xi8> } func @bitwise_or_dynamic(%arg0: tensor, %arg1: tensor<1xi32>) -> tensor { - %0 = "xla_hlo.or"(%arg0, %arg1) : (tensor, tensor<1xi32>) -> tensor + %0 = "xla_chlo.broadcast_or"(%arg0, %arg1) : (tensor, tensor<1xi32>) -> tensor return %0 : tensor } @@ -153,12 +153,12 @@ func @bitwise_and(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi32> { } func @bitwise_and_broadcast(%arg0: tensor<1xi8>, %arg1: tensor<1x4xi8>) -> tensor<1x4xi8> { - %0 = "xla_hlo.and"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1xi8>, tensor<1x4xi8>) -> tensor<1x4xi8> + %0 = "xla_chlo.broadcast_and"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1xi8>, tensor<1x4xi8>) -> tensor<1x4xi8> return %0 : tensor<1x4xi8> } func @bitwise_and_dynamic(%arg0: tensor, %arg1: tensor<1xi32>) -> tensor { - %0 = "xla_hlo.and"(%arg0, %arg1) : (tensor, tensor<1xi32>) -> tensor + %0 = "xla_chlo.broadcast_and"(%arg0, %arg1) : (tensor, tensor<1xi32>) -> tensor return %0 : tensor } @@ -174,19 +174,19 @@ func @pow_dynamic(%arg0: tensor) -> tensor { func @floordiv_broadcast_i32(%arg0: tensor<2x3xi32>, %arg1: tensor<3xi32>) -> tensor<2x3xi32> { %0 = xla_hlo.constant dense<0> : tensor<2x3xi32> - %1 = "xla_hlo.compare"(%arg0, %0) {comparison_direction = "LT"} : (tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi1> + %1 = "xla_chlo.broadcast_compare"(%arg0, %0) {comparison_direction = "LT"} : (tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi1> %2 = xla_hlo.constant dense<0> : tensor<3xi32> - %3 = "xla_hlo.compare"(%arg1, %2) {comparison_direction = "LT"} : (tensor<3xi32>, tensor<3xi32>) -> tensor<3xi1> - %4 = "xla_hlo.compare"(%1, %3) 
{broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "EQ"} : (tensor<2x3xi1>, tensor<3xi1>) -> tensor<2x3xi1> - %5 = "xla_hlo.divide"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<2x3xi32>, tensor<3xi32>) -> tensor<2x3xi32> + %3 = "xla_chlo.broadcast_compare"(%arg1, %2) {comparison_direction = "LT"} : (tensor<3xi32>, tensor<3xi32>) -> tensor<3xi1> + %4 = "xla_chlo.broadcast_compare"(%1, %3) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "EQ"} : (tensor<2x3xi1>, tensor<3xi1>) -> tensor<2x3xi1> + %5 = "xla_chlo.broadcast_divide"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<2x3xi32>, tensor<3xi32>) -> tensor<2x3xi32> %6 = "xla_hlo.abs"(%arg0) : (tensor<2x3xi32>) -> tensor<2x3xi32> %7 = "xla_hlo.abs"(%arg1) : (tensor<3xi32>) -> tensor<3xi32> %8 = xla_hlo.constant dense<1> : tensor<3xi32> %9 = xla_hlo.subtract %7, %8 : tensor<3xi32> - %10 = "xla_hlo.add"(%6, %9) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<2x3xi32>, tensor<3xi32>) -> tensor<2x3xi32> + %10 = "xla_chlo.broadcast_add"(%6, %9) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<2x3xi32>, tensor<3xi32>) -> tensor<2x3xi32> %11 = "xla_hlo.negate"(%10) : (tensor<2x3xi32>) -> tensor<2x3xi32> %12 = "xla_hlo.abs"(%arg1) : (tensor<3xi32>) -> tensor<3xi32> - %13 = "xla_hlo.divide"(%11, %12) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<2x3xi32>, tensor<3xi32>) -> tensor<2x3xi32> + %13 = "xla_chlo.broadcast_divide"(%11, %12) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<2x3xi32>, tensor<3xi32>) -> tensor<2x3xi32> %14 = "xla_hlo.select"(%4, %5, %13) : (tensor<2x3xi1>, tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> return %14 : tensor<2x3xi32> } @@ -195,14 +195,14 @@ func @floordiv_reverse_broadcast_i32(%arg0: tensor<3xi32>, %arg1: tensor<2x3xi32 %0 = xla_hlo.constant dense<0> : tensor<3xi32> %1 = "xla_hlo.compare"(%arg0, %0) {comparison_direction = "LT"} : (tensor<3xi32>, tensor<3xi32>) -> tensor<3xi1> %2 = xla_hlo.constant dense<0> : tensor<2x3xi32> - %3 = "xla_hlo.compare"(%arg1, %2) {comparison_direction = "LT"} : (tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi1> - %4 = "xla_hlo.compare"(%1, %3) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "EQ"} : (tensor<3xi1>, tensor<2x3xi1>) -> tensor<2x3xi1> - %5 = "xla_hlo.divide"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> + %3 = "xla_chlo.broadcast_compare"(%arg1, %2) {comparison_direction = "LT"} : (tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi1> + %4 = "xla_chlo.broadcast_compare"(%1, %3) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "EQ"} : (tensor<3xi1>, tensor<2x3xi1>) -> tensor<2x3xi1> + %5 = "xla_chlo.broadcast_divide"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> %6 = "xla_hlo.abs"(%arg0) : (tensor<3xi32>) -> tensor<3xi32> %7 = "xla_hlo.abs"(%arg1) : (tensor<2x3xi32>) -> tensor<2x3xi32> %8 = xla_hlo.constant dense<1> : tensor<2x3xi32> %9 = xla_hlo.subtract %7, %8 : tensor<2x3xi32> - %10 = "xla_hlo.add"(%6, %9) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> + %10 = "xla_chlo.broadcast_add"(%6, %9) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> %11 = "xla_hlo.negate"(%10) : (tensor<2x3xi32>) -> tensor<2x3xi32> %12 = 
"xla_hlo.abs"(%arg1) : (tensor<2x3xi32>) -> tensor<2x3xi32> %13 = xla_hlo.divide %11, %12 : tensor<2x3xi32> @@ -218,8 +218,8 @@ func @floordiv_f32(%arg0: tensor<2xf32>) -> tensor<2xf32> { } func @floordiv_f16_broadcast(%arg0: tensor<2x3xf16>, %arg1: tensor<3xf16>) -> tensor<2x3xf16> { - %0 = "xla_hlo.divide"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<2x3xf16>, tensor<3xf16>) -> tensor<2x3xf16> - %1 = "xla_hlo.divide"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<2x3xf16>, tensor<3xf16>) -> tensor<2x3xf16> + %0 = "xla_chlo.broadcast_divide"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<2x3xf16>, tensor<3xf16>) -> tensor<2x3xf16> + %1 = "xla_chlo.broadcast_divide"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<2x3xf16>, tensor<3xf16>) -> tensor<2x3xf16> %2 = "xla_hlo.floor"(%1) : (tensor<2x3xf16>) -> tensor<2x3xf16> return %2 : tensor<2x3xf16> } @@ -230,22 +230,22 @@ func @equal(%arg0: tensor<2xi32>) -> tensor<2xi1> { } func @equal_dynamic(%arg0: tensor, %arg1: tensor<1xi32>) -> tensor { - %0 = "xla_hlo.compare"(%arg0, %arg1) {comparison_direction = "EQ"} : (tensor, tensor<1xi32>) -> tensor + %0 = "xla_chlo.broadcast_compare"(%arg0, %arg1) {comparison_direction = "EQ"} : (tensor, tensor<1xi32>) -> tensor return %0 : tensor } func @equal_broadcast(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { - %0 = "xla_hlo.compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "EQ"} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> + %0 = "xla_chlo.broadcast_compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "EQ"} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> return %0 : tensor<1x2xi1> } func @equal_broadcast_no_incompatible_shapes_error(%arg0: tensor<2xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { - %0 = "xla_hlo.compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "EQ"} : (tensor<2xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> + %0 = "xla_chlo.broadcast_compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "EQ"} : (tensor<2xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> return %0 : tensor<1x2xi1> } func @equal_incompatible_shape_broadcastable(%arg0: tensor, %arg1: tensor<1xi32>) -> tensor { - %0 = "xla_hlo.compare"(%arg0, %arg1) {comparison_direction = "EQ"} : (tensor, tensor<1xi32>) -> tensor + %0 = "xla_chlo.broadcast_compare"(%arg0, %arg1) {comparison_direction = "EQ"} : (tensor, tensor<1xi32>) -> tensor return %0 : tensor } @@ -255,17 +255,17 @@ func @notequal(%arg0: tensor<2xi32>) -> tensor<2xi1> { } func @notequal_broadcast(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { - %0 = "xla_hlo.compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "NE"} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> + %0 = "xla_chlo.broadcast_compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "NE"} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> return %0 : tensor<1x2xi1> } func @notequal_broadcast_no_incompatible_shapes_error(%arg0: tensor<2xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { - %0 = "xla_hlo.compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "NE"} : (tensor<2xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> + %0 = "xla_chlo.broadcast_compare"(%arg0, %arg1) 
{broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "NE"} : (tensor<2xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> return %0 : tensor<1x2xi1> } func @notequal_incompatible_shape_broadcastable(%arg0: tensor, %arg1: tensor<1xi32>) -> tensor { - %0 = "xla_hlo.compare"(%arg0, %arg1) {comparison_direction = "NE"} : (tensor, tensor<1xi32>) -> tensor + %0 = "xla_chlo.broadcast_compare"(%arg0, %arg1) {comparison_direction = "NE"} : (tensor, tensor<1xi32>) -> tensor return %0 : tensor } @@ -275,7 +275,7 @@ func @greater(%arg0: tensor<2xi32>) -> tensor<2xi1> { } func @broadcast_greater(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { - %0 = "xla_hlo.compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "GT"} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> + %0 = "xla_chlo.broadcast_compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "GT"} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> return %0 : tensor<1x2xi1> } @@ -285,7 +285,7 @@ func @greater_equal(%arg0: tensor<2xi32>) -> tensor<2xi1> { } func @broadcast_greater_equal(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { - %0 = "xla_hlo.compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "GE"} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> + %0 = "xla_chlo.broadcast_compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "GE"} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> return %0 : tensor<1x2xi1> } @@ -295,7 +295,7 @@ func @less(%arg0: tensor<2xi32>) -> tensor<2xi1> { } func @broadcast_less(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { - %0 = "xla_hlo.compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "LT"} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> + %0 = "xla_chlo.broadcast_compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "LT"} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> return %0 : tensor<1x2xi1> } @@ -305,7 +305,7 @@ func @less_equal(%arg0: tensor<2xi32>) -> tensor<2xi1> { } func @broadcast_less_equal(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { - %0 = "xla_hlo.compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "LE"} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> + %0 = "xla_chlo.broadcast_compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "LE"} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> return %0 : tensor<1x2xi1> } @@ -324,42 +324,37 @@ func @const() -> tensor<2xi32> { return %0 : tensor<2xi32> } -func @const_dynamic_output() -> tensor<*xi32> { - %0 = xla_hlo.constant {value = dense<0> : tensor<2xi32>} : tensor<*xi32> - return %0 : tensor<*xi32> -} - func @relu(%arg0: tensor<1xi32>) -> tensor<1xi32> { %0 = xla_hlo.constant dense<0> : tensor - %1 = "xla_hlo.maximum"(%0, %arg0) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor, tensor<1xi32>) -> tensor<1xi32> + %1 = "xla_chlo.broadcast_maximum"(%0, %arg0) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor, tensor<1xi32>) -> tensor<1xi32> return %1 : tensor<1xi32> } func @relu_unranked(%arg0: tensor) -> tensor { %0 = xla_hlo.constant dense<0> : tensor - %1 = "xla_hlo.maximum"(%0, %arg0) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor, tensor) -> tensor + %1 = 
"xla_chlo.broadcast_maximum"(%0, %arg0) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor, tensor) -> tensor return %1 : tensor } func @relu6(%arg0: tensor<1xi32>) -> tensor<1xi32> { %0 = xla_hlo.constant dense<0> : tensor %1 = xla_hlo.constant dense<6> : tensor - %2 = "xla_hlo.minimum"(%arg0, %1) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor<1xi32>, tensor) -> tensor<1xi32> - %3 = "xla_hlo.maximum"(%2, %0) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor<1xi32>, tensor) -> tensor<1xi32> + %2 = "xla_chlo.broadcast_minimum"(%arg0, %1) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor<1xi32>, tensor) -> tensor<1xi32> + %3 = "xla_chlo.broadcast_maximum"(%2, %0) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor<1xi32>, tensor) -> tensor<1xi32> return %3 : tensor<1xi32> } func @relu6_unranked(%arg0: tensor) -> tensor { %0 = xla_hlo.constant dense<0> : tensor %1 = xla_hlo.constant dense<6> : tensor - %2 = "xla_hlo.minimum"(%arg0, %1) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor, tensor) -> tensor - %3 = "xla_hlo.maximum"(%2, %0) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor, tensor) -> tensor + %2 = "xla_chlo.broadcast_minimum"(%arg0, %1) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor, tensor) -> tensor + %3 = "xla_chlo.broadcast_maximum"(%2, %0) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor, tensor) -> tensor return %3 : tensor } func @relu_grad(%arg0: tensor<4x8xf32>, %arg1: tensor) -> tensor<4x8xf32> { %0 = xla_hlo.constant dense<0.000000e+00> : tensor - %1 = "xla_hlo.compare"(%arg1, %0) {broadcast_dimensions = dense<[]> : tensor<0xi64>, comparison_direction = "GT"} : (tensor, tensor) -> tensor + %1 = "xla_chlo.broadcast_compare"(%arg1, %0) {broadcast_dimensions = dense<[]> : tensor<0xi64>, comparison_direction = "GT"} : (tensor, tensor) -> tensor %2 = xla_hlo.constant dense<0.000000e+00> : tensor<4x8xf32> %3 = "xla_hlo.select"(%1, %arg0, %2) : (tensor, tensor<4x8xf32>, tensor<4x8xf32>) -> tensor<4x8xf32> return %3 : tensor<4x8xf32> @@ -682,6 +677,11 @@ func @complex(%arg0: tensor<3xf32>, %arg1: tensor<3xf32>) -> tensor<3xcomplex> } +func @convert_i32_f32(%arg0: tensor<2xi32>) -> tensor<2xf32> { + %0 = "xla_hlo.convert"(%arg0) : (tensor<2xi32>) -> tensor<2xf32> + return %0 : tensor<2xf32> +} + // NOTE: Assertions have been autogenerated by utils/generate-test-checks.py // CHECK-LABEL: func @biasAdd_NHWC( @@ -723,13 +723,13 @@ func @complex(%arg0: tensor<3xf32>, %arg1: tensor<3xf32>) -> tensor<3xcomplex) -> tensor<2xi32> { -// CHECK: [[VAL_19:%.*]] = "tf.RealDiv"([[VAL_18]], [[VAL_18]]) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> +// CHECK: [[VAL_19:%.*]] = "tf.Div"([[VAL_18]], [[VAL_18]]) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> // CHECK: return [[VAL_19]] : tensor<2xi32> // CHECK: } // CHECK-LABEL: func @broadcast_div( // CHECK-SAME: [[VAL_20:%.*]]: tensor<1xi32>, [[VAL_21:%.*]]: tensor<1x2xi32>) -> tensor<1x2xi32> { -// CHECK: [[VAL_22:%.*]] = "tf.RealDiv"([[VAL_20]], [[VAL_21]]) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> +// CHECK: [[VAL_22:%.*]] = "tf.Div"([[VAL_20]], [[VAL_21]]) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> // CHECK: return [[VAL_22]] : tensor<1x2xi32> // CHECK: } @@ -741,7 +741,7 @@ func @complex(%arg0: tensor<3xf32>, %arg1: tensor<3xf32>) -> tensor<3xcomplex, [[VAL_27:%.*]]: tensor) -> tensor { -// CHECK: [[VAL_28:%.*]] = "tf.RealDiv"([[VAL_26]], [[VAL_27]]) : (tensor, tensor) -> tensor +// CHECK: [[VAL_28:%.*]] = 
"tf.Div"([[VAL_26]], [[VAL_27]]) : (tensor, tensor) -> tensor // CHECK: return [[VAL_28]] : tensor // CHECK: } @@ -771,13 +771,13 @@ func @complex(%arg0: tensor<3xf32>, %arg1: tensor<3xf32>) -> tensor<3xcomplex) -> tensor<2xi32> { -// CHECK: [[VAL_41:%.*]] = "tf.RealDiv"([[VAL_40]], [[VAL_40]]) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> +// CHECK: [[VAL_41:%.*]] = "tf.Div"([[VAL_40]], [[VAL_40]]) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> // CHECK: return [[VAL_41]] : tensor<2xi32> // CHECK: } // CHECK-LABEL: func @broadcast_real_div( // CHECK-SAME: [[VAL_42:%.*]]: tensor<1xi32>, [[VAL_43:%.*]]: tensor<1x2xi32>) -> tensor<1x2xi32> { -// CHECK: [[VAL_44:%.*]] = "tf.RealDiv"([[VAL_42]], [[VAL_43]]) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> +// CHECK: [[VAL_44:%.*]] = "tf.Div"([[VAL_42]], [[VAL_43]]) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> // CHECK: return [[VAL_44]] : tensor<1x2xi32> // CHECK: } @@ -896,7 +896,7 @@ func @complex(%arg0: tensor<3xf32>, %arg1: tensor<3xf32>) -> tensor<3xcomplex : tensor<3xi32>} : () -> tensor<3xi32> // CHECK: [[VAL_99:%.*]] = "tf.Less"([[VAL_95]], [[VAL_98]]) : (tensor<3xi32>, tensor<3xi32>) -> tensor<3xi1> // CHECK: [[VAL_100:%.*]] = "tf.Equal"([[VAL_97]], [[VAL_99]]) {incompatible_shape_error = true} : (tensor<2x3xi1>, tensor<3xi1>) -> tensor<2x3xi1> -// CHECK: [[VAL_101:%.*]] = "tf.RealDiv"([[VAL_94]], [[VAL_95]]) : (tensor<2x3xi32>, tensor<3xi32>) -> tensor<2x3xi32> +// CHECK: [[VAL_101:%.*]] = "tf.Div"([[VAL_94]], [[VAL_95]]) : (tensor<2x3xi32>, tensor<3xi32>) -> tensor<2x3xi32> // CHECK: [[VAL_102:%.*]] = "tf.Abs"([[VAL_94]]) : (tensor<2x3xi32>) -> tensor<2x3xi32> // CHECK: [[VAL_103:%.*]] = "tf.Abs"([[VAL_95]]) : (tensor<3xi32>) -> tensor<3xi32> // CHECK: [[VAL_104:%.*]] = "tf.Const"() {value = dense<1> : tensor<3xi32>} : () -> tensor<3xi32> @@ -904,7 +904,7 @@ func @complex(%arg0: tensor<3xf32>, %arg1: tensor<3xf32>) -> tensor<3xcomplex, tensor<3xi32>) -> tensor<2x3xi32> // CHECK: [[VAL_107:%.*]] = "tf.Neg"([[VAL_106]]) : (tensor<2x3xi32>) -> tensor<2x3xi32> // CHECK: [[VAL_108:%.*]] = "tf.Abs"([[VAL_95]]) : (tensor<3xi32>) -> tensor<3xi32> -// CHECK: [[VAL_109:%.*]] = "tf.RealDiv"([[VAL_107]], [[VAL_108]]) : (tensor<2x3xi32>, tensor<3xi32>) -> tensor<2x3xi32> +// CHECK: [[VAL_109:%.*]] = "tf.Div"([[VAL_107]], [[VAL_108]]) : (tensor<2x3xi32>, tensor<3xi32>) -> tensor<2x3xi32> // CHECK: [[VAL_110:%.*]] = "tf.Select"([[VAL_100]], [[VAL_101]], [[VAL_109]]) : (tensor<2x3xi1>, tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> // CHECK: return [[VAL_110]] : tensor<2x3xi32> // CHECK: } @@ -916,7 +916,7 @@ func @complex(%arg0: tensor<3xf32>, %arg1: tensor<3xf32>) -> tensor<3xcomplex : tensor<2x3xi32>} : () -> tensor<2x3xi32> // CHECK: [[VAL_116:%.*]] = "tf.Less"([[VAL_112]], [[VAL_115]]) : (tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi1> // CHECK: [[VAL_117:%.*]] = "tf.Equal"([[VAL_114]], [[VAL_116]]) {incompatible_shape_error = true} : (tensor<3xi1>, tensor<2x3xi1>) -> tensor<2x3xi1> -// CHECK: [[VAL_118:%.*]] = "tf.RealDiv"([[VAL_111]], [[VAL_112]]) : (tensor<3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> +// CHECK: [[VAL_118:%.*]] = "tf.Div"([[VAL_111]], [[VAL_112]]) : (tensor<3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> // CHECK: [[VAL_119:%.*]] = "tf.Abs"([[VAL_111]]) : (tensor<3xi32>) -> tensor<3xi32> // CHECK: [[VAL_120:%.*]] = "tf.Abs"([[VAL_112]]) : (tensor<2x3xi32>) -> tensor<2x3xi32> // CHECK: [[VAL_121:%.*]] = "tf.Const"() {value = dense<1> : tensor<2x3xi32>} : () -> tensor<2x3xi32> @@ -924,23 +924,23 @@ 
func @complex(%arg0: tensor<3xf32>, %arg1: tensor<3xf32>) -> tensor<3xcomplex, tensor<2x3xi32>) -> tensor<2x3xi32> // CHECK: [[VAL_124:%.*]] = "tf.Neg"([[VAL_123]]) : (tensor<2x3xi32>) -> tensor<2x3xi32> // CHECK: [[VAL_125:%.*]] = "tf.Abs"([[VAL_112]]) : (tensor<2x3xi32>) -> tensor<2x3xi32> -// CHECK: [[VAL_126:%.*]] = "tf.RealDiv"([[VAL_124]], [[VAL_125]]) : (tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> +// CHECK: [[VAL_126:%.*]] = "tf.Div"([[VAL_124]], [[VAL_125]]) : (tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> // CHECK: [[VAL_127:%.*]] = "tf.Select"([[VAL_117]], [[VAL_118]], [[VAL_126]]) : (tensor<2x3xi1>, tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> // CHECK: return [[VAL_127]] : tensor<2x3xi32> // CHECK: } // CHECK-LABEL: func @floordiv_f32( // CHECK-SAME: [[VAL_128:%.*]]: tensor<2xf32>) -> tensor<2xf32> { -// CHECK: [[VAL_129:%.*]] = "tf.RealDiv"([[VAL_128]], [[VAL_128]]) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> -// CHECK: [[VAL_130:%.*]] = "tf.RealDiv"([[VAL_128]], [[VAL_128]]) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> +// CHECK: [[VAL_129:%.*]] = "tf.Div"([[VAL_128]], [[VAL_128]]) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> +// CHECK: [[VAL_130:%.*]] = "tf.Div"([[VAL_128]], [[VAL_128]]) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> // CHECK: [[VAL_131:%.*]] = "tf.FloorDiv"([[VAL_128]], [[VAL_128]]) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> // CHECK: return [[VAL_131]] : tensor<2xf32> // CHECK: } // CHECK-LABEL: func @floordiv_f16_broadcast( // CHECK-SAME: [[VAL_132:%.*]]: tensor<2x3xf16>, [[VAL_133:%.*]]: tensor<3xf16>) -> tensor<2x3xf16> { -// CHECK: [[VAL_134:%.*]] = "tf.RealDiv"([[VAL_132]], [[VAL_133]]) : (tensor<2x3xf16>, tensor<3xf16>) -> tensor<2x3xf16> -// CHECK: [[VAL_135:%.*]] = "tf.RealDiv"([[VAL_132]], [[VAL_133]]) : (tensor<2x3xf16>, tensor<3xf16>) -> tensor<2x3xf16> +// CHECK: [[VAL_134:%.*]] = "tf.Div"([[VAL_132]], [[VAL_133]]) : (tensor<2x3xf16>, tensor<3xf16>) -> tensor<2x3xf16> +// CHECK: [[VAL_135:%.*]] = "tf.Div"([[VAL_132]], [[VAL_133]]) : (tensor<2x3xf16>, tensor<3xf16>) -> tensor<2x3xf16> // CHECK: [[VAL_136:%.*]] = "tf.FloorDiv"([[VAL_132]], [[VAL_133]]) : (tensor<2x3xf16>, tensor<3xf16>) -> tensor<2x3xf16> // CHECK: return [[VAL_136]] : tensor<2x3xf16> // CHECK: } @@ -1066,11 +1066,6 @@ func @complex(%arg0: tensor<3xf32>, %arg1: tensor<3xf32>) -> tensor<3xcomplex // CHECK: } -// CHECK-LABEL: func @const_dynamic_output() -> tensor<*xi32> { -// CHECK: [[VAL_191:%.*]] = "tf.Const"() {value = dense<0> : tensor<2xi32>} : () -> tensor<*xi32> -// CHECK: return [[VAL_191]] : tensor<*xi32> -// CHECK: } - // CHECK-LABEL: func @relu( // CHECK-SAME: [[VAL_192:%.*]]: tensor<1xi32>) -> tensor<1xi32> { // CHECK: [[VAL_193:%.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor @@ -1493,3 +1488,8 @@ func @complex(%arg0: tensor<3xf32>, %arg1: tensor<3xf32>) -> tensor<3xcomplex> // CHECK: } +// CHECK-LABEL: func @convert_i32_f32( +// CHECK-SAME: [[VAL_370:%.*]]: tensor<2xi32>) -> tensor<2xf32> { +// CHECK: [[VAL_371:%.*]] = "tf.Cast"([[VAL_370]]) {Truncate = false} : (tensor<2xi32>) -> tensor<2xf32> +// CHECK: return [[VAL_371]] : tensor<2xf32> +// CHECK: } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/lower_tf.mlir b/tensorflow/compiler/mlir/tensorflow/tests/lower_tf.mlir index c5f87c602a3..ce3416141da 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/lower_tf.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/lower_tf.mlir @@ -3,8 +3,8 @@ // CHECK-LABEL: invert_permutation func 
@invert_permutation(%arg0: tensor<5xi32>) -> tensor<5xi32> { // CHECK-NEXT: %[[UPDATES:.*]] = "tf.Const"() {value = dense<[0, 1, 2, 3, 4]> : tensor<5xi32>} : () -> tensor<5xi32> - // CHECK-NEXT: %[[PERM:.*]] = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi32>} : () -> tensor<2xi32> - // CHECK-NEXT: %[[INDICES:.*]] = "tf.Transpose"(%arg0, %[[PERM]]) : (tensor<5xi32>, tensor<2xi32>) -> tensor<5x1xi32> + // CHECK-NEXT: %[[SHAPE:.*]] = "tf.Const"() {value = dense<[5, 1]> : tensor<2xi32>} : () -> tensor<2xi32> + // CHECK-NEXT: %[[INDICES:.*]] = "tf.Reshape"(%arg0, %[[SHAPE]]) : (tensor<5xi32>, tensor<2xi32>) -> tensor<5x1xi32> // CHECK-NEXT: "tf.TensorScatterUpdate"(%arg0, %[[INDICES]], %[[UPDATES]]) : (tensor<5xi32>, tensor<5x1xi32>, tensor<5xi32>) -> tensor<5xi32> %0 = "tf.InvertPermutation"(%arg0) : (tensor<5xi32>) -> tensor<5xi32> return %0 : tensor<5xi32> @@ -455,3 +455,12 @@ func @Reciprocal(%arg0: tensor<*xf32>) -> tensor<*xf32> { %0 = "tf.Reciprocal"(%arg0) : (tensor<*xf32>) -> tensor<*xf32> return %0 : tensor<*xf32> } + +func @ScatterNd(%arg0: tensor<4x1xi32>, %arg1: tensor<4xf32>) -> tensor<8xf32> { + // CHECK: %[[ZERO:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<8xf32>} : () -> tensor<8xf32> + // CHECK: "tf.TensorScatterUpdate"(%[[ZERO]], %arg0, %arg1) : (tensor<8xf32>, tensor<4x1xi32>, tensor<4xf32>) -> tensor<8xf32> + + %shape = "tf.Const"() {value = dense<[8]> : tensor<1xi32>} : () -> tensor<1xi32> + %0 = "tf.ScatterNd"(%arg0, %arg1, %shape) : (tensor<4x1xi32>, tensor<4xf32>, tensor<1xi32>) -> tensor<8xf32> + return %0 : tensor<8xf32> +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/function-resource-args.mlir b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/function-resource-args.mlir index 515e03ac2d2..680e26f5cbb 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/function-resource-args.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/function-resource-args.mlir @@ -1,9 +1,9 @@ -// RUN: tf-mlir-translate -mlir-to-graphdef %s -o - | FileCheck %s +// RUN: tf-mlir-translate -mlir-to-graphdef %s -o - | FileCheck %s --dump-input-on-failure func @main() -> tensor<*x!tf.resource> attributes {tf.entry_function = {inputs = "", outputs = "func_call"}} { %0 = tf_executor.graph { %outputs, %control = tf_executor.island wraps "tf.VarHandleOp"() {container = "a", device = "/CPU:0", dtype = i64, shape = "tfshape$", shared_name = "x"} : () -> tensor>> loc("x") - %outputs_0, %control_1 = tf_executor.island wraps "tf.LegacyCall"(%outputs, %outputs) {_disable_call_shape_inference = true, f = @test_func_name0} : (tensor>>, tensor>>) -> tensor<*x!tf.resource> + %outputs_0, %control_1 = tf_executor.island wraps "tf.LegacyCall"(%outputs, %outputs) {_disable_call_shape_inference = true, f = @test_func_name0} : (tensor>>, tensor>>) -> tensor<*x!tf.resource> loc("called") tf_executor.fetch %outputs_0 : tensor<*x!tf.resource> } return %0 : tensor<*x!tf.resource> @@ -23,8 +23,7 @@ func @test_func_name0(%arg0: tensor<*x!tf.resource> {tf.resource_arg_unique_id = // CHECK: op: "VarHandleOp" // CHECK: name: "func_call" -// CHECK: input: "x" -// CHECK: input: "x" +// CHECK: input: "called" // CHECK: library // CHECK: function diff --git a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/functional-if-ops.mlir b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/functional-if-ops.mlir index 5134deb7148..18fec33a256 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/functional-if-ops.mlir +++ 
b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/functional-if-ops.mlir @@ -1,31 +1,29 @@ -// RUN: tf-mlir-translate -mlir-to-graphdef %s -o - | FileCheck %s +// RUN: tf-mlir-translate -mlir-to-graphdef %s -o - | FileCheck %s --dump-input-on-failure func @main(%arg0: tensor, %arg1: tensor) -> (tensor, tensor) { - %graph:2 = tf_executor.graph { - %0:2 = tf_executor.island wraps "tf.Placeholder.input"(%arg0) : (tensor) -> tensor - %1:2 = tf_executor.island wraps "tf.Placeholder.input"(%arg1) : (tensor) -> tensor - %2:2 = tf_executor.island wraps "tf.Less"(%0#0, %1#0) : (tensor, tensor) -> tensor - %3:2 = tf_executor.island wraps "tf.If"(%2#0, %0#0, %1#0) {else_branch = @cond_false, then_branch = @cond_true, is_stateless = false} : (tensor, tensor, tensor) -> tensor loc("StatefulIf") - %4:2 = tf_executor.island wraps "tf.If"(%2#0, %0#0, %1#0) {else_branch = @cond_false, then_branch = @cond_true, is_stateless = true} : (tensor, tensor, tensor) -> tensor loc("StatelessIf") - tf_executor.fetch %3#0, %4#0 : tensor, tensor + %0:2 = tf_executor.graph { + %outputs_2, %control_3 = tf_executor.island wraps "tf.Less"(%arg0, %arg1) : (tensor, tensor) -> tensor + %outputs_4, %control_5 = tf_executor.island wraps "tf.If"(%outputs_2, %arg0, %arg1) {else_branch = @cond_false, is_stateless = false, then_branch = @cond_true} : (tensor, tensor, tensor) -> tensor loc("StatefulIf") + %outputs_6, %control_7 = tf_executor.island wraps "tf.If"(%outputs_2, %arg0, %arg1) {else_branch = @cond_false, is_stateless = true, then_branch = @cond_true} : (tensor, tensor, tensor) -> tensor loc("StatelessIf") + tf_executor.fetch %outputs_4, %outputs_6 : tensor, tensor } - return %graph#0, %graph#1 : tensor, tensor + return %0#0, %0#1 : tensor, tensor } func @cond_true(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>) -> tensor<*xf32> { - %graph = tf_executor.graph { - %0:2 = tf_executor.island wraps "tf.Add"(%arg0, %arg1): (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> - tf_executor.fetch %0#0 : tensor<*xf32> + %0 = tf_executor.graph { + %outputs, %control = tf_executor.island wraps "tf.Add"(%arg0, %arg1) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> + tf_executor.fetch %outputs : tensor<*xf32> } - return %graph : tensor<*xf32> + return %0 : tensor<*xf32> } func @cond_false(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>) -> tensor<*xf32> { - %graph = tf_executor.graph { - %0:2 = tf_executor.island wraps "tf.Mul"(%arg0, %arg1) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> - tf_executor.fetch %0#0 : tensor<*xf32> + %0 = tf_executor.graph { + %outputs, %control = tf_executor.island wraps "tf.Mul"(%arg0, %arg1) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> + tf_executor.fetch %outputs : tensor<*xf32> } - return %graph : tensor<*xf32> + return %0 : tensor<*xf32> } // Verify that If op is mapped to TensorFlow StatelessIf op if the is_stateless diff --git a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/functional-while-ops.mlir b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/functional-while-ops.mlir index 403d9541655..9f14a144d9d 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/functional-while-ops.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/functional-while-ops.mlir @@ -1,35 +1,31 @@ // RUN: tf-mlir-translate -mlir-to-graphdef %s -o - | FileCheck %s func @main(%arg0: tensor, %arg1: tensor) -> (tensor, tensor) { - %graph:2 = tf_executor.graph { - %iter:2 = tf_executor.island wraps "tf.Placeholder.input"(%arg0) : (tensor) -> tensor loc("iter") - %val:2 = 
tf_executor.island wraps "tf.Placeholder.input"(%arg1) : (tensor) -> tensor loc("val") - - // Element wise add `val` with itself for `iter` number of times. - %2:3 = tf_executor.island wraps "tf.While"(%iter#0, %val#0) {cond = @cond, body = @body, is_stateless = false} : (tensor, tensor) -> (tensor, tensor) loc("StatefulWhile") - %3:3 = tf_executor.island wraps "tf.While"(%iter#0, %val#0) {cond = @cond, body = @body, is_stateless = true} : (tensor, tensor) -> (tensor, tensor) loc("StatelessWhile") - tf_executor.fetch %2#1, %3#1 : tensor, tensor + %0:2 = tf_executor.graph { + %outputs_2:2, %control_3 = tf_executor.island wraps "tf.While"(%arg0, %arg1) {body = @body, cond = @cond, is_stateless = false} : (tensor, tensor) -> (tensor, tensor) loc("StatefulWhile") + %outputs_4:2, %control_5 = tf_executor.island wraps "tf.While"(%arg0, %arg1) {body = @body, cond = @cond, is_stateless = true} : (tensor, tensor) -> (tensor, tensor) loc("StatelessWhile") + tf_executor.fetch %outputs_2#1, %outputs_4#1 : tensor, tensor } - return %graph#0, %graph#1 : tensor, tensor + return %0#0, %0#1 : tensor, tensor } func @cond(%arg0: tensor<*xi32>, %arg1: tensor<*xf32>) -> tensor { - %graph = tf_executor.graph { - %0:2 = tf_executor.island wraps "tf.Const"() {value = dense<0> : tensor} : () -> tensor loc("Const") - %1:2 = tf_executor.island wraps "tf.Greater"(%arg0, %0#0) : (tensor<*xi32>, tensor) -> tensor - tf_executor.fetch %1#0 : tensor + %0 = tf_executor.graph { + %outputs, %control = tf_executor.island wraps "tf.Const"() {value = dense<0> : tensor} : () -> tensor + %outputs_0, %control_1 = tf_executor.island wraps "tf.Greater"(%arg0, %outputs) : (tensor<*xi32>, tensor) -> tensor + tf_executor.fetch %outputs_0 : tensor } - return %graph : tensor + return %0 : tensor } func @body(%arg0: tensor<*xi32>, %arg1: tensor<*xf32>) -> (tensor<*xi32>, tensor<*xf32>) { - %graph:2 = tf_executor.graph { - %0:2 = tf_executor.island wraps "tf.Const"() {value = dense<1> : tensor} : () -> tensor loc("Const") - %1:2 = tf_executor.island wraps "tf.Sub"(%arg0, %0#0) : (tensor<*xi32>, tensor) -> tensor<*xi32> - %2:2 = tf_executor.island wraps "tf.Add"(%arg1, %arg1) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> - tf_executor.fetch %1#0, %2#0 : tensor<*xi32>, tensor<*xf32> + %0:2 = tf_executor.graph { + %outputs, %control = tf_executor.island wraps "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %outputs_0, %control_1 = tf_executor.island wraps "tf.Sub"(%arg0, %outputs) : (tensor<*xi32>, tensor) -> tensor<*xi32> + %outputs_2, %control_3 = tf_executor.island wraps "tf.Add"(%arg1, %arg1) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> + tf_executor.fetch %outputs_0, %outputs_2 : tensor<*xi32>, tensor<*xf32> } - return %graph#0, %graph#1 : tensor<*xi32>, tensor<*xf32> + return %0#0, %0#1 : tensor<*xi32>, tensor<*xf32> } // Verify that While op is mapped to TensorFlow StatelessWhile op if the diff --git a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/invalid_input.mlir b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/invalid_input.mlir index 41f31858fee..336d83e708b 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/invalid_input.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/invalid_input.mlir @@ -1,43 +1,5 @@ // RUN: not tf-mlir-translate -split-input-file -mlir-to-graphdef %s -o - 2>&1 | FileCheck %s --dump-input=fail -// Tests invalid tf_executor.graph args. 
- -func @main(%arg0: tensor) { - tf_executor.graph { - %0:3 = tf_executor.Merge %arg0, %arg0 : tensor {device = "", N = 2, T = "tfdtype$DT_INT32"} loc("while/Merge") - tf_executor.fetch - } - return -} - -// CHECK: Arg in 'main' should only have one user. - -// ----- - -func @main(%arg0: tensor, %arg1: tensor) { - tf_executor.graph { - %0:3 = tf_executor.Merge %arg0, %arg1 : tensor {device = "", N = 2, T = "tfdtype$DT_INT32"} loc("while/Merge") - tf_executor.fetch - } - return -} - -// CHECK: User of arg in 'main' must be in an inner op of a tf_executor.island. - -// ----- - -func @main(%arg0: tensor) { - tf_executor.graph { - %0:2 = tf_executor.island wraps "tf.Identity"(%arg0) {T = "tfdtype$DT_INT32"} : (tensor) -> tensor - tf_executor.fetch %0#1 : !tf_executor.control - } - return -} - -// CHECK: tf_executor.island of user of arg in 'main' must have no control output users. - -// ----- - // Tests function with multiple blocks. func @main() { diff --git a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/output-shapes-attr.mlir b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/output-shapes-attr.mlir index fb3ee49bbc5..f14115460f2 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/output-shapes-attr.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/output-shapes-attr.mlir @@ -1,52 +1,31 @@ -// RUN: tf-mlir-translate -mlir-to-graphdef %s -o - | FileCheck %s +// RUN: tf-mlir-translate -mlir-to-graphdef %s -o - | FileCheck %s --dump-input-on-failure func @main(%arg0: tensor<10xi32>) -> tensor<10xi32> -attributes {tf.entry_function = {inputs = "input0", outputs = "Placeholder"}} { +attributes {tf.entry_function = {inputs = "input0", outputs = "output0"}} { %graph = tf_executor.graph { - %0:2 = tf_executor.island wraps "tf.Placeholder.input"(%arg0) {device = "", dtype = "tfdtype$DT_INT32", shape = "tfshape$dim { size: 10 }"} : (tensor<10xi32>) -> tensor<10xi32> - tf_executor.fetch %0 : tensor<10xi32> + tf_executor.fetch %arg0 : tensor<10xi32> } return %graph : tensor<10xi32> } // CHECK: node { -// CHECK-NEXT: name: "Placeholder" -// CHECK-NEXT: op: "Placeholder" -// CHECK-NEXT: attr { -// CHECK-NEXT: key: "_output_shapes" -// CHECK-NEXT: value { -// CHECK-NEXT: list { -// CHECK-NEXT: shape { -// CHECK-NEXT: dim { -// CHECK-NEXT: size: 10 -// CHECK-NEXT: } -// CHECK-NEXT: } -// CHECK-NEXT: } -// CHECK-NEXT: } -// CHECK-NEXT: } -// CHECK-NEXT: attr { -// CHECK-NEXT: key: "dtype" +// CHECK-NEXT: name: "input0" +// CHECK-NEXT: op: "_Arg" +// CHECK: key: "T" // CHECK-NEXT: value { // CHECK-NEXT: type: DT_INT32 // CHECK-NEXT: } -// CHECK-NEXT: } -// CHECK-NEXT: attr { -// CHECK-NEXT: key: "shape" +// CHECK: key: "_output_shapes" // CHECK-NEXT: value { -// CHECK-NEXT: shape { -// CHECK-NEXT: dim { -// CHECK-NEXT: size: 10 -// CHECK-NEXT: } -// CHECK-NEXT: } -// CHECK-NEXT: } +// CHECK-NEXT: shape { +// CHECK-NEXT: dim { +// CHECK-NEXT: size: 10 +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: } -// CHECK-NEXT: experimental_debug_info { -// CHECK-NEXT: } -// CHECK-NEXT: } -// CHECK-NEXT: node { -// CHECK-NEXT: name: "main" +// CHECK: name: "output0" // CHECK-NEXT: op: "_Retval" -// CHECK-NEXT: input: "Placeholder" +// CHECK-NEXT: input: "input0" // CHECK-NEXT: attr { // CHECK-NEXT: key: "T" // CHECK-NEXT: value { @@ -59,6 +38,3 @@ attributes {tf.entry_function = {inputs = "input0", outputs = "Placeholder"}} { // CHECK-NEXT: i: 0 // CHECK-NEXT: } // CHECK-NEXT: } -// CHECK-NEXT: } -// CHECK-NEXT: library { -// CHECK-NEXT: } diff --git 
a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/parse_example.mlir b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/parse_example.mlir index 72dd164ea3c..1a2c1446c27 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/parse_example.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/parse_example.mlir @@ -1,21 +1,18 @@ -// RUN: tf-mlir-translate -mlir-to-graphdef %s -o - | FileCheck %s +// RUN: tf-mlir-translate -mlir-to-graphdef %s -o - | FileCheck %s --dump-input-on-failure module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 175 : i32}} { func @main(%arg0: tensor<32x!tf.string>) -> (tensor) attributes {tf.entry_function = {inputs = "input0", outputs = "ParseExample/ParseExampleV2"}} { %0 = tf_executor.graph { - // NOTE(mrry): This dummy input was manually added because the exporter expects it and fails otherwise. - %dummy_input, %control_dummy = tf_executor.island wraps "tf.Placeholder.input"(%arg0) {device = "", dtype = "tfdtype$DT_STRING", shape = "tfshape$dim { size: 32 }"} : (tensor<32x!tf.string>) -> tensor<32x!tf.string> - %outputs, %control = tf_executor.island wraps "tf.Const"() {device = "", dtype = f32, value = dense<[]> : tensor<0xf32>} : () -> tensor<0xf32> %outputs_0, %control_1 = tf_executor.island wraps "tf.Const"() {device = "", dtype = f32, value = dense<[]> : tensor<0xf32>} : () -> tensor<0xf32> - %outputs_2, %control_3 = tf_executor.island wraps "tf.Const"() {device = "", dtype = !tf.string, value = opaque<"tf", "0x746674656E736F722464747970653A2044545F535452494E472074656E736F725F7368617065207B2064696D207B2073697A653A2032207D207D2074656E736F725F636F6E74656E743A20225C3031345C303134666561747572655F6B657931666561747572655F6B65793222"> : tensor<2x!tf.string>} : () -> tensor<2x!tf.string> - %outputs_4, %control_5 = tf_executor.island wraps "tf.Const"() {device = "", dtype = !tf.string, value = opaque<"tf", "0x746674656E736F722464747970653A2044545F535452494E472074656E736F725F7368617065207B2064696D207B207D207D"> : tensor<0x!tf.string>} : () -> tensor<0x!tf.string> - %outputs_6, %control_7 = tf_executor.island wraps "tf.Const"() {device = "", dtype = !tf.string, value = opaque<"tf", "0x746674656E736F722464747970653A2044545F535452494E472074656E736F725F7368617065207B2064696D207B207D207D"> : tensor<0x!tf.string>} : () -> tensor<0x!tf.string> - %outputs_8, %control_9 = tf_executor.island wraps "tf.Const"() {device = "", dtype = !tf.string, value = opaque<"tf", "0x746674656E736F722464747970653A2044545F535452494E472074656E736F725F7368617065207B2064696D207B2073697A653A2032207D207D2074656E736F725F636F6E74656E743A20225C3031345C303134666561747572655F6B657933666561747572655F6B65793422"> : tensor<2x!tf.string>} : () -> tensor<2x!tf.string> + %outputs_2, %control_3 = tf_executor.island wraps "tf.Const"() {device = "", dtype = !tf.string, value = dense<""> : tensor<2x!tf.string>} : () -> tensor<2x!tf.string> + %outputs_4, %control_5 = tf_executor.island wraps "tf.Const"() {device = "", dtype = !tf.string, value = dense<""> : tensor<0x!tf.string>} : () -> tensor<0x!tf.string> + %outputs_6, %control_7 = tf_executor.island wraps "tf.Const"() {device = "", dtype = !tf.string, value = dense<""> : tensor<0x!tf.string>} : () -> tensor<0x!tf.string> + %outputs_8, %control_9 = tf_executor.island wraps "tf.Const"() {device = "", dtype = !tf.string, value = dense<""> : tensor<2x!tf.string>} : () -> tensor<2x!tf.string> - %outputs_10:8, %control_11 = tf_executor.island wraps "tf.ParseExampleV2"(%dummy_input, 
%outputs_4, %outputs_8, %outputs_2, %outputs_6, %outputs, %outputs_0) {Tdense = ["tfdtype$DT_FLOAT", "tfdtype$DT_FLOAT"], dense_shapes = ["tfshape$", "tfshape$"], device = "", name = "ParseExample/ParseExampleV2", num_sparse = 2 : i64, ragged_split_types = [], ragged_value_types = [], result_segment_sizes = dense<[2, 2, 2, 2, 0, 0]> : vector<6xi32>, sparse_types = ["tfdtype$DT_STRING", "tfdtype$DT_INT64"]} : (tensor<32x!tf.string>, tensor<0x!tf.string>, tensor<2x!tf.string>, tensor<2x!tf.string>, tensor<0x!tf.string>, tensor<0xf32>, tensor<0xf32>) -> (tensor, tensor, tensor, tensor, tensor<2xi64>, tensor<2xi64>, tensor<32xf32>, tensor<32xf32>) - // CHECK: name: "ParseExample/ParseExampleV2" + %outputs_10:8, %control_11 = tf_executor.island wraps "tf.ParseExampleV2"(%arg0, %outputs_4, %outputs_8, %outputs_2, %outputs_6, %outputs, %outputs_0) {Tdense = ["tfdtype$DT_FLOAT", "tfdtype$DT_FLOAT"], dense_shapes = [#tf.shape<>, #tf.shape<>], device = "", num_sparse = 2 : i64, ragged_split_types = [], ragged_value_types = [], result_segment_sizes = dense<[2, 2, 2, 2, 0, 0]> : vector<6xi32>, sparse_types = ["tfdtype$DT_STRING", "tfdtype$DT_INT64"]} : (tensor<32x!tf.string>, tensor<0x!tf.string>, tensor<2x!tf.string>, tensor<2x!tf.string>, tensor<0x!tf.string>, tensor<0xf32>, tensor<0xf32>) -> (tensor, tensor, tensor, tensor, tensor<2xi64>, tensor<2xi64>, tensor<32xf32>, tensor<32xf32>) loc("ParseExample") + // CHECK: name: "ParseExample" // CHECK-NEXT: op: "ParseExampleV2" // CHECK-NEXT: input: "input0" // CHECK-NEXT: input: "tf.Const3" @@ -77,9 +74,9 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr tf_executor.fetch %outputs_10#0 : tensor } return %0#0 : tensor - // CHECK: name: "main" + // CHECK: name: "ParseExample/ParseExampleV2" // CHECK-NEXT: op: "_Retval" - // CHECK-NEXT: input: "ParseExample/ParseExampleV2" + // CHECK-NEXT: input: "ParseExample" } } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/preserve-entry-func-names.mlir b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/preserve-entry-func-names.mlir index 8f0b1369a45..46ed409735a 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/preserve-entry-func-names.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/preserve-entry-func-names.mlir @@ -3,22 +3,20 @@ func @main(%arg0: tensor<10xi32>, %arg1: tensor<10xi32>) -> tensor<10xi32> attributes {tf.entry_function = {inputs = "foo,bar", outputs = "Add"}} { %graph = tf_executor.graph { - %0:2 = tf_executor.island wraps "tf.Placeholder.input"(%arg0) {device = "", dtype = "tfdtype$DT_INT32", shape = "tfshape$dim { size: 10 }"} : (tensor<10xi32>) -> tensor<10xi32> - %1:2 = tf_executor.island wraps "tf.Placeholder.input"(%arg1) {device = "", dtype = "tfdtype$DT_INT32", shape = "tfshape$dim { size: 10 }"} : (tensor<10xi32>) -> tensor<10xi32> // This node would be renamed to bar1 [note: if imported from TF graphdef this would not be possible] - %2:2 = tf_executor.island wraps "tf.Identity"(%1) {device = "", dtype = "tfdtype$DT_INT32"} : (tensor<10xi32>) -> tensor<10xi32> loc ("bar") + %2:2 = tf_executor.island wraps "tf.Identity"(%arg1) {device = "", dtype = "tfdtype$DT_INT32"} : (tensor<10xi32>) -> tensor<10xi32> loc ("bar") // The following node would be renamed to bar2 %3:2 = tf_executor.island wraps "tf.Identity"(%2) {device = "", dtype = "tfdtype$DT_INT32"} : (tensor<10xi32>) -> tensor<10xi32> loc ("bar") - %4:2 = tf_executor.island wraps "tf.Add"(%0, %3) {T = "tfdtype$DT_INT32", device = ""} : 
(tensor<10xi32>, tensor<10xi32>) -> tensor<10xi32> loc("Add") + %4:2 = tf_executor.island wraps "tf.Add"(%arg0, %3) {T = "tfdtype$DT_INT32", device = ""} : (tensor<10xi32>, tensor<10xi32>) -> tensor<10xi32> loc("Add") tf_executor.fetch %4#0 : tensor<10xi32> } return %graph : tensor<10xi32> } // CHECK: name: "foo" -// CHECK-NEXT: op: "Placeholder" +// CHECK-NEXT: op: "_Arg" // CHECK: name: "bar" -// CHECK-NEXT: op: "Placeholder" +// CHECK-NEXT: op: "_Arg" // CHECK: name: "[[BAR_ID_0:.*]]" // CHECK-NEXT: op: "Identity" // CHECK-NEXT: input: "bar" @@ -26,6 +24,5 @@ attributes {tf.entry_function = {inputs = "foo,bar", outputs = "Add"}} { // CHECK-NEXT: op: "Identity" // CHECK-NEXT: input: "[[BAR_ID_0]]" // CHECK: name: "Add" -// CHECK-NEXT: op: "Add" -// CHECK-NEXT: input: "foo" -// CHECK-NEXT: input: "[[BAR_ID_1:.*]]" +// CHECK-NEXT: op: "_Retval" +// CHECK-NEXT: input: "Add1" diff --git a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/ref-type-attr.mlir b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/ref-type-attr.mlir index 83ddf6205a8..3dac8d023e4 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/ref-type-attr.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/ref-type-attr.mlir @@ -12,7 +12,7 @@ func @main() { tf_executor.graph { - %0:2 = tf_executor.island wraps "tf.VariableV2"() {dtype = "tfdtype$DT_INT32", value = dense<2> : tensor} : () -> tensor loc("Ref_Variable") + %0:2 = tf_executor.island wraps "tf.VariableV2"() {dtype = "tfdtype$DT_INT32", value = dense<2> : tensor, shape = #tf.shape<2>, container = "", shared_name = ""} : () -> tensor loc("Ref_Variable") %1:2 = tf_executor.island wraps "tf.Identity"(%0#0) : (tensor) -> tensor<*x!tf.int32ref> loc("foo") tf_executor.fetch } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/ref-while-loop.mlir b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/ref-while-loop.mlir index 8b2d3938c35..fde62a72e4b 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/ref-while-loop.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/ref-while-loop.mlir @@ -9,7 +9,7 @@ func @main() { // CHECK: op: "RefNextIteration" tf_executor.graph { %0:3 = tf_executor.NextIteration.Source : tensor<*x!tf.int32ref> {device = "", T = "tfdtype$DT_INT32"} loc("while/NextIteration") - %1:2 = tf_executor.island wraps "tf.VariableV2"() {device = "", dtype = "tfdtype$DT_INT32", value = dense<0> : tensor} : () -> tensor loc("Ref_Variable") + %1:2 = tf_executor.island wraps "tf.VariableV2"() {device = "", dtype = "tfdtype$DT_INT32", value = dense<0> : tensor, shape = #tf.shape<0>, container = "", shared_name = ""} : () -> tensor loc("Ref_Variable") %2:2 = tf_executor.Enter %1#0 frame "while/while_context" parallel_iterations 10 : (tensor) -> (tensor<*x!tf.int32ref>, !tf_executor.control) {device = "", T = "tfdtype$DT_INT32"} loc("while/Enter") %3:3 = tf_executor.Merge %2#0, %0#0 : tensor<*x!tf.int32ref> {device = "", N = 2, T = "tfdtype$DT_INT32"} loc("while/Merge") %4:2 = tf_executor.island(%3#2) wraps "tf.Const"() {device = "", dtype = "tfdtype$DT_INT32", value = dense<10> : tensor} : () -> tensor loc("while/Less/y") diff --git a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/stringescape.mlir b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/stringescape.mlir index 1ab0195f33a..4b6600d3b16 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/stringescape.mlir +++ 
b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/stringescape.mlir @@ -11,7 +11,7 @@ func @main() { // CHECK-NEXT: value { // CHECK-NEXT: s: " 0\n\000\000" tf_executor.graph { - %0:2 = tf_executor.island wraps "tf.Empty"() {name = "dummy", dtype = "tfdtype$DT_INT32", value = "\200\n\00\00", listvalue = ["\20\0A"]} : () -> tensor<2xi32> + %0:2 = tf_executor.island wraps "tf.Placeholder"() {name = "dummy", dtype = "tfdtype$DT_INT32", value = "\200\n\00\00", listvalue = ["\20\0A"]} : () -> tensor<2xi32> tf_executor.fetch } return diff --git a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/tf-gradient-attr.mlir b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/tf-gradient-attr.mlir index 463c1fd63ec..cf319f41010 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/tf-gradient-attr.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/tf-gradient-attr.mlir @@ -5,35 +5,12 @@ func @main() { // CHECK: node { // CHECK-NEXT: name: "Const" // CHECK-NEXT: op: "Const" - // CHECK-NEXT: attr { - // CHECK: key: "dtype" - // CHECK-NEXT: value { - // CHECK-NEXT: type: DT_FLOAT - // CHECK-NEXT: } - // CHECK-NEXT: } - // CHECK-NEXT: attr { - // CHECK-NEXT: key: "value" - // CHECK-NEXT: value { - // CHECK-NEXT: tensor { - // CHECK-NEXT: dtype: DT_FLOAT - // CHECK-NEXT: tensor_shape { - // CHECK-NEXT: } - // CHECK-NEXT: float_val: 0.25 - // CHECK-NEXT: } - // CHECK-NEXT: } - // CHECK-NEXT: } - // CHECK-NEXT: experimental_debug_info { - // CHECK-NEXT: } - // CHECK-NEXT: } %0:2 = tf_executor.island wraps "tf.Const"() {device = "", dtype = "tfdtype$DT_FLOAT", value = dense<2.500000e-01> : tensor} : () -> tensor loc("Const") // CHECK: node { // CHECK-NEXT: name: "foo" // CHECK-NEXT: op: "foo" // CHECK-NEXT: input: "Const" - // CHECK: experimental_debug_info { - // CHECK-NEXT: } - // CHECK-NEXT: } %1:2 = tf_executor.island wraps "tf.foo"(%0#0) {device = ""} : (tensor) -> tensor<*xf32> loc("foo") tf_executor.fetch } @@ -44,42 +21,10 @@ func @main() { // CHECK-NEXT: function { // CHECK-NEXT: signature { // CHECK-NEXT: name: "foo" -// CHECK-NEXT: input_arg { -// CHECK-NEXT: name: "foo" -// CHECK-NEXT: type: DT_FLOAT -// CHECK-NEXT: } -// CHECK-NEXT: output_arg { -// CHECK-NEXT: name: "foo1" -// CHECK-NEXT: type: DT_FLOAT -// CHECK-NEXT: } -// CHECK-NEXT: } -// CHECK-NEXT: ret { -// CHECK-NEXT: key: "foo1" -// CHECK-NEXT: value: "foo" -// CHECK-NEXT: } -// CHECK-NEXT: } -// CHECK-NEXT: function { +// CHECK: function { // CHECK-NEXT: signature { // CHECK-NEXT: name: "foo_grad" -// CHECK-NEXT: input_arg { -// CHECK-NEXT: name: "foo_grad" -// CHECK-NEXT: type: DT_FLOAT -// CHECK-NEXT: } -// CHECK-NEXT: input_arg { -// CHECK-NEXT: name: "foo_grad1" -// CHECK-NEXT: type: DT_FLOAT -// CHECK-NEXT: } -// CHECK-NEXT: output_arg { -// CHECK-NEXT: name: "foo_grad2" -// CHECK-NEXT: type: DT_FLOAT -// CHECK-NEXT: } -// CHECK-NEXT: } -// CHECK-NEXT: ret { -// CHECK-NEXT: key: "foo_grad2" -// CHECK-NEXT: value: "foo_grad" -// CHECK-NEXT: } -// CHECK-NEXT: } -// CHECK-NEXT: gradient { +// CHECK: gradient { // CHECK-NEXT: function_name: "foo" // CHECK-NEXT: gradient_func: "foo_grad" // CHECK-NEXT: } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/tf_add.mlir b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/tf_add.mlir index beb7312543b..db9e7d4c3e5 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/tf_add.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/tf_add.mlir @@ -3,9 +3,7 @@ func @main(%arg0: tensor<10xi32>, 
%arg1: tensor<10xi32>) -> tensor<10xi32> attributes {tf.entry_function = {inputs = "input0,input1", outputs = "Add"}} { %graph = tf_executor.graph { - %0:2 = tf_executor.island wraps "tf.Placeholder.input"(%arg0) {device = "", dtype = "tfdtype$DT_INT32", shape = "tfshape$dim { size: 10 }"} : (tensor<10xi32>) -> tensor<10xi32> - %1:2 = tf_executor.island wraps "tf.Placeholder.input"(%arg1) {device = "", dtype = "tfdtype$DT_INT32", shape = "tfshape$dim { size: 10 }"} : (tensor<10xi32>) -> tensor<10xi32> - %2:2 = tf_executor.island wraps "tf.Add"(%0#0, %1#0) {T = "tfdtype$DT_INT32", device = ""} : (tensor<10xi32>, tensor<10xi32>) -> tensor<10xi32> loc("Add") + %2:2 = tf_executor.island wraps "tf.Add"(%arg0, %arg1) {T = "tfdtype$DT_INT32", device = ""} : (tensor<10xi32>, tensor<10xi32>) -> tensor<10xi32> loc("Add") tf_executor.fetch %2 : tensor<10xi32> } return %graph : tensor<10xi32> @@ -13,66 +11,19 @@ attributes {tf.entry_function = {inputs = "input0,input1", outputs = "Add"}} { // CHECK: node { // CHECK-NEXT: name: "input0" -// CHECK-NEXT: op: "Placeholder" -// CHECK-NEXT: attr { -// CHECK: key: "dtype" -// CHECK-NEXT: value { -// CHECK-NEXT: type: DT_INT32 -// CHECK-NEXT: } -// CHECK-NEXT: } -// CHECK-NEXT: attr { -// CHECK-NEXT: key: "shape" -// CHECK-NEXT: value { -// CHECK-NEXT: shape { -// CHECK-NEXT: dim { -// CHECK-NEXT: size: 10 -// CHECK-NEXT: } -// CHECK-NEXT: } -// CHECK-NEXT: } -// CHECK-NEXT: } -// CHECK-NEXT: experimental_debug_info { -// CHECK-NEXT: } -// CHECK-NEXT: } -// CHECK-NEXT: node { +// CHECK-NEXT: op: "_Arg" +// CHECK: node { // CHECK-NEXT: name: "input1" -// CHECK-NEXT: op: "Placeholder" -// CHECK-NEXT: attr { -// CHECK: key: "dtype" -// CHECK-NEXT: value { -// CHECK-NEXT: type: DT_INT32 -// CHECK-NEXT: } -// CHECK-NEXT: } -// CHECK-NEXT: attr { -// CHECK-NEXT: key: "shape" -// CHECK-NEXT: value { -// CHECK-NEXT: shape { -// CHECK-NEXT: dim { -// CHECK-NEXT: size: 10 -// CHECK-NEXT: } -// CHECK-NEXT: } -// CHECK-NEXT: } -// CHECK-NEXT: } -// CHECK-NEXT: experimental_debug_info { -// CHECK-NEXT: } -// CHECK-NEXT: } -// CHECK-NEXT: node { -// CHECK-NEXT: name: "Add" +// CHECK-NEXT: op: "_Arg" +// CHECK: node { +// CHECK-NEXT: name: "Add1" // CHECK-NEXT: op: "Add" // CHECK-NEXT: input: "input0" // CHECK-NEXT: input: "input1" -// CHECK-NEXT: attr { -// CHECK-NEXT: key: "T" -// CHECK-NEXT: value { -// CHECK-NEXT: type: DT_INT32 -// CHECK-NEXT: } -// CHECK-NEXT: } -// CHECK: experimental_debug_info { -// CHECK-NEXT: } -// CHECK-NEXT: } -// CHECK-NEXT: node { -// CHECK-NEXT: name: "main" +// CHECK: node { +// CHECK-NEXT: name: "Add" // CHECK-NEXT: op: "_Retval" -// CHECK-NEXT: input: "Add" +// CHECK-NEXT: input: "Add1" // CHECK-NEXT: attr { // CHECK-NEXT: key: "T" // CHECK-NEXT: value { diff --git a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/type_attr.mlir b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/type_attr.mlir index 98af3c8347e..adc7ef1a19e 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/type_attr.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/type_attr.mlir @@ -26,8 +26,7 @@ func @main(%arg0 : tensor<16xf32>) { tf_executor.graph { - %0:2 = tf_executor.island wraps "tf.Placeholder.input"(%arg0) : (tensor<16xf32>) -> tensor<16xf32> - %1:2 = tf_executor.island wraps "tf.MlirPassthroughOp"(%0#0) {extra_type_attr = [tensor<5xi32>, tensor<16xf32>], Tinputs = [tensor<16xf32>], Toutputs = [tensor<16xf32>], mlir_module = ""} : (tensor<16xf32>) -> tensor<16xf32> + %1:2 = tf_executor.island wraps 
"tf.MlirPassthroughOp"(%arg0) {extra_type_attr = [tensor<5xi32>, tensor<16xf32>], Tinputs = [tensor<16xf32>], Toutputs = [tensor<16xf32>], mlir_module = ""} : (tensor<16xf32>) -> tensor<16xf32> tf_executor.fetch } return diff --git a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/type_list_attr.mlir b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/type_list_attr.mlir index 4a09af84438..466c5adb0e5 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/type_list_attr.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/type_list_attr.mlir @@ -14,7 +14,7 @@ func @main() { // CHECK-NEXT: type: DT_FLOAT // CHECK-NEXT: } // CHECK-NEXT: } - %0:2 = tf_executor.island wraps "tf.Empty"() {name = "dummy", dtype = "tfdtype$DT_FLOAT", emptylist = [], typelist = ["tfdtype$DT_INT32", "tfdtype$DT_FLOAT"]} : () -> tensor<*xi32> + %0:2 = tf_executor.island wraps "tf.Placeholder"() {name = "dummy", dtype = "tfdtype$DT_FLOAT", emptylist = [], typelist = ["tfdtype$DT_INT32", "tfdtype$DT_FLOAT"]} : () -> tensor<*xi32> tf_executor.fetch } return diff --git a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/while-loop.mlir b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/while-loop.mlir index fb2eac81278..83f756ff6e7 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/while-loop.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/while-loop.mlir @@ -10,7 +10,7 @@ func @main() { // CHECK-NEXT: input: "while/Add" tf_executor.graph { %0:3 = tf_executor.NextIteration.Source : tensor<*xi32> {device = "", T = "tfdtype$DT_INT32"} loc("while/NextIteration") - %1:2 = tf_executor.island wraps "tf.VariableV2"() {device = "", dtype = "tfdtype$DT_INT32", value = dense<0> : tensor} : () -> tensor loc("Ref_Variable") + %1:2 = tf_executor.island wraps "tf.VariableV2"() {device = "", dtype = "tfdtype$DT_INT32", value = dense<0> : tensor, shape = #tf.shape<0>, container = "", shared_name = ""} : () -> tensor loc("Ref_Variable") %2:2 = tf_executor.Enter %1#0 frame "while/while_context" parallel_iterations 10 : (tensor) -> (tensor<*xi32>, !tf_executor.control) {device = "", T = "tfdtype$DT_INT32"} loc("while/Enter") %3:3 = tf_executor.Merge %2#0, %0#0 : tensor<*xi32> {device = "", N = 2, T = "tfdtype$DT_INT32"} loc("while/Merge") %4:2 = tf_executor.island(%3#2) wraps "tf.Const"() {device = "", dtype = "tfdtype$DT_INT32", value = dense<10> : tensor} : () -> tensor loc("while/Less/y") diff --git a/tensorflow/compiler/mlir/tensorflow/tests/promote_resources_to_args.mlir b/tensorflow/compiler/mlir/tensorflow/tests/promote_resources_to_args.mlir index 28da3438520..60663f4bd4a 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/promote_resources_to_args.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/promote_resources_to_args.mlir @@ -1,11 +1,11 @@ // RUN: tf-opt %s -split-input-file -verify-diagnostics -tf-promote-resources-to-args | FileCheck %s -dump-input-on-failure -// One resource, one read. -// CHECK-LABEL: func @main(%arg0: tensor {tf.resource_name = "x"}) -> tensor<2xf32> -func @main() -> tensor<2xf32> { +// One resource, one read. The initial value of the resource is read. 
+// CHECK-LABEL: func @main(%arg0: tensor, %arg1: tensor {tf.resource_name = "x"}) -> tensor<2xf32> +func @main(%arg0: tensor) -> tensor<2xf32> { // CHECK-NOT: "tf.VarHandleOp" // CHECK-NOT: "tf.ReadVariableOp" - // CHECK: %[[ADD:[0-9]*]] = "tf.AddV2"(%arg0, %[[CONST:[0-9]*]]) + // CHECK: %[[ADD:[0-9]*]] = "tf.AddV2"(%arg1, %[[CONST:[0-9]*]]) // CHECK: %[[PACK:[0-9]*]] = "tf.Pack"(%[[CONST]], %[[ADD]]) // CHECK: return %[[PACK]] %0 = "tf.Const"() {value = dense<4.200000e+01> : tensor} : () -> tensor @@ -18,13 +18,27 @@ func @main() -> tensor<2xf32> { // ----- +// One resource, one write. The initial value of the resource is not read. +// CHECK-LABEL: func @main(%arg0: tensor) -> (tensor {tf.resource_name = "x"}) +func @main(%arg0: tensor) { + // CHECK-NOT: "tf.VarHandleOp" + // CHECK-NOT: "tf.AssignVariableOp" + // CHECK: return %[[CONST]] + %0 = "tf.Const"() {value = dense<4.200000e+01> : tensor} : () -> tensor + %1 = "tf.VarHandleOp"() {container = "", shape = "tfshape$", shared_name = "x"} : () -> tensor>> + "tf.AssignVariableOp"(%1, %0) : (tensor>>, tensor) -> () + return +} + +// ----- + // One resource, two reads using different resource handles. -// CHECK-LABEL: func @main(%arg0: tensor {tf.resource_name = "x"}) -> tensor<2xf32> -func @main() -> tensor<2xf32> { +// CHECK-LABEL: func @main(%arg0: tensor, %arg1: tensor {tf.resource_name = "x"}) -> tensor<2xf32> +func @main(%arg0: tensor) -> tensor<2xf32> { // CHECK-NOT: "tf.VarHandleOp" // CHECK-NOT: "tf.ReadVariableOp" - // CHECK: %[[ADD1:[0-9]*]] = "tf.AddV2"(%arg0, %[[CONST:[0-9]*]]) - // CHECK: %[[ADD2:[0-9]*]] = "tf.AddV2"(%[[ADD1]], %arg0) + // CHECK: %[[ADD1:[0-9]*]] = "tf.AddV2"(%arg1, %[[CONST:[0-9]*]]) + // CHECK: %[[ADD2:[0-9]*]] = "tf.AddV2"(%[[ADD1]], %arg1) // CHECK: %[[PACK:[0-9]*]] = "tf.Pack"(%[[CONST]], %[[ADD2]]) // CHECK: return %[[PACK]] @@ -42,12 +56,12 @@ func @main() -> tensor<2xf32> { // ----- // Two resources, two reads using different resources. -// CHECK-LABEL: func @main(%arg0: tensor {tf.resource_name = "x"}, %arg1: tensor {tf.resource_name = "y"}) -> tensor<2xf32> -func @main() -> tensor<2xf32> { +// CHECK-LABEL: func @main(%arg0: tensor, %arg1: tensor {tf.resource_name = "x"}, %arg2: tensor {tf.resource_name = "y"}) -> tensor<2xf32> +func @main(%arg0: tensor) -> tensor<2xf32> { // CHECK-NOT: "tf.VarHandleOp" // CHECK-NOT: "tf.ReadVariableOp" - // CHECK: %[[ADD1:[0-9]*]] = "tf.AddV2"(%arg0, %[[CONST:[0-9]*]]) - // CHECK: %[[ADD2:[0-9]*]] = "tf.AddV2"(%[[ADD1]], %arg1) + // CHECK: %[[ADD1:[0-9]*]] = "tf.AddV2"(%arg1, %[[CONST:[0-9]*]]) + // CHECK: %[[ADD2:[0-9]*]] = "tf.AddV2"(%[[ADD1]], %arg2) // CHECK: %[[PACK:[0-9]*]] = "tf.Pack"(%[[CONST]], %[[ADD2]]) // CHECK: return %[[PACK]] @@ -64,13 +78,13 @@ func @main() -> tensor<2xf32> { // ----- -// One resource with read and write. -// CHECK-LABEL: func @main(%arg0: tensor {tf.aliasing_output = 1 : i64, tf.resource_name = "x"}) -> (tensor<2xf32>, tensor) -func @main() -> tensor<2xf32> { +// One resource with read and write. The initial value of the resource is read. 
+// CHECK-LABEL: func @main(%arg0: tensor, %arg1: tensor {tf.aliasing_output = 1 : i64, tf.resource_name = "x"}) -> (tensor<2xf32>, tensor) +func @main(%arg0: tensor) -> tensor<2xf32> { // CHECK-NOT: "tf.AssignVariableOp" - // CHECK: %[[ADD1:[0-9]*]] = "tf.AddV2"(%arg0, %{{[0-9]*}}) + // CHECK: %[[ADD1:[0-9]*]] = "tf.AddV2"(%arg1, %{{[0-9]*}}) // CHECK: %[[ADD2:[0-9]*]] = "tf.AddV2"(%[[ADD1]], %[[ADD1]]) - // CHECK: %[[PACK:[0-9]*]] = "tf.Pack"(%arg0, %[[ADD2]]) + // CHECK: %[[PACK:[0-9]*]] = "tf.Pack"(%arg1, %[[ADD2]]) // CHECK: return %[[PACK]], %[[ADD1]] %0 = "tf.Const"() {value = dense<4.200000e+01> : tensor} : () -> tensor @@ -87,6 +101,31 @@ func @main() -> tensor<2xf32> { // ----- +// One resource with read and write. The initial value of the resource is not read. +// CHECK-LABEL: func @main(%arg0: tensor) -> (tensor<2xf32>, tensor {tf.resource_name = "x"}) +func @main(%arg0: tensor) -> tensor<2xf32> { + // CHECK-NOT: "tf.AssignVariableOp" + // CHECK: %[[CONST:[a-z0-9]+]] = "tf.Const"() {value = dense<4.200000e+01> : tensor} + // CHECK: %[[ADD1:[0-9]*]] = "tf.AddV2"(%[[CONST]], %[[CONST]]) + // CHECK: %[[ADD2:[0-9]*]] = "tf.AddV2"(%[[ADD1]], %[[ADD1]]) + // CHECK: %[[PACK:[0-9]*]] = "tf.Pack"(%[[CONST]], %[[ADD2]]) + // CHECK: return %[[PACK]], %[[ADD1]] + + %0 = "tf.Const"() {value = dense<4.200000e+01> : tensor} : () -> tensor + %1 = "tf.VarHandleOp"() {container = "", shape = "tfshape$", shared_name = "x"} : () -> tensor>> + "tf.AssignVariableOp"(%1, %0) : (tensor>>, tensor) -> () + %2 = "tf.ReadVariableOp"(%1) : (tensor>>) -> tensor + %3 = "tf.ReadVariableOp"(%1) : (tensor>>) -> tensor + %4 = "tf.AddV2"(%3, %0) : (tensor, tensor) -> tensor + "tf.AssignVariableOp"(%1, %4) : (tensor>>, tensor) -> () + %5 = "tf.ReadVariableOp"(%1) : (tensor>>) -> tensor + %6 = "tf.AddV2"(%4, %5) : (tensor, tensor) -> tensor + %7 = "tf.Pack"(%2, %6) : (tensor, tensor) -> tensor<2xf32> + return %7 : tensor<2xf32> +} + +// ----- + // A resource is passed into tf.If func @cond_false(%arg0: tensor>>, %arg1: tensor) -> tensor { return %arg1 : tensor @@ -99,14 +138,14 @@ func @cond_true(%arg0: tensor>>, %arg1: tensor) -> return %2 : tensor } -// CHECK-LABEL: func @main(%arg0: tensor {tf.resource_name = "x"}) -> tensor<2xf32> -func @main() -> tensor<2xf32> attributes {tf.entry_function = {inputs = "", outputs = "result"}} { +// CHECK-LABEL: func @main(%arg0: tensor, %arg1: tensor {tf.resource_name = "x"}) -> tensor<2xf32> +func @main(%arg0: tensor) -> tensor<2xf32> attributes {tf.entry_function = {inputs = "", outputs = "result"}} { %0 = "tf.Const"() {value = dense<1.050000e+03> : tensor} : () -> tensor %1 = "tf.VarHandleOp"() {container = "", shape = "tfshape$", shared_name = "x"} : () -> tensor>> %2 = "tf.ReadVariableOp"(%1) : (tensor>>) -> tensor %3 = "tf.Less"(%2, %0) : (tensor, tensor) -> tensor %4 = "tf.If"(%3, %1, %2) {Tcond = i1, Tin = ["tfdtype$DT_RESOURCE", "tfdtype$DT_FLOAT"], Tout = ["tfdtype$DT_FLOAT"], - else_branch = @cond_false, is_stateless = false, output_shapes = ["tfshape$"], + else_branch = @cond_false, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @cond_true} : (tensor, tensor>>, tensor) -> tensor %5 = "tf.Identity"(%4) : (tensor) -> tensor %6 = "tf.Pack"(%2, %5) {N = 2 : i64, T = f32, axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xf32> @@ -118,10 +157,11 @@ func @main() -> tensor<2xf32> attributes {tf.entry_function = {inputs = "", outp // Tests resource passed in as an argument is not modified and not returned. 
// CHECK-LABEL: func @main -// CHECK-SAME: %[[ARG_0:[a-z0-9]+]]: tensor -func @main(%arg0: tensor>>) { - %0 = "tf.ReadVariableOp"(%arg0) : (tensor>>) -> tensor - // CHECK-NEXT: "tf.AddV2"(%[[ARG_0]], %[[ARG_0]]) +// CHECK-SAME: %arg0: tensor +// CHECK-SAME: %[[ARG_1:[a-z0-9]+]]: tensor +func @main(%arg0: tensor, %arg1: tensor>>) { + %0 = "tf.ReadVariableOp"(%arg1) : (tensor>>) -> tensor + // CHECK-NEXT: "tf.AddV2"(%[[ARG_1]], %[[ARG_1]]) %1 = "tf.AddV2"(%0, %0) : (tensor, tensor) -> tensor // CHECK-NEXT: return return @@ -132,9 +172,10 @@ func @main(%arg0: tensor>>) { // Tests resource passed in as an argument is modified but not returned. // CHECK-LABEL: func @main -// CHECK-SAME: %[[ARG_0:[a-z0-9]+]]: tensor {tf.aliasing_output = 0 : i64} +// CHECK-SAME: %{{[a-z0-9]+}}: tensor {tf.aliasing_output = 0 : i64} +// CHECK-SAME: %arg1: tensor // CHECK-SAME: -> tensor -func @main(%arg0: tensor>>) { +func @main(%arg0: tensor>>, %arg1: tensor) { // CHECK-NEXT: %[[CONST:[a-z0-9]+]] = "tf.Const" %0 = "tf.Const"() {value = dense<4.200000e+01> : tensor} : () -> tensor "tf.AssignVariableOp"(%arg0, %0) : (tensor>>, tensor) -> () @@ -147,9 +188,10 @@ func @main(%arg0: tensor>>) { // Tests last resource assign is returned as a result. // CHECK-LABEL: func @main -// CHECK-SAME: %[[ARG_0:[a-z0-9]+]]: tensor {tf.aliasing_output = 0 : i64} +// CHECK-SAME: %{{[a-z0-9]+}}: tensor {tf.aliasing_output = 0 : i64} +// CHECK-SAME: %arg1: tensor // CHECK-SAME: -> tensor -func @main(%arg0: tensor>>) { +func @main(%arg0: tensor>>, %arg1: tensor) { %0 = "tf.Const"() {value = dense<4.200000e+01> : tensor} : () -> tensor "tf.AssignVariableOp"(%arg0, %0) : (tensor>>, tensor) -> () // CHECK: %[[CONST:[a-z0-9]+]] = "tf.Const"() {value = dense<1.050000e+03> : tensor} @@ -165,9 +207,10 @@ func @main(%arg0: tensor>>) { // returns the same value prior. // CHECK-LABEL: func @main -// CHECK-SAME: %[[ARG_0:[a-z0-9]+]]: tensor {tf.aliasing_output = 1 : i64} +// CHECK-SAME: %{{[a-z0-9]+}}: tensor {tf.aliasing_output = 1 : i64} +// CHECK-SAME: %arg1: tensor // CHECK-SAME: -> (tensor, tensor) -func @main(%arg0: tensor>>) -> tensor { +func @main(%arg0: tensor>>, %arg1: tensor) -> tensor { %0 = "tf.Const"() {value = dense<4.200000e+01> : tensor} : () -> tensor "tf.AssignVariableOp"(%arg0, %0) : (tensor>>, tensor) -> () // CHECK: %[[CONST:[a-z0-9]+]] = "tf.Const"() {value = dense<1.050000e+03> : tensor} @@ -182,9 +225,10 @@ func @main(%arg0: tensor>>) -> tensor { // Tests read interleaved between writes. // CHECK-LABEL: func @main -// CHECK-SAME: %[[ARG_0:[a-z0-9]+]]: tensor {tf.aliasing_output = 1 : i64} +// CHECK-SAME: %{{[a-z0-9]+}}: tensor {tf.aliasing_output = 1 : i64} +// CHECK-SAME: %arg1: tensor // CHECK-SAME: -> (tensor, tensor) -func @main(%arg0: tensor>>) -> tensor { +func @main(%arg0: tensor>>, %arg1: tensor) -> tensor { // CHECK-NEXT: %[[CONST_0:[a-z0-9]+]] = "tf.Const"() {value = dense<4.200000e+01> : tensor} %0 = "tf.Const"() {value = dense<4.200000e+01> : tensor} : () -> tensor "tf.AssignVariableOp"(%arg0, %0) : (tensor>>, tensor) -> () @@ -232,7 +276,7 @@ func @main(%arg0: tensor>>, %arg1: tensor>>) -> tensor { %0 = "tf.VarIsInitializedOp"(%arg0) : (tensor>>) -> tensor + %1 = "tf.UnknownOp"(%arg0) : (tensor>>) -> tensor return %0 : tensor } @@ -284,7 +329,7 @@ func @main(%arg0: tensor>>) -> tensor { // Tests VarHandleOp has users that are not removed. 
func @main() -> tensor { - // expected-error@+1 {{expects no uses but used by operations: tf.UnknownOp, tf.VarIsInitializedOp}} + // expected-error@+1 {{expects users to be 'tf.ReadVariableOp' or 'tf.AssignVariableOp', got [tf.UnknownOp, tf.VarIsInitializedOp]}} %0 = "tf.VarHandleOp"() {container = "", shape = "tfshape$", shared_name = "x"} : () -> tensor>> %1 = "tf.VarIsInitializedOp"(%0) : (tensor>>) -> tensor %2 = "tf.UnknownOp"(%0) : (tensor>>) -> tensor diff --git a/tensorflow/compiler/mlir/tensorflow/tests/promote_var_handles_to_args.mlir b/tensorflow/compiler/mlir/tensorflow/tests/promote_var_handles_to_args.mlir new file mode 100644 index 00000000000..8b8a070cfab --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/promote_var_handles_to_args.mlir @@ -0,0 +1,59 @@ +// RUN: tf-opt %s -split-input-file -verify-diagnostics -tf-promote-var-handles-to-args | FileCheck %s -dump-input-on-failure + +// Tests main function with multiple blocks. + +// expected-error@+1 {{expects function 'main' to have 1 block, got 2}} +func @main() { + br ^bb1 +^bb1: + return +} + +// ----- + +// CHECK-LABEL: func @no_args +// CHECK-SAME: (%arg0: tensor {tf.resource_name = "x"}) +// CHECK-NOT: "tf.VarHandleOp" +func @no_args() { + %0 = "tf.VarHandleOp"() {container = "", shape = "tfshape$", shared_name = "x"} : () -> tensor + return +} + +// CHECK-LABEL: func @some_args +// CHECK-SAME: (%arg0: tensor, %arg1: tensor {tf.resource_name = "x"}) +// CHECK-NOT: "tf.VarHandleOp" +func @some_args(%arg0: tensor) { + %0 = "tf.VarHandleOp"() {container = "", shape = "tfshape$", shared_name = "x"} : () -> tensor + return +} + +// CHECK-LABEL: func @unique_vars +// CHECK-SAME: (%arg0: tensor>> {tf.resource_name = "x"}, %arg1: tensor>> {tf.resource_name = "y"}) +// CHECK-NOT: "tf.VarHandleOp" +func @unique_vars() { + %0 = "tf.VarHandleOp"() {container = "", shape = "tfshape$", shared_name = "x"} : () -> tensor>> + %1 = "tf.VarHandleOp"() {container = "", shape = "tfshape$", shared_name = "y"} : () -> tensor>> + return +} + +// CHECK-LABEL: func @duplicate_vars +// CHECK-SAME: (%arg0: tensor>> {tf.resource_name = "x"}) +// CHECK-NOT: "tf.VarHandleOp" +func @duplicate_vars() { + %0 = "tf.VarHandleOp"() {container = "", shape = "tfshape$", shared_name = "x"} : () -> tensor>> + %1 = "tf.VarHandleOp"() {container = "", shape = "tfshape$", shared_name = "x"} : () -> tensor>> + return +} + +// CHECK-LABEL: func @duplicate_vars_with_users +// CHECK-SAME: (%arg0: tensor, %arg1: tensor>> {tf.resource_name = "x"}) +// CHECK: "tf.ReadVariableOp"(%arg1) +// CHECK: "tf.AssignAddVariableOp"(%arg1, %arg0) +// CHECK-NOT: "tf.VarHandleOp" +func @duplicate_vars_with_users(%arg0: tensor) { + %0 = "tf.VarHandleOp"() {container = "", shape = "tfshape$", shared_name = "x"} : () -> tensor>> + %1 = "tf.ReadVariableOp"(%0) : (tensor>>) -> tensor + %2 = "tf.VarHandleOp"() {container = "", shape = "tfshape$", shared_name = "x"} : () -> tensor>> + "tf.AssignAddVariableOp"(%2, %arg0) : (tensor>>, tensor) -> () + return +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/replicate_to_island.mlir b/tensorflow/compiler/mlir/tensorflow/tests/replicate_to_island.mlir index 40508121598..8da252fc832 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/replicate_to_island.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/replicate_to_island.mlir @@ -18,11 +18,10 @@ func @controls_per_replica() { return } -// CHECK: %[[CT_0:[0-9]*]] = tf_executor.ControlTrigger -// CHECK: %[[CT_1:[0-9]*]] = tf_executor.ControlTrigger -// CHECK: 
%[[ISLAND_0:[a-z_0-9]*]] = tf_executor.island(%[[CT_0]], %[[CT_1]]) -// CHECK: %[[ISLAND_1:[a-z_0-9]*]] = tf_executor.island(%[[CT_0]], %[[CT_1]]) -// CHECK: %[[ISLAND_2:[a-z_0-9]*]] = tf_executor.island(%[[ISLAND_0]], %[[ISLAND_1]]) +// CHECK: %[[CT_0:.*]] = tf_executor.ControlTrigger +// CHECK: %[[CT_1:.*]] = tf_executor.ControlTrigger +// CHECK: %{{.*}} = tf_executor.island(%[[CT_0]], %[[CT_1]]) +// CHECK: %{{.*}} = tf_executor.island(%[[CT_0]], %[[CT_1]]) // Tests devices are not remapped if no devices were defined in replicate. @@ -100,35 +99,45 @@ func @remap_device() { // CHECK: device = "/GPU:1" -// Tests unused per replica island are added as a control dependency to the -// island forwarding per replica results. -// CHECK-LABEL: func @unused_replica_control -// CHECK-SAME: (%[[ARG_0:[a-z0-9]*]]: tensor, %[[ARG_1:[a-z0-9]*]]: tensor) -func @unused_replica_control(%arg0: tensor, %arg1: tensor) { - %0 = tf_executor.graph { - %1 = tf_executor.ControlTrigger {} - %2:2 = tf_executor.island(%1) { - %3:4 = tf_device.replicate([%arg0, %arg1] as %ri: tensor) {n = 2 : i32} { - %4 = "tf.opA"(%ri) : (tensor) -> tensor - %5 = "tf.opB"(%4) : (tensor) -> tensor - tf_device.return %4, %5 : tensor, tensor +// Tests replicate with control dependency output has each expanded replica +// control pinned to a sink island. +// CHECK-LABEL: func @replicate_control +func @replicate_control() { + tf_executor.graph { + %1 = tf_executor.island { + tf_device.replicate {n = 2 : i32} { + tf_device.return } - tf_executor.yield %3#0 : tensor + tf_executor.yield } - tf_executor.fetch %2#0 : tensor + tf_executor.fetch %1 : !tf_executor.control } return } -// CHECK: %[[CT:[0-9]*]] = tf_executor.ControlTrigger -// CHECK: %[[ISLAND_0:[a-z_0-9]*]]:2, %{{.*}} = tf_executor.island(%[[CT]]) -// CHECK: %[[OP_A_0:[0-9]*]] = "tf.opA"(%[[ARG_0]]) -// CHECK: %[[OP_B_0:[0-9]*]] = "tf.opB"(%[[OP_A_0]]) -// CHECK: tf_executor.yield %[[OP_A_0]], %[[OP_B_0]] -// CHECK: %[[ISLAND_1:[a-z_0-9]*]]:2, %[[ISLAND_1_control:[a-z_0-9]*]] = tf_executor.island(%[[CT]]) -// CHECK: %[[OP_A_1:[0-9]*]] = "tf.opA"(%[[ARG_1]]) -// CHECK: %[[OP_B_1:[0-9]*]] = "tf.opB"(%[[OP_A_1]]) -// CHECK: tf_executor.yield %[[OP_A_1]], %[[OP_B_1]] -// CHECK: %[[ISLAND_2:.*]], %[[ISLAND_2_control:.*]] = tf_executor.island(%[[ISLAND_1_control]]) -// CHECK: tf_executor.yield %[[ISLAND_0]]#0 -// CHECK: tf_executor.fetch %[[ISLAND_2]] +// CHECK: %[[REPLICA_0:.*]] = tf_executor.island +// CHECK: %[[REPLICA_1:.*]] = tf_executor.island +// CHECK: %[[SINK:.*]] = tf_executor.island(%[[REPLICA_0]], %[[REPLICA_1]]) +// CHECK: tf_executor.fetch %[[SINK]] + + +// Tests replicate results are remapped correctly. 
+// CHECK-LABEL: func @replicate_result +func @replicate_result(%arg0: tensor, %arg1: tensor) { + %0:4 = tf_executor.graph { + %1:5 = tf_executor.island { + %2:4 = tf_device.replicate([%arg0, %arg1] as %arg2: tensor) {n = 2 : i32} { + %3 = "tf.opA"(%arg2) : (tensor) -> tensor + %4 = "tf.opB"(%arg2) : (tensor) -> tensor + tf_device.return %3, %4 : tensor, tensor + } + tf_executor.yield %2#0, %2#1, %2#2, %2#3 : tensor, tensor, tensor, tensor + } + tf_executor.fetch %1#0, %1#1, %1#2, %1#3 : tensor, tensor, tensor, tensor + } + return +} + +// CHECK: %[[REPLICA_0:.*]]:2, %{{.*}} = tf_executor.island +// CHECK: %[[REPLICA_1:.*]]:2, %{{.*}} = tf_executor.island +// CHECK: tf_executor.fetch %[[REPLICA_0]]#0, %[[REPLICA_1]]#0, %[[REPLICA_0]]#1, %[[REPLICA_1]]#1 diff --git a/tensorflow/compiler/mlir/tensorflow/tests/resource_inlining.mlir b/tensorflow/compiler/mlir/tensorflow/tests/resource_inlining.mlir new file mode 100644 index 00000000000..788c6e2f5a1 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/resource_inlining.mlir @@ -0,0 +1,25 @@ +// RUN: tf-opt -tf-shape-inference -inline="disable-simplify" %s | FileCheck %s --dump-input=always +// RUN: tf-opt -tf-standard-pipeline=enable-inliner %s | FileCheck %s --dump-input=always + +// Tests function with argument has no resource subtype but caller operand has a +// resource subtype, and after shape inference, function argument is refined and +// no `tf.Cast` ops are generated. + +module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 384 : i32}} { + // CHECK-LABEL: func @main + func @main() -> tensor { + // CHECK-NEXT: %[[VAR:.*]] = "tf.VarHandleOp" + // CHECK-NEXT: %[[READ_VAR:.*]] = "tf.ReadVariableOp"(%[[VAR]]) + // CHECK-NEXT: return %[[READ_VAR]] + // CHECK-NOT: "tf.Cast" + %0 = "tf.VarHandleOp"() {_class = ["loc:@Variable"], allowed_devices = [], container = "", device = "", shared_name = "Variable"} : () -> tensor>> + %1 = "tf.StatefulPartitionedCall"(%0) {config = "", config_proto = "", executor_type = "", f = @callee} : (tensor>>) -> tensor + return %1 : tensor + } + + // CHECK-NOT: func @callee + func @callee(%arg0: tensor) -> tensor<*xf32> attributes {sym_visibility = "private", tf.signature.is_stateful} { + %0 = "tf.ReadVariableOp"(%arg0) {device = ""} : (tensor) -> tensor<*xf32> + return %0 : tensor<*xf32> + } +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/resource_op_lifting.mlir b/tensorflow/compiler/mlir/tensorflow/tests/resource_op_lifting.mlir index bf4e6c1853c..9e7358ab2f5 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/resource_op_lifting.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/resource_op_lifting.mlir @@ -9,17 +9,17 @@ func @only_resource_load() -> tensor<*xi32> { %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource> // CHECK: %[[RES_READ_VAL:[0-9]*]] = "tf.ReadVariableOp"(%[[RES_HANDLE]]) {dtype = i32} - // CHECK: "tf_device.launch" + // CHECK: "tf_device.cluster" // CHECK: %[[COMPUTE_RES:[0-9]*]] = "tf.SomeComputation"(%[[RES_READ_VAL]]) // CHECK: tf_device.return %[[COMPUTE_RES]] - // CHECK: {device = "tpu0", launch_attr = "launch_attr"} + // CHECK: {cluster_attr = "cluster_attr"} // CHECK-SAME: () -> tensor<*xi32> - %1 = "tf_device.launch"() ( { + %1 = "tf_device.cluster"() ( { %2 = "tf.ReadVariableOp"(%0) {dtype = i32} : (tensor<*x!tf.resource>) -> tensor<*xi32> %3 = "tf.SomeComputation"(%2) : (tensor<*xi32>) -> (tensor<*xi32>) tf_device.return %3 : tensor<*xi32> - }) {device = "tpu0", launch_attr = 
"launch_attr"} : () -> tensor<*xi32> + }) {cluster_attr = "cluster_attr"} : () -> tensor<*xi32> return %1 : tensor<*xi32> } @@ -34,20 +34,20 @@ func @only_resource_store() -> tensor<*xi32> { // CHECK: %[[RES_HANDLE:[0-9]*]] = "tf.VarHandleOp" %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource> - // CHECK: %[[LAUNCH_RES:[0-9]*]]:2 = "tf_device.launch" + // CHECK: %[[CLUSTER_RES:[0-9]*]]:2 = "tf_device.cluster" // CHECK: %[[COMPUTE_RES:[0-9]*]] = "tf.SomeComputation"() // CHECK: tf_device.return %[[COMPUTE_RES]], %[[COMPUTE_RES]] - // CHECK: {device = "tpu0", launch_attr = "launch_attr"} + // CHECK: {cluster_attr = "cluster_attr"} // CHECK-SAME: () -> (tensor<*xi32>, tensor<*xi32>) - // CHECK: "tf.AssignVariableOp"(%[[RES_HANDLE]], %[[LAUNCH_RES]]#1) {dtype = i32} + // CHECK: "tf.AssignVariableOp"(%[[RES_HANDLE]], %[[CLUSTER_RES]]#1) {dtype = i32} - %1 = "tf_device.launch"() ( { + %1 = "tf_device.cluster"() ( { %2 = "tf.SomeComputation"() : () -> (tensor<*xi32>) "tf.AssignVariableOp"(%0, %2) {dtype = i32} : (tensor<*x!tf.resource>, tensor<*xi32>) -> () tf_device.return %2 : tensor<*xi32> - }) {device = "tpu0", launch_attr = "launch_attr"} : () -> tensor<*xi32> + }) {cluster_attr = "cluster_attr"} : () -> tensor<*xi32> - // CHECK: return %[[LAUNCH_RES]]#0 + // CHECK: return %[[CLUSTER_RES]]#0 return %1 : tensor<*xi32> } @@ -62,21 +62,21 @@ func @same_resource_load_and_store() -> tensor<*xi32> { %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource> // CHECK: %[[RES_READ_VAL:[0-9]*]] = "tf.ReadVariableOp"(%[[RES_HANDLE]]) {dtype = i32} - // CHECK: %[[LAUNCH_RES:[0-9]*]]:2 = "tf_device.launch" + // CHECK: %[[CLUSTER_RES:[0-9]*]]:2 = "tf_device.cluster" // CHECK: %[[COMPUTE_RES:[0-9]*]] = "tf.SomeComputation"(%[[RES_READ_VAL]]) // CHECK: tf_device.return %[[COMPUTE_RES]], %[[COMPUTE_RES]] - // CHECK: {device = "tpu0", launch_attr = "launch_attr"} + // CHECK: {cluster_attr = "cluster_attr"} // CHECK-SAME: () -> (tensor<*xi32>, tensor<*xi32>) - // CHECK: "tf.AssignVariableOp"(%[[RES_HANDLE]], %[[LAUNCH_RES]]#1) {dtype = i32} + // CHECK: "tf.AssignVariableOp"(%[[RES_HANDLE]], %[[CLUSTER_RES]]#1) {dtype = i32} - %1 = "tf_device.launch"() ( { + %1 = "tf_device.cluster"() ( { %2 = "tf.ReadVariableOp"(%0) {dtype = i32} : (tensor<*x!tf.resource>) -> tensor<*xi32> %3 = "tf.SomeComputation"(%2) : (tensor<*xi32>) -> (tensor<*xi32>) "tf.AssignVariableOp"(%0, %3) {dtype = i32} : (tensor<*x!tf.resource>, tensor<*xi32>) -> () tf_device.return %3 : tensor<*xi32> - }) {device = "tpu0", launch_attr = "launch_attr"} : () -> tensor<*xi32> + }) {cluster_attr = "cluster_attr"} : () -> tensor<*xi32> - // CHECK: return %[[LAUNCH_RES]]#0 + // CHECK: return %[[CLUSTER_RES]]#0 return %1 : tensor<*xi32> } @@ -87,8 +87,8 @@ func @same_resource_load_and_store() -> tensor<*xi32> { // CHECK-LABEL: func @internal_resource func @internal_resource() -> tensor<*xi32> { - // CHECK: %[[LAUNCH_RES:[0-9]*]] = "tf_device.launch" - %0 = "tf_device.launch"() ( { + // CHECK: %[[CLUSTER_RES:[0-9]*]] = "tf_device.cluster" + %0 = "tf_device.cluster"() ( { // CHECK: %[[RES_HANDLE:[0-9]*]] = "tf.VarHandleOp" %1 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource> @@ -104,9 +104,9 @@ func @internal_resource() -> tensor<*xi32> { // CHECK: tf_device.return %[[COMPUTE_RES]] tf_device.return %3 : tensor<*xi32> - }) {device = "tpu0", launch_attr = "launch_attr"} : () -> tensor<*xi32> + }) {cluster_attr = "cluster_attr"} : () -> 
tensor<*xi32> - // CHECK: return %[[LAUNCH_RES]] + // CHECK: return %[[CLUSTER_RES]] return %0 : tensor<*xi32> } @@ -120,12 +120,12 @@ func @lifting_failure() -> tensor<*xi32> { %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource> // expected-error @+1 {{has remaining resource inputs that can not be lifted}} - %1 = "tf_device.launch"() ( { + %1 = "tf_device.cluster"() ( { %2 = "tf.ReadVariableOp"(%0) {dtype = i32} : (tensor<*x!tf.resource>) -> tensor<*xi32> %3 = "tf.SomeResourceOp"(%0, %2) : (tensor<*x!tf.resource>, tensor<*xi32>) -> tensor<*xi32> "tf.AssignVariableOp"(%0, %3) {dtype = i32} : (tensor<*x!tf.resource>, tensor<*xi32>) -> () tf_device.return %3 : tensor<*xi32> - }) {device = "tpu0", launch_attr = "launch_attr"} : () -> tensor<*xi32> + }) {cluster_attr = "cluster_attr"} : () -> tensor<*xi32> return %1 : tensor<*xi32> } @@ -135,27 +135,27 @@ func @lifting_failure() -> tensor<*xi32> { // Tests that pass lifts resource reads/writes from a loop, and removed unused // resources. -// CHECK-LABEL: func @launch_with_loop -func @launch_with_loop() -> () { +// CHECK-LABEL: func @cluster_with_loop +func @cluster_with_loop() -> () { // CHECK: %[[COUNT:.*]] = "tf.Const"() {value = dense<10> : tensor} %0 = "tf.Const"() {value = dense<10> : tensor} : () -> tensor // CHECK: %[[VH:.*]] = "tf.VarHandleOp"() %1 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource>> %unused = "tf.VarHandleOp"() {container = "c", shared_name = "v2"} : () -> tensor<*x!tf.resource>> // CHECK: %[[READ:.*]] = "tf.ReadVariableOp"(%[[VH]]) - // CHECK: %[[LAUNCH:.*]] = "tf_device.launch"() - "tf_device.launch"() ( { + // CHECK: %[[CLUSTER:.*]] = "tf_device.cluster"() + "tf_device.cluster"() ( { // CHECK: %[[WHILE:.*]]:2 = "tf.While"(%[[COUNT]], %[[READ]]) %2:3 = "tf.While"(%0, %1, %unused) {body = @while_body, cond = @while_cond, device = "", is_stateless = false, - output_shapes = ["tfshape$", "tfshape$"]} + output_shapes = [#tf.shape<>, #tf.shape<>]} : (tensor, tensor<*x!tf.resource>>, tensor<*x!tf.resource>>) -> (tensor, tensor<*x!tf.resource>>, tensor<*x!tf.resource>>) // CHECK: tf_device.return %[[WHILE]]#1 : tensor tf_device.return - // CHECK: {device = "tpu0", launch_attr = "launch_attr"} : () -> tensor - }) {device = "tpu0", launch_attr = "launch_attr"} : () -> () - // CHECK: "tf.AssignVariableOp"(%[[VH]], %[[LAUNCH]]) + // CHECK: {cluster_attr = "cluster_attr"} : () -> tensor + }) {cluster_attr = "cluster_attr"} : () -> () + // CHECK: "tf.AssignVariableOp"(%[[VH]], %[[CLUSTER]]) // CHECK: return return } @@ -188,24 +188,24 @@ func @while_cond(%arg0: tensor, %arg1: tensor<*x!tf.resource>>, // Tests that pass lifts resource reads from loop condition. 
-// CHECK-LABEL: func @launch_with_loop -func @launch_with_loop() -> () { +// CHECK-LABEL: func @cluster_with_loop +func @cluster_with_loop() -> () { // CHECK: %[[VH:.*]] = "tf.VarHandleOp"() %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource>> // CHECK: %[[READ:.*]] = "tf.ReadVariableOp"(%[[VH]]) - // CHECK: %[[LAUNCH:.*]] = "tf_device.launch"() - "tf_device.launch"() ( { + // CHECK: %[[CLUSTER:.*]] = "tf_device.cluster"() + "tf_device.cluster"() ( { // CHECK: %[[WHILE:.*]] = "tf.While"(%[[READ]]) %1 = "tf.While"(%0) { body = @while_body, cond = @while_cond, device = "", is_stateless = false, - output_shapes = ["tfshape$"]} + output_shapes = [#tf.shape<>]} : (tensor<*x!tf.resource>>) -> (tensor<*x!tf.resource>>) // CHECK: tf_device.return %[[WHILE]] : tensor tf_device.return - // CHECK: {device = "tpu0", launch_attr = "launch_attr"} : () -> tensor - }) {device = "tpu0", launch_attr = "launch_attr"} : () -> () - // CHECK: "tf.AssignVariableOp"(%[[VH]], %[[LAUNCH]]) + // CHECK: {cluster_attr = "cluster_attr"} : () -> tensor + }) {cluster_attr = "cluster_attr"} : () -> () + // CHECK: "tf.AssignVariableOp"(%[[VH]], %[[CLUSTER]]) // CHECK: return return } @@ -230,23 +230,23 @@ func @while_cond(%arg0: tensor<*x!tf.resource>>) -> tensor { // Tests that pass lifts read-only resource reads from loop, but does not add // assign after the loop. -// CHECK-LABEL: func @launch_with_loop -func @launch_with_loop() -> () { +// CHECK-LABEL: func @cluster_with_loop +func @cluster_with_loop() -> () { // CHECK: %[[VH:.*]] = "tf.VarHandleOp"() %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource>> // CHECK: %[[READ:.*]] = "tf.ReadVariableOp"(%[[VH]]) - // CHECK: "tf_device.launch"() - "tf_device.launch"() ( { + // CHECK: "tf_device.cluster"() + "tf_device.cluster"() ( { // CHECK: %[[WHILE:.*]] = "tf.While"(%[[READ]]) %1 = "tf.While"(%0) { body = @while_body, cond = @while_cond, device = "", is_stateless = false, - output_shapes = ["tfshape$"]} + output_shapes = [#tf.shape<>]} : (tensor<*x!tf.resource>>) -> (tensor<*x!tf.resource>>) // CHECK: tf_device.return tf_device.return - // CHECK: {device = "tpu0", launch_attr = "launch_attr"} : () -> () - }) {device = "tpu0", launch_attr = "launch_attr"} : () -> () + // CHECK: {cluster_attr = "cluster_attr"} : () -> () + }) {cluster_attr = "cluster_attr"} : () -> () // CHECK-NOT: "tf.AssignVariableOp" // CHECK: return return @@ -267,26 +267,26 @@ func @while_cond(%arg0: tensor<*x!tf.resource>>) -> tensor { // Tests that pass lifts resource reads from nested loops. 
-// CHECK-LABEL: func @launch_with_nested_loop -func @launch_with_nested_loop() -> () { +// CHECK-LABEL: func @cluster_with_nested_loop +func @cluster_with_nested_loop() -> () { // CHECK: %[[VH:.*]] = "tf.VarHandleOp"() %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource>> // CHECK: %[[VH_UNUSED:.*]] = "tf.VarHandleOp"() %1 = "tf.VarHandleOp"() {container = "c", shared_name = "v2"} : () -> tensor<*x!tf.resource>> // CHECK: %[[READ:.*]] = "tf.ReadVariableOp"(%[[VH]]) - // CHECK: %[[LAUNCH:.*]] = "tf_device.launch"() - "tf_device.launch"() ( { + // CHECK: %[[CLUSTER:.*]] = "tf_device.cluster"() + "tf_device.cluster"() ( { // CHECK: %[[WHILE:.*]] = "tf.While"(%[[READ]]) %2:2 = "tf.While"(%0, %1) { body = @while_body, cond = @while_cond, device = "", is_stateless = false, - output_shapes = ["tfshape$", "tfshape$"]} + output_shapes = [#tf.shape<>, #tf.shape<>]} : (tensor<*x!tf.resource>>, tensor<*x!tf.resource>>) -> (tensor<*x!tf.resource>>, tensor<*x!tf.resource>>) // CHECK: tf_device.return %[[WHILE]] : tensor tf_device.return - // CHECK: {device = "tpu0", launch_attr = "launch_attr"} : () -> tensor - }) {device = "tpu0", launch_attr = "launch_attr"} : () -> () - // CHECK: "tf.AssignVariableOp"(%[[VH]], %[[LAUNCH]]) + // CHECK: {cluster_attr = "cluster_attr"} : () -> tensor + }) {cluster_attr = "cluster_attr"} : () -> () + // CHECK: "tf.AssignVariableOp"(%[[VH]], %[[CLUSTER]]) // CHECK: return return } @@ -296,7 +296,7 @@ func @while_body(%arg0: tensor<*x!tf.resource>>, %arg1: tensor<*x!tf // CHECK: %[[WHILE:.*]] = "tf.While"(%[[BARG0]]) %0:2 = "tf.While"(%arg0, %arg1) { body = @while_body1, cond = @while_cond1, device = "", is_stateless = false, - output_shapes = ["tfshape$", "tfshape$"]} + output_shapes = [#tf.shape<>, #tf.shape<>]} : (tensor<*x!tf.resource>>, tensor<*x!tf.resource>>) -> (tensor<*x!tf.resource>>, tensor<*x!tf.resource>>) // CHECK-NEXT: return %[[WHILE]] @@ -330,15 +330,15 @@ func @while_cond1(%arg0: tensor<*x!tf.resource>>, %arg1: tensor<*x!t // Tests that pass reports error on non-aliasing while input/output resources. -func @launch_with_loop() -> () { +func @cluster_with_loop() -> () { %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource>> - "tf_device.launch"() ( { + "tf_device.cluster"() ( { %1 = "tf.While"(%0) { body = @while_body, cond = @while_cond, device = "", is_stateless = false, - output_shapes = ["tfshape$"]} + output_shapes = [#tf.shape<>]} : (tensor<*x!tf.resource>>) -> (tensor<*x!tf.resource>>) tf_device.return - }) {device = "tpu0", launch_attr = "launch_attr"} : () -> () + }) {cluster_attr = "cluster_attr"} : () -> () return } func @while_body(%arg0: tensor<*x!tf.resource>>) -> (tensor<*x!tf.resource>>) { @@ -355,15 +355,15 @@ func @while_cond(%arg0: tensor<*x!tf.resource>>) -> tensor { // Tests that pass reports error on unsupported ops in loop body. 
-func @launch_with_loop() -> () { +func @cluster_with_loop() -> () { %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource>> - "tf_device.launch"() ( { + "tf_device.cluster"() ( { %1 = "tf.While"(%0) { body = @while_body, cond = @while_cond, device = "", is_stateless = false, - output_shapes = ["tfshape$"]} + output_shapes = [#tf.shape<>]} : (tensor<*x!tf.resource>>) -> (tensor<*x!tf.resource>>) tf_device.return - }) {device = "tpu0", launch_attr = "launch_attr"} : () -> () + }) {cluster_attr = "cluster_attr"} : () -> () return } func @while_body(%arg0: tensor<*x!tf.resource>>) -> (tensor<*x!tf.resource>>) { @@ -380,15 +380,15 @@ func @while_cond(%arg0: tensor<*x!tf.resource>>) -> tensor { // Tests that pass reports error on unsupported ops in loop cond. -func @launch_with_loop() -> () { +func @cluster_with_loop() -> () { %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource>> - "tf_device.launch"() ( { + "tf_device.cluster"() ( { %1 = "tf.While"(%0) { body = @while_body, cond = @while_cond, device = "", is_stateless = false, - output_shapes = ["tfshape$"]} + output_shapes = [#tf.shape<>]} : (tensor<*x!tf.resource>>) -> (tensor<*x!tf.resource>>) tf_device.return - }) {device = "tpu0", launch_attr = "launch_attr"} : () -> () + }) {cluster_attr = "cluster_attr"} : () -> () return } func @while_body(%arg0: tensor<*x!tf.resource>>) -> (tensor<*x!tf.resource>>) { @@ -408,19 +408,19 @@ func @while_cond(%arg0: tensor<*x!tf.resource>>) -> tensor { // Tests that pass lifts resource reads from if branches. -// CHECK: func @launch_with_if(%[[ARG0:.*]]: tensor) -> tensor<4xf32> -func @launch_with_if(%arg0: tensor) -> tensor<4xf32> { +// CHECK: func @cluster_with_if(%[[ARG0:.*]]: tensor) -> tensor<4xf32> +func @cluster_with_if(%arg0: tensor) -> tensor<4xf32> { // CHECK: %[[VH0:.*]] = "tf.VarHandleOp"() %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource>> // CHECK: %[[VH1:.*]] = "tf.VarHandleOp"() %1 = "tf.VarHandleOp"() {container = "c", shared_name = "v2"} : () -> tensor<*x!tf.resource>> // CHECK-DAG: %[[READ0:.*]] = "tf.ReadVariableOp"(%[[VH0]]) // CHECK-DAG: %[[READ1:.*]] = "tf.ReadVariableOp"(%[[VH1]]) - // CHECK: %[[LAUNCH:.*]]:2 = "tf_device.launch"() - %2 = "tf_device.launch"() ( { + // CHECK: %[[CLUSTER:.*]]:2 = "tf_device.cluster"() + %2 = "tf_device.cluster"() ( { // CHECK: %[[IF:.*]]:2 = "tf.If"(%[[ARG0]], %[[READ0]], %[[READ1]]) %3:2 = "tf.If"(%arg0, %0, %1) {then_branch = @if_then, else_branch = @if_else, - output_shapes = ["tfshape$","tfshape$dim { size: 4 }"], is_stateless = false} + output_shapes = [#tf.shape<>, #tf.shape<4>], is_stateless = false} : (tensor, tensor<*x!tf.resource>>, tensor<*x!tf.resource>>) -> (tensor<*x!tf.resource>>, tensor<4xf32>) // CHECK-NEXT: %[[ADD:.*]] = "tf.AddV2"(%[[IF]]#1, %[[IF]]#0) @@ -428,10 +428,10 @@ func @launch_with_if(%arg0: tensor) -> tensor<4xf32> { %5 = "tf.AddV2"(%4, %3#1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> // CHECK-NEXT: tf_device.return %[[ADD]], %[[IF]]#1 tf_device.return %5 : tensor<4xf32> - // CHECK: {device = "tpu0", launch_attr = "launch_attr"} : () -> (tensor<4xf32>, tensor<4xf32>) - }) {device = "tpu0", launch_attr = "launch_attr"} : () -> tensor<4xf32> - // CHECK: "tf.AssignVariableOp"(%[[VH0]], %[[LAUNCH]]#1) - // CHECK: return %[[LAUNCH]]#0 + // CHECK: {cluster_attr = "cluster_attr"} : () -> (tensor<4xf32>, tensor<4xf32>) + }) {cluster_attr = "cluster_attr"} : () -> tensor<4xf32> + // CHECK: 
"tf.AssignVariableOp"(%[[VH0]], %[[CLUSTER]]#1) + // CHECK: return %[[CLUSTER]]#0 return %2 : tensor<4xf32> } // CHECK: func @if_then(%[[TARG0:.*]]: tensor<4xf32>, %[[TARG1:.*]]: tensor<4xf32>) @@ -457,15 +457,15 @@ func @if_else(%arg0: tensor<*x!tf.resource>>, %arg1: tensor<*x!tf. // Tests that pass lifts resource reads from nested if ops. -// CHECK: func @launch_with_nested_if(%[[ARG0:.*]]: tensor) -> tensor -func @launch_with_nested_if(%arg0: tensor) -> tensor { +// CHECK: func @cluster_with_nested_if(%[[ARG0:.*]]: tensor) -> tensor +func @cluster_with_nested_if(%arg0: tensor) -> tensor { // CHECK: %[[VH0:.*]] = "tf.VarHandleOp"() %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource>> // CHECK: %[[VH1:.*]] = "tf.VarHandleOp"() %1 = "tf.VarHandleOp"() {container = "c", shared_name = "v2"} : () -> tensor<*x!tf.resource>> // CHECK-DAG: %[[READ0:.*]] = "tf.ReadVariableOp"(%[[VH0]]) - // CHECK: %[[LAUNCH:.*]]:2 = "tf_device.launch"() - %2 = "tf_device.launch"() ( { + // CHECK: %[[CLUSTER:.*]]:2 = "tf_device.cluster"() + %2 = "tf_device.cluster"() ( { // CHECK: %[[IF:.*]] = "tf.If"(%[[ARG0]], %[[READ0]]) %3 = "tf.If"(%arg0, %0, %1) {then_branch = @if_then, else_branch = @if_else, output_shapes = [], is_stateless = false} @@ -476,10 +476,10 @@ func @launch_with_nested_if(%arg0: tensor) -> tensor { %5 = "tf.AddV2"(%4, %4) : (tensor, tensor) -> tensor // CHECK-NEXT: tf_device.return %[[ADD]], %[[IF]] tf_device.return %5 : tensor - // CHECK: {device = "tpu0", launch_attr = "launch_attr"} : () -> (tensor, tensor) - }) {device = "tpu0", launch_attr = "launch_attr"} : () -> tensor - // CHECK: "tf.AssignVariableOp"(%[[VH0]], %[[LAUNCH]]#1) - // CHECK: return %[[LAUNCH]]#0 + // CHECK: {cluster_attr = "cluster_attr"} : () -> (tensor, tensor) + }) {cluster_attr = "cluster_attr"} : () -> tensor + // CHECK: "tf.AssignVariableOp"(%[[VH0]], %[[CLUSTER]]#1) + // CHECK: return %[[CLUSTER]]#0 return %2 : tensor } // CHECK: func @if_then(%[[TARG0:.*]]: tensor) @@ -520,18 +520,18 @@ func @inner_if_else(%arg0: tensor<*x!tf.resource>>) // Tests that the pass reports error for ambiguous resource aliasing. -func @launch_with_if(%arg0: tensor) -> tensor<4xf32> { +func @cluster_with_if(%arg0: tensor) -> tensor<4xf32> { %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource>> %1 = "tf.VarHandleOp"() {container = "c", shared_name = "v2"} : () -> tensor<*x!tf.resource>> - %2 = "tf_device.launch"() ( { + %2 = "tf_device.cluster"() ( { // expected-error @+1 {{unsupported tf.IfOp output: resource does not alias a single input.}} %3 = "tf.If"(%arg0, %0, %1) {then_branch = @if_then, else_branch = @if_else, - output_shapes = ["tfshape$"], is_stateless = false} + output_shapes = [#tf.shape<>], is_stateless = false} : (tensor, tensor<*x!tf.resource>>, tensor<*x!tf.resource>>) -> (tensor<*x!tf.resource>>) %4 = "tf.ReadVariableOp"(%3) : (tensor<*x!tf.resource>>) -> tensor<4xf32> tf_device.return %4 : tensor<4xf32> - }) {device = "tpu0", launch_attr = "launch_attr"} : () -> tensor<4xf32> + }) {cluster_attr = "cluster_attr"} : () -> tensor<4xf32> return %2 : tensor<4xf32> } func @if_then(%arg0: tensor<*x!tf.resource>>, %arg1: tensor<*x!tf.resource>>) @@ -548,15 +548,15 @@ func @if_else(%arg0: tensor<*x!tf.resource>>, %arg1: tensor<*x!tf. // Tests that the pass lifts resources on two partitioned call ops sharing the // same callee. The lifting should clone the callee then modify the clone. 
-// CHECK-LABEL: @launch_with_partitioned_call -func @launch_with_partitioned_call() -> tensor { +// CHECK-LABEL: @cluster_with_partitioned_call +func @cluster_with_partitioned_call() -> tensor { // CHECK: %[[VH:.*]] = "tf.VarHandleOp"() %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource>> // CHECK: %[[CONST:.*]] = "tf.Const"() %1 = "tf.Const"() {value = dense<10.0> : tensor} : () -> tensor // CHECK: %[[READ:.*]] = "tf.ReadVariableOp"(%[[VH]]) - // CHECK: %[[LAUNCH:.*]] = "tf_device.launch"() - %2 = "tf_device.launch"() ( { + // CHECK: %[[CLUSTER:.*]] = "tf_device.cluster"() + %2 = "tf_device.cluster"() ( { // CHECK: %[[PC0:.*]] = "tf.PartitionedCall"(%[[CONST]], %[[READ]], %[[CONST]]) // CHECK-SAME: f = @callee_resource_lifted %3 = "tf.PartitionedCall"(%1, %0, %1) {f = @callee, config = "", config_proto = "", executor_type = ""} @@ -569,7 +569,7 @@ func @launch_with_partitioned_call() -> tensor { %5 = "tf.AddV2"(%3, %4) : (tensor, tensor) -> tensor // CHECK: tf_device.return %[[ADD]] : tensor tf_device.return %5 : tensor - }) {device = "tpu0", launch_attr = "launch_attr"} : () -> tensor + }) {cluster_attr = "cluster_attr"} : () -> tensor return %2 : tensor } // CHECK: @callee(%[[OA0:.*]]: tensor, %[[OA1:.*]]: tensor<*x!tf.resource>>, %[[OA2:.*]]: tensor) -> tensor @@ -592,8 +592,8 @@ func @callee(%arg0: tensor, %arg1: tensor<*x!tf.resource>>, %ar // sharing the same callee. The lifting should clone the callee then modify the // clone. -// CHECK-LABEL: @launch_with_stateful_partitioned_call -func @launch_with_stateful_partitioned_call() -> () { +// CHECK-LABEL: @cluster_with_stateful_partitioned_call +func @cluster_with_stateful_partitioned_call() -> () { // CHECK: %[[VH0:.*]] = "tf.VarHandleOp"() %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource>> // CHECK: %[[VH1:.*]] = "tf.VarHandleOp"() @@ -602,8 +602,8 @@ func @launch_with_stateful_partitioned_call() -> () { %2 = "tf.Const"() {value = dense<10.0> : tensor} : () -> tensor // CHECK-DAG: %[[READ0:.*]] = "tf.ReadVariableOp"(%[[VH0]]) // CHECK-DAG: %[[READ1:.*]] = "tf.ReadVariableOp"(%[[VH1]]) - // CHECK: %[[LAUNCH:.*]] = "tf_device.launch"() - "tf_device.launch"() ( { + // CHECK: %[[CLUSTER:.*]] = "tf_device.cluster"() + "tf_device.cluster"() ( { // CHECK: %[[PC0:.*]] = "tf.StatefulPartitionedCall"(%[[READ0]], %[[READ1]], %[[CONST]]) // CHECK-SAME: f = @callee_resource_lifted %3 = "tf.StatefulPartitionedCall"(%0, %1, %2) {f = @callee, config = "", config_proto = "", executor_type = ""} @@ -614,9 +614,9 @@ func @launch_with_stateful_partitioned_call() -> () { : (tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, tensor) -> tensor<*x!tf.resource>> // CHECK: tf_device.return %[[PC1]] : tensor tf_device.return - // CHECK: {device = "tpu0", launch_attr = "launch_attr"} : () -> tensor - }) {device = "tpu0", launch_attr = "launch_attr"} : () -> () - // CHECK: "tf.AssignVariableOp"(%[[VH0]], %[[LAUNCH]]) + // CHECK: {cluster_attr = "cluster_attr"} : () -> tensor + }) {cluster_attr = "cluster_attr"} : () -> () + // CHECK: "tf.AssignVariableOp"(%[[VH0]], %[[CLUSTER]]) return } // CHECK: @callee(%[[OA0:.*]]: tensor<*x!tf.resource>>, %[[OA1:.*]]: tensor<*x!tf.resource>>, %[[OA2:.*]]: tensor) -> tensor<*x!tf.resource>> @@ -637,17 +637,17 @@ func @callee(%arg0: tensor<*x!tf.resource>>, %arg1: tensor<*x!tf.res // Tests that the pass reports error on called function that has resource output // which doesn't alias an input. 
-func @launch_with_stateful_partitioned_call() -> () { +func @cluster_with_stateful_partitioned_call() -> () { %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource>> %1 = "tf.VarHandleOp"() {container = "c", shared_name = "v2"} : () -> tensor<*x!tf.resource>> %2 = "tf.Const"() {value = dense<10.0> : tensor} : () -> tensor - "tf_device.launch"() ( { + "tf_device.cluster"() ( { %3 = "tf.StatefulPartitionedCall"(%0, %1, %2) {f = @callee, config = "", config_proto = "", executor_type = ""} : (tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, tensor) -> tensor<*x!tf.resource>> %4 = "tf.StatefulPartitionedCall"(%3, %1, %2) {f = @callee, config = "", config_proto = "", executor_type = ""} : (tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, tensor) -> tensor<*x!tf.resource>> tf_device.return - }) {device = "tpu0", launch_attr = "launch_attr"} : () -> () + }) {cluster_attr = "cluster_attr"} : () -> () return } // expected-error @+1 {{unsupported function call: resource return value does not alias an input.}} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir b/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir index 73e318f9c50..160bba94cfc 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir @@ -1,4 +1,4 @@ -// RUN: tf-opt %s -tf-shape-inference -verify-diagnostics | FileCheck %s -dump-input=fail -color +// RUN: tf-opt %s -tf-shape-inference -verify-diagnostics | FileCheck %s -dump-input=fail module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 130 : i32}} { // CHECK-LABEL: func @main(%arg0: tensor<1xi32>, %arg1: tensor<1xi32>) -> tensor<1xi32> @@ -71,6 +71,15 @@ func @multiple_blocks_one_return(%arg0: tensor) -> tensor<*xf32> { return %1 : tensor } +// Tests where tf.Const's value needs to be refined. + + func @const_refine() -> tensor<*xi32> { + %0 = "tf.Const"() {value = dense<[3, 2]> : tensor<2xi32>} : () -> tensor<*xi32> + // CHECK: "tf.Const" + // CHECK-SAME: -> tensor<2xi32> + return %0 : tensor<*xi32> + } + // Tests the case where an op's shape function returns non-fully-defined shapes. 
// CHECK-LABEL: func @op_non_fully_defined_shape_fn @@ -92,7 +101,7 @@ func @multiple_blocks_one_return(%arg0: tensor) -> tensor<*xf32> { // CHECK-LABEL: func @shape_from_if_to_branch_functions func @shape_from_if_to_branch_functions(%arg0: tensor, %arg1: tensor<1x2x3xf32>) -> tensor<1x2x3xf32> { - %0 = "tf.If"(%arg0, %arg1) {Tcond = i1, Tin = ["tfdtype$DT_FLOAT"], Tout = ["tfdtype$DT_FLOAT"], _xla_propagate_compile_time_consts = true, device = "", else_branch = @if_else_branch, is_stateless = true, name = "if", output_shapes = ["tfshape$"], then_branch = @if_then_branch} : (tensor, tensor<1x2x3xf32>) -> tensor<1x2x3xf32> + %0 = "tf.If"(%arg0, %arg1) {Tcond = i1, Tin = ["tfdtype$DT_FLOAT"], Tout = ["tfdtype$DT_FLOAT"], _xla_propagate_compile_time_consts = true, device = "", else_branch = @if_else_branch, is_stateless = true, name = "if", output_shapes = [#tf.shape<>], then_branch = @if_then_branch} : (tensor, tensor<1x2x3xf32>) -> tensor<1x2x3xf32> return %0 : tensor<1x2x3xf32> } @@ -175,9 +184,9 @@ func @multiple_blocks_one_return(%arg0: tensor) -> tensor<*xf32> { // CHECK-LABEL: func @invalid_function_reused_by_control_flows func @invalid_function_reused_by_control_flows(%arg0: tensor, %arg1: tensor<1x2x3xf32>) -> tensor<1x2x3xf32> { // expected-warning @+1 {{unable to refine shape}} - %0 = "tf.If"(%arg0, %arg1) {Tcond = i1, Tin = ["tfdtype$DT_FLOAT"], Tout = ["tfdtype$DT_FLOAT"], _xla_propagate_compile_time_consts = true, device = "", else_branch = @reused_if_else_branch, is_stateless = true, name = "if", output_shapes = ["tfshape$"], then_branch = @reused_if_then_branch} : (tensor, tensor<1x2x3xf32>) -> tensor<1x2x3xf32> + %0 = "tf.If"(%arg0, %arg1) {Tcond = i1, Tin = ["tfdtype$DT_FLOAT"], Tout = ["tfdtype$DT_FLOAT"], _xla_propagate_compile_time_consts = true, device = "", else_branch = @reused_if_else_branch, is_stateless = true, name = "if", output_shapes = [#tf.shape<>], then_branch = @reused_if_then_branch} : (tensor, tensor<1x2x3xf32>) -> tensor<1x2x3xf32> // expected-warning @+1 {{unable to refine shape}} - %1 = "tf.If"(%arg0, %0) {Tcond = i1, Tin = ["tfdtype$DT_FLOAT"], Tout = ["tfdtype$DT_FLOAT"], _xla_propagate_compile_time_consts = true, device = "", else_branch = @reused_if_else_branch, is_stateless = true, name = "if", output_shapes = ["tfshape$"], then_branch = @reused_if_then_branch} : (tensor, tensor<1x2x3xf32>) -> tensor<1x2x3xf32> + %1 = "tf.If"(%arg0, %0) {Tcond = i1, Tin = ["tfdtype$DT_FLOAT"], Tout = ["tfdtype$DT_FLOAT"], _xla_propagate_compile_time_consts = true, device = "", else_branch = @reused_if_else_branch, is_stateless = true, name = "if", output_shapes = [#tf.shape<>], then_branch = @reused_if_then_branch} : (tensor, tensor<1x2x3xf32>) -> tensor<1x2x3xf32> return %0 : tensor<1x2x3xf32> } @@ -282,6 +291,15 @@ func @multiple_blocks_one_return(%arg0: tensor) -> tensor<*xf32> { return %0 : tensor } + // Tests that tensor_cast result shapes are refined. 
+ // CHECK-LABEL: func @tensor_cast_refine + func @tensor_cast_refine(%arg0: tensor<4xi32>) -> (tensor<*xi32>) { + // CHECK: tensor_cast + // CHECK-SAME: tensor<4xi32> to tensor<4xi32> + %0 = tensor_cast %arg0 : tensor<4xi32> to tensor<*xi32> + return %0 : tensor<*xi32> + } + // CHECK-LABEL: func @fold_cast func @fold_cast(%arg0: tensor<*xf32>) -> tensor<*xf32> { // CHECK-NOT: Cast @@ -331,4 +349,65 @@ func @multiple_blocks_one_return(%arg0: tensor) -> tensor<*xf32> { func @stateful_partitioned_call_func(%arg0: tensor) -> (tensor) { return %arg0 : tensor } + + // Test propagation involving const values across caller and callee. + func @partitioned_call_const(%arg0 : tensor<6xf32>) -> tensor<*xf32> { + %0 = "tf.Const"() {value = dense<[3, 2]> : tensor<2xi32>} : () -> tensor<2xi32> + %1 = "tf.PartitionedCall"(%0) {config = "", config_proto = "", executor_type = "", f = @partitioned_call_func_const} : (tensor<2xi32>) -> (tensor<2xi32>) + // CHECK: "tf.Reshape" + // CHECK-SAME: tensor<3x2xf32> + %2 = "tf.Reshape"(%arg0, %1) : (tensor<6xf32>, tensor<2xi32>) -> tensor<*xf32> + return %2 : tensor<*xf32> + } + + // CHECK-LABEL: func @partitioned_call_func_const + func @partitioned_call_func_const(%arg0: tensor<2xi32>) -> tensor<2xi32> { + // CHECK: %[[CONST:.*]] = "tf.Const"() {value = dense<[3, 2]> : tensor<2xi32>} : () -> tensor<2xi32> + // CHECK: return %[[CONST]] + return %arg0 : tensor<2xi32> + } + + // CHECK-LABEL: func @tensor_list_refine + func @tensor_list_refine() { + tf_executor.graph { + %control = tf_executor.island { + %0 = "tf.Const"() {device = "", value = dense<2> : tensor<2xi32>} : () -> tensor<2xi32> + %1 = "tf.Const"() {device = "", value = dense<3> : tensor} : () -> tensor + // CHECK: TensorListReserve{{.*}}-> tensor>> + %2 = "tf.TensorListReserve"(%0, %1) {device = ""} : (tensor<2xi32>, tensor) -> tensor>> + // CHECK: TensorListReserve{{.*}}-> tensor>> + %3 = "tf.TensorListReserve"(%0, %1) {device = ""} : (tensor<2xi32>, tensor) -> tensor>> + %4 = "tf.Const"() {device = "", value = dense<0> : tensor} : () -> tensor + %5 = "tf.Const"() {device = "", value = dense<[[1.000000e+00, 2.000000e+00], [3.000000e+00, 4.000000e+00]]> : tensor<2x2xf32>} : () -> tensor<2x2xf32> + // CHECK: tf.TensorListSetItem{{.*}}: (tensor>>, tensor, tensor<2x2xf32>) -> tensor>> + %6 = "tf.TensorListSetItem"(%3, %4, %5) {device = ""} : (tensor>>, tensor, tensor<2x2xf32>)-> tensor<*x!tf.variant> + %7 = "tf.Const"() {device = "", value = dense<-1> : tensor} : () -> tensor + // CHECK: tf.TensorListStack{{.*}}: (tensor>>, tensor) -> tensor + %8 = "tf.TensorListStack"(%6, %7) {device = "", num_elements = -1 : i64} : (tensor<*x!tf.variant>, tensor) -> tensor<*xf32> + tf_executor.yield + } + tf_executor.fetch + } + return + } + + // CHECK-LABEL: dont_update_for_ref + func @dont_update_for_ref() -> () { + // CHECK: () -> tensor<4x!tf.f32ref> + %11 = "tf.VariableV2"() {container = "", device = "", shape = #tf.shape<4>, shared_name = ""} : () -> tensor<4x!tf.f32ref> + // CHECK: (tensor<4x!tf.f32ref>) -> tensor<4xf32> + %12 = "tf.Identity"(%11) {device = ""} : (tensor<4x!tf.f32ref>) -> tensor<4xf32> + // CHECK: (tensor<4xf32>) -> tensor<4xf32> + %13 = "tf.Neg"(%12) {device = ""} : (tensor<4xf32>) -> tensor<4xf32> + return + } + + // CHECK-LABEL: operand_as_shape + func @operand_as_shape(%18: tensor, %39: tensor<1x4x4x32xf32>) -> () { + %cst_5 = constant dense<512> : tensor + %19 = "tf.Pack"(%18, %cst_5) {N = 2 : i64, T = i32, axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> + // CHECK: -> 
tensor<1x512xf32> + %40 = "tf.Reshape"(%39, %19) {T = f32, Tshape = i32, device = ""} : (tensor<1x4x4x32xf32>, tensor<2xi32>) -> tensor + return + } } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/sink_constant.mlir b/tensorflow/compiler/mlir/tensorflow/tests/sink_constant.mlir index 282fa4953a5..b9c6e242e70 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/sink_constant.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/sink_constant.mlir @@ -2,7 +2,7 @@ // CHECK-LABEL: func @sink_const func @sink_const(%arg0 : tensor<16xf32>) -> (tensor<16xf32>, tensor) { - // Verify that the constant are sunk in the tf_device.launch region using them + // Verify that the constant are sunk in the tf_device.cluster region using them // and removed if no other use is left. // Only the 2.0 and 3.0 constants are removed, the 4.0 has a use in the return @@ -13,11 +13,11 @@ func @sink_const(%arg0 : tensor<16xf32>) -> (tensor<16xf32>, tensor) { %2 = "tf.Const"() {value = dense<4.000000e+00> : tensor} : () -> tensor %3 = tf_executor.graph { %res, %ctl = tf_executor.island { - %3 = "tf_device.launch"() ({ + %3 = "tf_device.cluster"() ({ // In the device region, check that the 3 constants are materialized and // remapped to the uses. - // CHECK: tf_device.launch + // CHECK: tf_device.cluster // CHECK-DAG: %[[CST2:.*]] = "tf.Const"{{.*}}2.0 // CHECK-DAG: %[[CST3:.*]] = "tf.Const"{{.*}}3.0 // CHECK-DAG: %[[CST4:.*]] = "tf.Const"{{.*}}4.0 @@ -31,7 +31,7 @@ func @sink_const(%arg0 : tensor<16xf32>) -> (tensor<16xf32>, tensor) { %5 = "tf.Mul"(%4, %1) : (tensor<16xf32>, tensor) -> tensor<16xf32> %6 = "tf.Mul"(%5, %2) : (tensor<16xf32>, tensor) -> tensor<16xf32> tf_device.return %6 : tensor<16xf32> - }) {device = "tpu0"} : () -> tensor<16xf32> + }) {} : () -> tensor<16xf32> tf_executor.yield %3 : tensor<16xf32> } tf_executor.fetch %res : tensor<16xf32> diff --git a/tensorflow/compiler/mlir/tensorflow/tests/stack_ops_decomposition.mlir b/tensorflow/compiler/mlir/tensorflow/tests/stack_ops_decomposition.mlir index e8c5bb59663..26801e57698 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/stack_ops_decomposition.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/stack_ops_decomposition.mlir @@ -185,7 +185,7 @@ func @main(%arg0: tensor) -> () { } // CHECK: func @callee(%[[AARG0:.*]]: tensor, %[[AARG1:.*]]: tensor) -> tensor -func @callee(%arg0: tensor, %arg1: tensor) -> tensor { +func @callee(%arg0: tensor, %arg1: tensor) -> tensor attributes {sym_visibility = "public"} { %elem = "tf._SomeOp"(%arg1) : (tensor) -> tensor // CHECK: tf.StackPushV2" %push = "tf.StackPushV2"(%arg0, %elem) {swap_memory = false} : (tensor, tensor) -> tensor @@ -201,6 +201,62 @@ func @callee(%arg0: tensor, %arg1: tensor) -> tensor) -> () { + %max_size = "tf.Const"() {value = dense<10> : tensor} : () -> tensor + // CHECK-NOT: tf.Stack + %stack = "tf.StackV2"(%max_size) {elem_type = f32, stack_name = "s"} : (tensor) -> tensor + // CHECK: "tf.StatefulPartitionedCall" + // CHECK-SAME: f = @callee + %call = "tf.StatefulPartitionedCall"(%stack, %arg0) {f = @callee, config = "", config_proto = "", executor_type = ""} + : (tensor, tensor) -> tensor + // CHECK: "tf.PartitionedCall" + // CHECK-SAME: f = @callee + %call2 = "tf.PartitionedCall"(%stack, %arg0) {f = @callee, config = "", config_proto = "", executor_type = ""} + : (tensor, tensor) -> tensor + // CHECK: "tf.Slice" + %pop = "tf.StackPopV2"(%call) : (tensor) -> tensor + // CHECK-NOT: tf.Stack + "tf.StackCloseV2"(%stack) : (tensor) -> () + // CHECK: return + return +} + +// 
CHECK: func @callee(%[[ARG0:.*]]: tensor>>, %[[ARG1:.*]]: tensor, %[[ARG2:.*]]: tensor>>) +func @callee(%arg0: tensor, %arg1: tensor) -> tensor attributes {sym_visibility = "private"} { + %elem = "tf._SomeOp"(%arg1) : (tensor) -> tensor + // CHECK-NOT: "tf.StackPushV2" + // CHECK: %[[UPDATE:.*]] = "tf.XlaDynamicUpdateSlice" + // CHECK: "tf.AssignVariableOp"(%[[TARG0:.*]], %[[UPDATE]]) + // CHECK: "tf.AssignVariableOp"(%[[EARG1:.*]], + // CHECK-NOT: "tf.StackPushV2" + %push = "tf.StackPushV2"(%arg0, %elem) {swap_memory = false} : (tensor, tensor) -> tensor + return %arg0 : tensor +} + +// ----- + +// Tests PartitionedCall op with no signature change on callee. + +// CHECK-LABEL: func @main +func @main() -> () { + "tf.PartitionedCall"() {f = @callee, config = "", config_proto = "", executor_type = ""} : () -> () + return +} +// CHECK: func @callee() +func @callee() -> () attributes {sym_visibility = "public"} { + %max_size = "tf.Const"() {value = dense<10> : tensor} : () -> tensor + // CHECK-NOT: tf.Stack + %stack = "tf.StackV2"(%max_size) {elem_type = f32, stack_name = "s"} : (tensor) -> tensor + %elem = "tf._SomeOp"() : () -> tensor + %push = "tf.StackPushV2"(%stack, %elem) {swap_memory = false} : (tensor, tensor) -> tensor + return +} + +// ----- + // Tests that the pass reports error on unknown stack size. func @main(%arg0: tensor) -> tensor<2xi32> { diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tensor_array_ops_decomposition.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tensor_array_ops_decomposition.mlir index 1a13338b0ba..18b250c92a4 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tensor_array_ops_decomposition.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tensor_array_ops_decomposition.mlir @@ -9,7 +9,7 @@ func @main() -> tensor<3xf32> { // CHECK-SAME: -> tensor<5x3xf32> // CHECK: %[[VAR:.*]] = "tf.MlirLocalVarOp"() : () -> tensor>> // CHECK: "tf.AssignVariableOp"(%[[VAR]], %[[BUFFER]]) - %ta:2 = "tf.TensorArrayV3"(%size) {dtype = f32, element_shape = "tfshape$dim { size: 3 }", dynamic_size = false, clear_after_read = true, identical_element_shapes = true, tensor_array_name = "ta"} : (tensor) -> (tensor, tensor) + %ta:2 = "tf.TensorArrayV3"(%size) {dtype = f32, element_shape = #tf.shape<3>, dynamic_size = false, clear_after_read = true, identical_element_shapes = true, tensor_array_name = "ta"} : (tensor) -> (tensor, tensor) // CHECK: %[[IND:.*]] = "tf.Const"() {value = dense<1> : tensor} : () -> tensor %index = "tf.Const"() {value = dense<1> : tensor} : () -> tensor // CHECK: %[[VAL:.*]] = "tf.Const"() {value = dense<[1.000000e+00, 2.000000e+00, 3.000000e+00]> : tensor<3xf32>} : () -> tensor<3xf32> @@ -42,7 +42,7 @@ func @main() -> tensor { // CHECK-SAME: -> tensor<5x3xf32> // CHECK: %[[VAR:.*]] = "tf.MlirLocalVarOp"() : () -> tensor>> // CHECK: "tf.AssignVariableOp"(%[[VAR]], %[[BUFFER]]) - %ta:2 = "tf.TensorArrayV3"(%size) {dtype = f32, element_shape = "tfshape$unknown_rank: true", dynamic_size = false, clear_after_read = true, identical_element_shapes = true, tensor_array_name = "ta"} : (tensor) -> (tensor, tensor) + %ta:2 = "tf.TensorArrayV3"(%size) {dtype = f32, element_shape = #tf.shape<*>, dynamic_size = false, clear_after_read = true, identical_element_shapes = true, tensor_array_name = "ta"} : (tensor) -> (tensor, tensor) %index = "tf.Const"() {value = dense<1> : tensor} : () -> tensor %value = "tf.Const"() {value = dense<[1.0, 2.0, 3.0]> : tensor<3xf32>} : () -> tensor<3xf32> %write = "tf.TensorArrayWriteV3"(%ta#0, %index, %value, %ta#1) : 
(tensor, tensor, tensor<3xf32>, tensor) -> tensor @@ -61,18 +61,18 @@ func @main() -> () { %size = "tf.Const"() {value = dense<5> : tensor} : () -> tensor // CHECK: %[[VAR:.*]] = "tf.MlirLocalVarOp"() : () -> tensor>> // CHECK: "tf.AssignVariableOp" - %ta:2 = "tf.TensorArrayV3"(%size) {dtype = f32, element_shape = "tfshape$dim { size: 3 }", dynamic_size = false, clear_after_read = true, identical_element_shapes = true, tensor_array_name = "ta"} : (tensor) -> (tensor, tensor) + %ta:2 = "tf.TensorArrayV3"(%size) {dtype = f32, element_shape = #tf.shape<3>, dynamic_size = false, clear_after_read = true, identical_element_shapes = true, tensor_array_name = "ta"} : (tensor) -> (tensor, tensor) // CHECK: %[[READ:.*]] = "tf.ReadVariableOp"(%[[VAR]]) : (tensor>>) -> tensor<5x3xf32> // CHECK: %[[CONCAT_RESHAPE:.*]] = "tf.Reshape"(%[[READ]], // CHECK-SAME: -> tensor<15xf32> // CHECK: %[[LENS:.*]] = "tf.Const"() {value = dense<3> : tensor<5xi64>} : () -> tensor<5xi64> - %concat:2 = "tf.TensorArrayConcatV3"(%ta#0, %ta#1) {element_shape_except0 = "tfshape$unknown_rank: true"} : (tensor, tensor) -> (tensor<*xf32>, tensor<*xi64>) + %concat:2 = "tf.TensorArrayConcatV3"(%ta#0, %ta#1) {element_shape_except0 = #tf.shape<*>} : (tensor, tensor) -> (tensor<*xf32>, tensor<*xi64>) // CHECK: %[[SPLIT_RESHAPE:.*]] = "tf.Reshape"(%[[CONCAT_RESHAPE]], // CHECK-SAME: -> tensor<5x3xf32> // CHECK: %[[READ2:.*]] = "tf.ReadVariableOp"(%[[VAR]]) : (tensor>>) -> tensor<5x3xf32> // CHECK: %[[ADD:.*]] = "tf.AddV2"(%[[READ2]], %[[SPLIT_RESHAPE]]) // CHECK: "tf.AssignVariableOp"(%[[VAR]], %[[ADD]]) - %split = "tf.TensorArraySplitV3"(%ta#0, %concat#0, %concat#1, %ta#1) {element_shape_except0 = "tfshape$unknown_rank: true"} : (tensor, tensor<*xf32>, tensor<*xi64>, tensor) -> tensor + %split = "tf.TensorArraySplitV3"(%ta#0, %concat#0, %concat#1, %ta#1) {element_shape_except0 = #tf.shape<*>} : (tensor, tensor<*xf32>, tensor<*xi64>, tensor) -> tensor return } @@ -85,16 +85,16 @@ func @main() -> () { %size = "tf.Const"() {value = dense<5> : tensor} : () -> tensor // CHECK: %[[VAR:.*]] = "tf.MlirLocalVarOp"() : () -> tensor>> // CHECK: "tf.AssignVariableOp" - %ta:2 = "tf.TensorArrayV3"(%size) {dtype = f32, element_shape = "tfshape$dim { size: 3 }", dynamic_size = false, clear_after_read = true, identical_element_shapes = true, tensor_array_name = "ta"} : (tensor) -> (tensor, tensor) + %ta:2 = "tf.TensorArrayV3"(%size) {dtype = f32, element_shape = #tf.shape<3>, dynamic_size = false, clear_after_read = true, identical_element_shapes = true, tensor_array_name = "ta"} : (tensor) -> (tensor, tensor) %indices = "tf.Const"() {value = dense<[0, 1, 2, 3, 4]> : tensor<5xi32>} : () -> tensor<5xi32> // CHECK: %[[READ:.*]] = "tf.ReadVariableOp"(%[[VAR]]) : (tensor>>) -> tensor<5x3xf32> // CHECK: %[[GATHER_SLICE:.*]] = "tf.Slice"(%[[READ]] // CHECK-SAME: (tensor<5x3xf32>, tensor<2xi32>, tensor<2xi32>) -> tensor<5x3xf32> - %gather = "tf.TensorArrayGatherV3"(%ta#0, %indices, %ta#1) {element_shape = "tfshape$unknown_rank: true"} : (tensor, tensor<5xi32>, tensor) -> tensor<*xf32> + %gather = "tf.TensorArrayGatherV3"(%ta#0, %indices, %ta#1) {element_shape = #tf.shape<*>} : (tensor, tensor<5xi32>, tensor) -> tensor<*xf32> // CHECK: %[[READ2:.*]] = "tf.ReadVariableOp"(%[[VAR]]) : (tensor>>) -> tensor<5x3xf32> // CHECK: %[[ADD:.*]] = "tf.AddV2"(%[[READ2]], %[[GATHER_SLICE]]) // CHECK: "tf.AssignVariableOp"(%[[VAR]], %[[ADD]]) - %scatter = "tf.TensorArrayScatterV3"(%ta#0, %indices, %gather, %ta#1) {element_shape_except0 = "tfshape$unknown_rank: true"} : 
(tensor, tensor<5xi32>, tensor<*xf32>, tensor) -> tensor + %scatter = "tf.TensorArrayScatterV3"(%ta#0, %indices, %gather, %ta#1) {element_shape_except0 = #tf.shape<*>} : (tensor, tensor<5xi32>, tensor<*xf32>, tensor) -> tensor return } @@ -107,13 +107,13 @@ func @main() -> () { %size = "tf.Const"() {value = dense<5> : tensor} : () -> tensor // CHECK: %[[VAR:.*]] = "tf.MlirLocalVarOp"() : () -> tensor>> // CHECK: "tf.AssignVariableOp" - %ta:2 = "tf.TensorArrayV3"(%size) {dtype = f32, element_shape = "tfshape$dim { size: 3 }", dynamic_size = false, clear_after_read = true, identical_element_shapes = true, tensor_array_name = "ta"} : (tensor) -> (tensor, tensor) + %ta:2 = "tf.TensorArrayV3"(%size) {dtype = f32, element_shape = #tf.shape<3>, dynamic_size = false, clear_after_read = true, identical_element_shapes = true, tensor_array_name = "ta"} : (tensor) -> (tensor, tensor) // CHECK: %[[INDS:.*]] = "tf.Const"() {value = dense<[2, 1]> : tensor<2xi32>} : () -> tensor<2xi32> %indices = "tf.Const"() {value = dense<[2, 1]> : tensor<2xi32>} : () -> tensor<2xi32> // CHECK: %[[READ:.*]] = "tf.ReadVariableOp"(%[[VAR]]) : (tensor>>) -> tensor<5x3xf32> // CHECK: %[[AXIS:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor // CHECK: %[[GATHER:.*]] = "tf.GatherV2"(%[[READ]], %[[INDS]], %[[AXIS]]) : (tensor<5x3xf32>, tensor<2xi32>, tensor) -> tensor<2x3xf32> - %gather = "tf.TensorArrayGatherV3"(%ta#0, %indices, %ta#1) {element_shape = "tfshape$unknown_rank: true"} : (tensor, tensor<2xi32>, tensor) -> tensor<*xf32> + %gather = "tf.TensorArrayGatherV3"(%ta#0, %indices, %ta#1) {element_shape = #tf.shape<*>} : (tensor, tensor<2xi32>, tensor) -> tensor<*xf32> // CHECK: %[[READ2:.*]] = "tf.ReadVariableOp"(%[[VAR]]) : (tensor>>) -> tensor<5x3xf32> // CHECK: %[[SLICE_SIZE:.*]] = "tf.Const"() {value = dense<[1, 3]> : tensor<2xi32>} : () -> tensor<2xi32> // CHECK: %[[IND_SLICE0_START:.*]] = "tf.Const"() {value = dense<0> : tensor<1xi32>} : () -> tensor<1xi32> @@ -140,7 +140,7 @@ func @main() -> () { // CHECK: %[[UPDATE1:.*]] = "tf.XlaDynamicUpdateSlice"(%[[UPDATE0]], %[[ADD1]] // CHECK-SAME: (tensor<5x3xf32>, tensor<1x3xf32>, tensor<2xi32>) -> tensor<5x3xf32> // CHECK: "tf.AssignVariableOp"(%[[VAR]], %[[UPDATE1]]) - %scatter = "tf.TensorArrayScatterV3"(%ta#0, %indices, %gather, %ta#1) {element_shape_except0 = "tfshape$unknown_rank: true"} : (tensor, tensor<2xi32>, tensor<*xf32>, tensor) -> tensor + %scatter = "tf.TensorArrayScatterV3"(%ta#0, %indices, %gather, %ta#1) {element_shape_except0 = #tf.shape<*>} : (tensor, tensor<2xi32>, tensor<*xf32>, tensor) -> tensor return } @@ -153,7 +153,7 @@ func @main() { %size = "tf.Const"() {value = dense<5> : tensor} : () -> tensor // CHECK: %[[VAR:.*]] = "tf.MlirLocalVarOp"() : () -> tensor>> // CHECK: "tf.AssignVariableOp"(%[[VAR]], - %ta:2 = "tf.TensorArrayV3"(%size) {dtype = f32, element_shape = "tfshape$dim { size: 3 }", dynamic_size = false, clear_after_read = true, identical_element_shapes = true, tensor_array_name = "ta"} : (tensor) -> (tensor, tensor) + %ta:2 = "tf.TensorArrayV3"(%size) {dtype = f32, element_shape = #tf.shape<3>, dynamic_size = false, clear_after_read = true, identical_element_shapes = true, tensor_array_name = "ta"} : (tensor) -> (tensor, tensor) %index = "tf.Const"() {value = dense<1> : tensor} : () -> tensor // CHECK: %[[VALUE:.*]] = "tf.Const"() {value = dense<[1.000000e+00, 2.000000e+00, 3.000000e+00]> : tensor<3xf32>} : () -> tensor<3xf32> %value = "tf.Const"() {value = dense<[1.0, 2.0, 3.0]> : tensor<3xf32>} : () -> tensor<3xf32> @@ 
-200,7 +200,7 @@ func @main() -> () { %index = "tf.Const"() {value = dense<1> : tensor} : () -> tensor // CHECK: %[[VAR:.*]] = "tf.MlirLocalVarOp"() : () -> tensor>> // CHECK: %[[GVAR:.*]] = "tf.MlirLocalVarOp"() : () -> tensor>> - %ta:2 = "tf.TensorArrayV3"(%size) {dtype = f32, element_shape = "tfshape$dim { size: 3 }", dynamic_size = false, clear_after_read = true, identical_element_shapes = true, tensor_array_name = "ta"} : (tensor) -> (tensor, tensor) + %ta:2 = "tf.TensorArrayV3"(%size) {dtype = f32, element_shape = #tf.shape<3>, dynamic_size = false, clear_after_read = true, identical_element_shapes = true, tensor_array_name = "ta"} : (tensor) -> (tensor, tensor) // CHECK: "tf.While"(%[[VAR]], %[[SIZE]], %[[GVAR]]) %1:2 = "tf.While"(%ta#0, %size) { body = @while_body, cond = @while_cond, device = "", is_stateless = false} @@ -247,7 +247,7 @@ func @main() -> () { %size = "tf.Const"() {value = dense<5> : tensor} : () -> tensor %index = "tf.Const"() {value = dense<1> : tensor} : () -> tensor // CHECK: %[[VAR:.*]] = "tf.MlirLocalVarOp"() : () -> tensor>> - %ta:2 = "tf.TensorArrayV3"(%size) {dtype = f32, element_shape = "tfshape$dim { size: 3 }", dynamic_size = false, clear_after_read = true, identical_element_shapes = true, tensor_array_name = "ta"} : (tensor) -> (tensor, tensor) + %ta:2 = "tf.TensorArrayV3"(%size) {dtype = f32, element_shape = #tf.shape<3>, dynamic_size = false, clear_after_read = true, identical_element_shapes = true, tensor_array_name = "ta"} : (tensor) -> (tensor, tensor) // CHECK: %[[COND:.*]] = "tf._SomeOp"() : () -> tensor %cond = "tf._SomeOp"() : () -> tensor // CHECK: %[[GVAR1:.*]] = "tf.MlirLocalVarOp"() : () -> tensor>> @@ -301,7 +301,7 @@ func @main() -> () { %size = "tf.Const"() {value = dense<5> : tensor} : () -> tensor %index = "tf.Const"() {value = dense<1> : tensor} : () -> tensor // CHECK: %[[VAR:.*]] = "tf.MlirLocalVarOp"() : () -> tensor>> - %ta:2 = "tf.TensorArrayV3"(%size) {dtype = f32, element_shape = "tfshape$dim { size: 3 }", dynamic_size = false, clear_after_read = true, identical_element_shapes = true, tensor_array_name = "ta"} : (tensor) -> (tensor, tensor) + %ta:2 = "tf.TensorArrayV3"(%size) {dtype = f32, element_shape = #tf.shape<3>, dynamic_size = false, clear_after_read = true, identical_element_shapes = true, tensor_array_name = "ta"} : (tensor) -> (tensor, tensor) // CHECK: %[[COND:.*]] = "tf._SomeOp"() : () -> tensor %cond = "tf._SomeOp"() : () -> tensor // CHECK: %[[GVAR1:.*]] = "tf.MlirLocalVarOp"() : () -> tensor>> @@ -322,7 +322,7 @@ func @main() -> () { } // CHECK-LABEL: func @callee // CHECK-SAME: (%[[OCARG0:.*]]: tensor) -> tensor -func @callee(%arg0: tensor) -> tensor { +func @callee(%arg0: tensor) -> tensor attributes {sym_visibility = "public"} { %const1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor %elem = "tf._SomeOp"() : () -> tensor<3xf32> %flow = "tf.Const"() {value = dense<1.0> : tensor} : () -> tensor @@ -343,11 +343,80 @@ func @callee(%arg0: tensor) -> tensor { // ----- +// Tests (Stateful)PartitionedCall op with private callee function. 
+ +// CHECK-LABEL: func @main +func @main() -> () { + // CHECK: %[[SIZE:.*]] = "tf.Const"() {value = dense<5> : tensor} : () -> tensor + %size = "tf.Const"() {value = dense<5> : tensor} : () -> tensor + %index = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + // CHECK: %[[VAR:.*]] = "tf.MlirLocalVarOp"() : () -> tensor>> + %ta:2 = "tf.TensorArrayV3"(%size) {dtype = f32, element_shape = #tf.shape<3>, dynamic_size = false, clear_after_read = true, identical_element_shapes = true, tensor_array_name = "ta"} : (tensor) -> (tensor, tensor) + // CHECK: %[[COND:.*]] = "tf._SomeOp"() : () -> tensor + %cond = "tf._SomeOp"() : () -> tensor + // CHECK: %[[GVAR1:.*]] = "tf.MlirLocalVarOp"() : () -> tensor>> + %grad:2 = "tf.TensorArrayGradV3"(%ta#0, %ta#1) {source = "a"} : (tensor, tensor) -> (tensor, tensor) + // CHECK: %[[GVAR2:.*]] = "tf.MlirLocalVarOp"() : () -> tensor>> + // CHECK: "tf.StatefulPartitionedCall"(%[[VAR]], %[[GVAR1]], %[[GVAR2]]) + // CHECK-SAME: f = @callee + %call = "tf.StatefulPartitionedCall"(%ta#0) {f = @callee, config = "", config_proto = "", executor_type = ""} + : (tensor) -> tensor + // CHECK: "tf.PartitionedCall"(%[[VAR]], %[[GVAR1]], %[[GVAR2]]) + // CHECK-SAME: f = @callee + %call2 = "tf.PartitionedCall"(%call) {f = @callee, config = "", config_proto = "", executor_type = ""} + : (tensor) -> tensor + // CHECK: %[[READ:.*]] = "tf.ReadVariableOp"(%[[VAR]]) : (tensor>>) -> tensor<5x3xf32> + // CHECK: "tf.Slice"(%[[READ]], + %read = "tf.TensorArrayReadV3"(%call2, %index, %ta#1) : (tensor, tensor, tensor) -> tensor<3xf32> + return +} +// CHECK: func @callee(%[[CARG0:.*]]: tensor>>, %[[CARG1:.*]]: tensor>>, %[[CARG2:.*]]: tensor>>) +func @callee(%arg0: tensor) -> tensor attributes {sym_visibility = "private"} { + // CHECK: %[[READ1:.*]] = "tf.ReadVariableOp"(%[[CARG1]]) : (tensor>>) -> tensor<5x3xf32> + // CHECK: %[[UPDATE1:.*]] = "tf.XlaDynamicUpdateSlice"(%[[READ1]], + // CHECK: "tf.AssignVariableOp"(%[[CARG1]], %[[UPDATE1]]) + // CHECK: %[[READ2:.*]] = "tf.ReadVariableOp"(%[[CARG2]]) : (tensor>>) -> tensor<5x3xf32> + // CHECK: %[[UPDATE2:.*]] = "tf.XlaDynamicUpdateSlice"(%[[READ2]], + // CHECK: "tf.AssignVariableOp"(%[[CARG2]], %[[UPDATE2]]) + %const1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %elem = "tf._SomeOp"() : () -> tensor<3xf32> + %flow = "tf.Const"() {value = dense<1.0> : tensor} : () -> tensor + %grad:2 = "tf.TensorArrayGradV3"(%arg0, %flow) {source = "a"} : (tensor, tensor) -> (tensor, tensor) + %gwrite = "tf.TensorArrayWriteV3"(%grad#0, %const1, %elem, %grad#1) : (tensor, tensor, tensor<3xf32>, tensor) -> tensor + %grad2:2 = "tf.TensorArrayGradV3"(%arg0, %flow) {source = "b"} : (tensor, tensor) -> (tensor, tensor) + %gwrite2 = "tf.TensorArrayWriteV3"(%grad2#0, %const1, %elem, %grad2#1) : (tensor, tensor, tensor<3xf32>, tensor) -> tensor + // CHECK: return %[[CARG0]] + return %arg0 : tensor +} + +// ----- + +// Tests PartitionedCall op with no signature change on callee. 
+ +// CHECK-LABEL: func @main +func @main() -> () { + %call = "tf.PartitionedCall"() {f = @callee, config = "", config_proto = "", executor_type = ""} : () -> tensor + return +} +// CHECK: func @callee() -> tensor +func @callee() -> tensor attributes {sym_visibility = "public"} { + %size = "tf.Const"() {value = dense<5> : tensor} : () -> tensor + // CHECK: "tf.MlirLocalVarOp"() : () -> tensor>> + // CHECK: "tf.AssignVariableOp" + %ta:2 = "tf.TensorArrayV3"(%size) {dtype = f32, element_shape = #tf.shape<>, dynamic_size = false, clear_after_read = true, identical_element_shapes = true, tensor_array_name = "ta"} : (tensor) -> (tensor, tensor) + // CHECK: %[[SIZE:.*]] = "tf.Const"() {value = dense<5> : tensor} : () -> tensor + %size_out = "tf.TensorArraySizeV3"(%ta#0, %ta#1) : (tensor, tensor) -> tensor + // CHECK: return %[[SIZE]] : tensor + return %size_out : tensor +} + +// ----- + // Test the pass reports failure on unknown size. func @main(%arg0: tensor) -> () { // expected-error @+1 {{unknown max element count}} - %ta:2 = "tf.TensorArrayV3"(%arg0) {dtype = f32, element_shape = "tfshape$dim { size: 3 }", dynamic_size = false, clear_after_read = true, identical_element_shapes = true, tensor_array_name = "ta"} : (tensor) -> (tensor, tensor) + %ta:2 = "tf.TensorArrayV3"(%arg0) {dtype = f32, element_shape = #tf.shape<3>, dynamic_size = false, clear_after_read = true, identical_element_shapes = true, tensor_array_name = "ta"} : (tensor) -> (tensor, tensor) return } @@ -358,7 +427,7 @@ func @main(%arg0: tensor) -> () { func @main(%arg0: tensor) -> () { %size = "tf.Const"() {value = dense<5> : tensor} : () -> tensor // expected-error @+1 {{unknown element shape}} - %ta:2 = "tf.TensorArrayV3"(%size) {dtype = f32, element_shape = "tfshape$unknown_rank: true", dynamic_size = false, clear_after_read = true, identical_element_shapes = true, tensor_array_name = "ta"} : (tensor) -> (tensor, tensor) + %ta:2 = "tf.TensorArrayV3"(%size) {dtype = f32, element_shape = #tf.shape<*>, dynamic_size = false, clear_after_read = true, identical_element_shapes = true, tensor_array_name = "ta"} : (tensor) -> (tensor, tensor) return } @@ -368,8 +437,8 @@ func @main(%arg0: tensor) -> () { func @main(%arg0: tensor) -> () { %size = "tf.Const"() {value = dense<10> : tensor} : () -> tensor - %ta0:2 = "tf.TensorArrayV3"(%size) {dtype = f32, element_shape = "tfshape$dim { size: 3 }", dynamic_size = false, clear_after_read = true, identical_element_shapes = true, tensor_array_name = "ta"} : (tensor) -> (tensor, tensor) - %ta1:2 = "tf.TensorArrayV3"(%size) {dtype = f32, element_shape = "tfshape$dim { size: 3 }", dynamic_size = false, clear_after_read = true, identical_element_shapes = true, tensor_array_name = "ta"} : (tensor) -> (tensor, tensor) + %ta0:2 = "tf.TensorArrayV3"(%size) {dtype = f32, element_shape = #tf.shape<3>, dynamic_size = false, clear_after_read = true, identical_element_shapes = true, tensor_array_name = "ta"} : (tensor) -> (tensor, tensor) + %ta1:2 = "tf.TensorArrayV3"(%size) {dtype = f32, element_shape = #tf.shape<3>, dynamic_size = false, clear_after_read = true, identical_element_shapes = true, tensor_array_name = "ta"} : (tensor) -> (tensor, tensor) %if_op = "tf.If"(%arg0, %ta0#0, %ta1#0) {then_branch = @if_then, else_branch = @if_else, is_stateless = false} : (tensor, tensor, tensor) -> tensor %index = "tf.Const"() {value = dense<1> : tensor} : () -> tensor diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tensor_list_ops_decomposition.mlir 
b/tensorflow/compiler/mlir/tensorflow/tests/tensor_list_ops_decomposition.mlir index 682da38fc56..7e9b85ffc04 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tensor_list_ops_decomposition.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tensor_list_ops_decomposition.mlir @@ -141,6 +141,25 @@ func @main(%arg0: tensor<10x8x9xf32>, %arg1: tensor<3xi32>) -> tensor<3x8x9xf32> // ----- +// Test scatter into existing tensor list. + +// CHECK-LABEL: func @main +// CHECK-SAME: (%[[ARG0:.*]]: tensor<10x8x9xf32>, %[[ARG1:.*]]: tensor<5xi32>, %[[ARG2:.*]]: tensor<5x8x9xf32>) -> tensor<10x8x9xf32> +func @main(%arg0: tensor<10x8x9xf32>, %arg1: tensor<5xi32>, %arg2: tensor<5x8x9xf32>) -> tensor<10x8x9xf32> { + %elem_shape = "tf.Const"() {value = dense<[8, 9]> : tensor<2xi32>} : () -> tensor<2xi32> + // CHECK: %[[BUFFER:.*]] = "tf.Identity"(%[[ARG0]]) : (tensor<10x8x9xf32>) -> tensor<10x8x9xf32> + %tl = "tf.TensorListFromTensor"(%arg0, %elem_shape) : (tensor<10x8x9xf32>, tensor<2xi32>) -> tensor>> + // CHECK: %[[IND_SHAPE:.*]] = "tf.Const"() {value = dense<[5, 1]> : tensor<2xi32>} : () -> tensor<2xi32> + // CHECK: %[[IND_RESHPE:.*]] = "tf.Reshape"(%[[ARG1]], %[[IND_SHAPE]]) : (tensor<5xi32>, tensor<2xi32>) -> tensor<5x1xi32> + // CHECK: %[[SC:.*]] = "tf.TensorScatterUpdate"(%[[BUFFER]], %[[IND_RESHPE]], %[[ARG2]]) : (tensor<10x8x9xf32>, tensor<5x1xi32>, tensor<5x8x9xf32>) -> tensor<10x8x9xf32> + %scatter = "tf.TensorListScatterIntoExistingList"(%tl, %arg2, %arg1) : (tensor>>, tensor<5x8x9xf32>, tensor<5xi32>) -> tensor>> + %stack = "tf.TensorListStack"(%scatter, %elem_shape) : (tensor>>, tensor<2xi32>) -> tensor<10x8x9xf32> + // CHECK: return %[[SC]] : tensor<10x8x9xf32> + return %stack : tensor<10x8x9xf32> +} + +// ----- + // Tests while loop. // CHECK-LABEL: func @main @@ -255,7 +274,7 @@ func @main(%arg0: tensor) -> () { } // CHECK: func @callee(%[[AARG0:.*]]: tensor>>, %[[AARG1:.*]]: tensor) -> tensor>> -func @callee(%arg0: tensor>>, %arg1: tensor) -> tensor>> { +func @callee(%arg0: tensor>>, %arg1: tensor) -> tensor>> attributes {sym_visibility = "public"} { %elem = "tf._SomeOp"(%arg1) : (tensor) -> tensor // CHECK: "tf.TensorListPushBack" %push = "tf.TensorListPushBack"(%arg0, %elem) : (tensor>>, tensor) -> tensor>> @@ -272,6 +291,66 @@ func @callee(%arg0: tensor>>, %arg1: tensor) -> tens // ----- +// Tests PartitionedCall/StatefulPartitionedCall with private callee function. 
+ +// CHECK-LABEL: func @main +func @main(%arg0: tensor) -> () { + %elem_shape = "tf.Const"() {value = dense<[]> : tensor<0xi32>} : () -> tensor<0xi32> + %max_size = "tf.Const"() {value = dense<10> : tensor} : () -> tensor + // CHECK-NOT: tf.EmptyTensorList + // CHECK: %[[INIT:.*]] = "tf.BroadcastTo" + %tl = "tf.EmptyTensorList"(%elem_shape, %max_size) : (tensor<0xi32>, tensor) -> tensor>> + // CHECK: "tf.StatefulPartitionedCall"(%[[INIT]], + // CHECK-SAME: f = @callee + %call = "tf.StatefulPartitionedCall"(%tl, %arg0) {f = @callee, config = "", config_proto = "", executor_type = ""} + : (tensor>>, tensor) -> tensor>> + // CHECK: %[[CALL2:.*]]:2 = "tf.PartitionedCall"(%[[INIT]], + // CHECK-SAME: f = @callee + %call2 = "tf.PartitionedCall"(%tl, %arg0) {f = @callee, config = "", config_proto = "", executor_type = ""} + : (tensor>>, tensor) -> tensor>> + // CHECK: %[[COPY:.*]] = "tf.Identity"(%[[CALL2]]#0) + // CHECK: "tf.Slice"(%[[COPY]], + %pop:2 = "tf.TensorListPopBack"(%call2, %elem_shape) : (tensor>>, tensor<0xi32>) -> (tensor>>, tensor) + // CHECK-NOT: tf.TensorListPopBack + // CHECK: return + return +} + +// CHECK: func @callee(%[[ARG0:.*]]: tensor<10xf32>, %[[ARG1:.*]]: tensor, %[[ARG2:.*]]: tensor<1xi32>) -> (tensor<10xf32>, tensor<1xi32>) +func @callee(%arg0: tensor>>, %arg1: tensor) -> tensor>> attributes {sym_visibility = "private"} { + %elem = "tf._SomeOp"(%arg1) : (tensor) -> tensor + + // CHECK-NOT: "tf.TensorListPushBack" + // CHECK: %[[UPDATE:.*]] = "tf.XlaDynamicUpdateSlice" + // CHECK: %[[CONST1:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK: %[[ADD:.*]] = "tf.AddV2"(%[[ARG2]], %[[CONST1]]) + // CHECK-NOT: "tf.TensorListPushBack" + %push = "tf.TensorListPushBack"(%arg0, %elem) : (tensor>>, tensor) -> tensor>> + // CHECK: return %[[UPDATE]], %[[ADD]] + return %push : tensor>> +} + +// ----- + +// Tests PartitionedCall op with no signature change on callee. + +// CHECK-LABEL: func @main +func @main() -> () { + "tf.PartitionedCall"() {f = @callee, config = "", config_proto = "", executor_type = ""} : () -> () + return +} +// CHECK: func @callee() +func @callee() -> () attributes {sym_visibility = "public"} { + %elem_shape = "tf.Const"() {value = dense<[]> : tensor<0xi32>} : () -> tensor<0xi32> + %max_size = "tf.Const"() {value = dense<10> : tensor} : () -> tensor + // CHECK-NOT: tf.EmptyTensorList + // CHECK: "tf.BroadcastTo" + %tl = "tf.EmptyTensorList"(%elem_shape, %max_size) : (tensor<0xi32>, tensor) -> tensor>> + return +} + +// ----- + // Tests that the pass reports error on unknown maximum size. 
func @main(%arg0: tensor) -> () { diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir index afe63678892..82e60a08e2e 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir @@ -82,7 +82,7 @@ func @testReverseV2(%arg0: tensor<2x4x3xui8>, %arg1: tensor<1xi32>) -> tensor<2x // ----- func @testIdentityWrongType(%arg0: tensor<4x2x!tf.string>) -> tensor<4x2x!tf.stringref> { - // expected-error @+1 {{requires all operands to be either same as or ref type of results}} + // expected-error @+1 {{all operands and results to have compatible element}} %0 = "tf.Identity"(%arg0) : (tensor<4x2x!tf.string>) -> tensor<4x2x!tf.stringref> return %0 : tensor<4x2x!tf.stringref> } @@ -725,10 +725,10 @@ func @testFusedBatchNormWrongMeanType(tensor<8x8x8x8xf32>, tensor<8xf32>, tensor // ----- // Test invalid tf.FusedBatchNorm -func @testFusedBatchNormWrongVarianceType(tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<*xf32>) -> tensor<8x8x8x8xf32> { -^bb0(%arg0: tensor<8x8x8x8xf32>, %arg1: tensor<8xf32>, %arg2: tensor<8xf32>, %arg3: tensor<8xf32>, %arg4: tensor<*xf32>): +func @testFusedBatchNormWrongVarianceType(tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<10x2xf32>) -> tensor<8x8x8x8xf32> { +^bb0(%arg0: tensor<8x8x8x8xf32>, %arg1: tensor<8xf32>, %arg2: tensor<8xf32>, %arg3: tensor<8xf32>, %arg4: tensor<10x2xf32>): // expected-error @+1 {{requires variance to be a 1D float tensor}} - %0:5 = "tf.FusedBatchNorm"(%arg0, %arg1, %arg2, %arg3, %arg4) {T = "tfdtype$DT_FLOAT", data_format = "NHWC", epsilon = 0.001 : f32, is_training = false} : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<*xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<*xf32>) + %0:5 = "tf.FusedBatchNorm"(%arg0, %arg1, %arg2, %arg3, %arg4) {T = "tfdtype$DT_FLOAT", data_format = "NHWC", epsilon = 0.001 : f32, is_training = false} : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<10x2xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<10x2xf32>) return %0#0 : tensor<8x8x8x8xf32> } @@ -881,20 +881,29 @@ func @testValidMatrixBandPartOpUnranked(%arg0: tensor<*xbf16>, %arg1: tensor, %arg1: tensor, %arg2: tensor) -> tensor<64x64xbf16> { - // expected-error @+1 {{op failed to verify that all of {input, band} have same type}} - %0 = "tf.MatrixBandPart"(%arg0, %arg1, %arg2) : (tensor<64x64x64xbf16>, tensor, tensor) -> tensor<64x64xbf16> - return %0 : tensor<64x64xbf16> +// Test valid tf.MatrixBandPart +// CHECK-LABEL: func @testValidMatrixBandPartOpUnrankedBand +func @testValidMatrixBandPartOpUnrankedBand(%arg0: tensor<64x64x64xbf16>, %arg1: tensor, %arg2: tensor) -> tensor<*xbf16> { + %0 = "tf.MatrixBandPart"(%arg0, %arg1, %arg2) : (tensor<64x64x64xbf16>, tensor, tensor) -> tensor<*xbf16> + return %0 : tensor<*xbf16> +} + +// ----- + +// Test valid tf.MatrixBandPart +// CHECK-LABEL: func @testValidMatrixBandPartOpCompatibleDynamicShapes +func @testValidMatrixBandPartOpCompatibleDynamicShapes(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor { + %0 = "tf.MatrixBandPart"(%arg0, %arg1, %arg2) : (tensor, tensor, tensor) -> tensor + return %0 : tensor } // ----- // Test invalid tf.MatrixBandPart -func @testInvalidMatrixBandPartOp(%arg0: tensor<64x64x64xbf16>, %arg1: tensor, %arg2: tensor) -> tensor<*xbf16> { - // expected-error @+1 {{op failed to verify 
that all of {input, band} have same type}} - %0 = "tf.MatrixBandPart"(%arg0, %arg1, %arg2) : (tensor<64x64x64xbf16>, tensor, tensor) -> tensor<*xbf16> - return %0 : tensor<*xbf16> +func @testInvalidMatrixBandPartOp(%arg0: tensor<64x64x64xbf16>, %arg1: tensor, %arg2: tensor) -> tensor<64x64xbf16> { + // expected-error @+1 {{op failed to verify that all of {input, band} have dynamically equal types}} + %0 = "tf.MatrixBandPart"(%arg0, %arg1, %arg2) : (tensor<64x64x64xbf16>, tensor, tensor) -> tensor<64x64xbf16> + return %0 : tensor<64x64xbf16> } // ----- @@ -998,6 +1007,116 @@ func @pcall_func_2(%arg0: tensor, %arg1: tensor) -> tensor { // ----- +//===--------------------------------------------------------------------===// +// tf.Select +//===--------------------------------------------------------------------===// + +// Test valid tf.Select +// CHECK-LABEL: func @testSelect +func @testSelect(%arg0: tensor<3xi1>, %arg1: tensor<3x2xf16>, %arg2: tensor<3x2xf16>) -> tensor<3x2xf16> { + %0 = "tf.Select"(%arg0, %arg1, %arg2) : (tensor<3xi1>, tensor<3x2xf16>, tensor<3x2xf16>) -> tensor<3x2xf16> + return %0: tensor<3x2xf16> +} + +// ----- + +func @testInvalidSelect(%arg0: tensor<3xi1>, %arg1: tensor<2x3xf16>, %arg2: tensor<2x3xf16>) -> tensor<2x3xf16> { + // expected-error @+1 {{requires that, when pred is a vector, the shape matches the first dimension of t and e}} + %0 = "tf.Select"(%arg0, %arg1, %arg2) : (tensor<3xi1>, tensor<2x3xf16>, tensor<2x3xf16>) -> tensor<2x3xf16> + return %0: tensor<2x3xf16> +} + +// ----- + +// Test invalid tf.Select - broadcasting then/else parameters is not supported +func @selectBroadcastThen(%arg0: tensor, %arg1: tensor<8x1xi32>, %arg2: tensor<2x8x8xi32>) -> tensor<2x8x8xi32> { + // expected-error @+1 {{requires t and e have compatible shapes}} + %0 = "tf.Select"(%arg0, %arg1, %arg2) : (tensor, tensor<8x1xi32>, tensor<2x8x8xi32>) -> tensor<2x8x8xi32> + return %0: tensor<2x8x8xi32> +} + +// ----- + +func @invalidSelect(%arg0: tensor<2xi1>, %arg1: tensor, %arg2: tensor) -> tensor<2xi32> { + // expected-error @+1 {{requires that t and e are nonscalar when pred is a vector}} + %0 = "tf.Select"(%arg0, %arg1, %arg2) : (tensor<2xi1>, tensor, tensor) -> tensor<2xi32> + return %0: tensor<2xi32> +} + +// ----- + +func @invalidSelect(%arg0: tensor<1x8xi1>, %arg1: tensor<1x8x8xi32>, %arg2: tensor<1x8x8xi32>) -> tensor<1x8x8xi32> { + // expected-error @+1 {{requires that pred is a scalar OR has the same rank as t and e OR is a vector}} + %0 = "tf.Select"(%arg0, %arg1, %arg2) : (tensor<1x8xi1>, tensor<1x8x8xi32>, tensor<1x8x8xi32>) -> tensor<1x8x8xi32> + return %0: tensor<1x8x8xi32> +} + +// ----- + +//===--------------------------------------------------------------------===// +// tf.SelectV2 +//===--------------------------------------------------------------------===// + +// Test valid tf.SelectV2 +// CHECK-LABEL: func @selectV2BroadcastThen +func @selectV2BroadcastThen(%arg0: tensor, %arg1: tensor<8x1xi32>, %arg2: tensor<2x8x8xi32>) -> tensor<2x8x8xi32> { + %0 = "tf.SelectV2"(%arg0, %arg1, %arg2) : (tensor, tensor<8x1xi32>, tensor<2x8x8xi32>) -> tensor<2x8x8xi32> + return %0: tensor<2x8x8xi32> +} + +// ----- + +// Test valid tf.SelectV2 +// CHECK-LABEL: func @selectV2BroadcastElse +func @selectV2BroadcastElse(%arg0: tensor, %arg1: tensor<2x8x8xi32>, %arg2: tensor<8x1xi32>) -> tensor<2x8x8xi32> { + %0 = "tf.SelectV2"(%arg0, %arg1, %arg2) : (tensor, tensor<2x8x8xi32>, tensor<8x1xi32>) -> tensor<2x8x8xi32> + return %0: tensor<2x8x8xi32> +} + +// ----- + +// Test valid
tf.SelectV2 +// CHECK-LABEL: func @selectV2BroadcastPred +func @selectV2BroadcastPred(%arg0: tensor<1xi1>, %arg1: tensor<2x8x8xi32>, %arg2: tensor<2x8x8xi32>) -> tensor<2x8x8xi32> { + %0 = "tf.SelectV2"(%arg0, %arg1, %arg2) : (tensor<1xi1>, tensor<2x8x8xi32>, tensor<2x8x8xi32>) -> tensor<2x8x8xi32> + return %0: tensor<2x8x8xi32> +} + +// ----- + +// CHECK-LABEL: func @selectV2BroadcastAll +func @selectV2BroadcastAll(%arg0: tensor<8x1x1xi1>, %arg1: tensor<1x8x1xi32>, %arg2: tensor<1x1x8xi32>) -> tensor<8x8x8xi32> { + %0 = "tf.SelectV2"(%arg0, %arg1, %arg2) : (tensor<8x1x1xi1>, tensor<1x8x1xi32>, tensor<1x1x8xi32>) -> tensor<8x8x8xi32> + return %0: tensor<8x8x8xi32> +} + +// ----- + +// CHECK-LABEL: func @selectV2DynamicRanked +func @selectV2DynamicRanked(%arg0: tensor<1xi1>, %arg1: tensor<2x?x8xi32>, %arg2: tensor<2x8x8xi32>) -> tensor<2x?x8xi32> { + %0 = "tf.SelectV2"(%arg0, %arg1, %arg2) : (tensor<1xi1>, tensor<2x?x8xi32>, tensor<2x8x8xi32>) -> tensor<2x?x8xi32> + return %0: tensor<2x?x8xi32> +} + +// ----- + +// CHECK-LABEL: func @selectV2Unranked +func @selectV2Unranked(%arg0: tensor<1xi1>, %arg1: tensor<2x8x8xi32>, %arg2: tensor<*xi32>) -> tensor<*xi32> { + %0 = "tf.SelectV2"(%arg0, %arg1, %arg2) : (tensor<1xi1>, tensor<2x8x8xi32>, tensor<*xi32>) -> tensor<*xi32> + return %0: tensor<*xi32> +} + +// ----- + +// Test invalid tf.SelectV2: this is an invalid broadcast for the predicate +func @testInvalidSelectV2(%arg0: tensor<3xi1>, %arg1: tensor<3x2xf16>, %arg2: tensor<3x2xf16>) -> tensor<3x2xf16> { + // expected-error @+1 {{operands don't have broadcast-compatible shapes}} + %0 = "tf.SelectV2"(%arg0, %arg1, %arg2) : (tensor<3xi1>, tensor<3x2xf16>, tensor<3x2xf16>) -> tensor<3x2xf16> + return %0: tensor<3x2xf16> +} + +// ----- + //===--------------------------------------------------------------------===// // tf.Softmax //===--------------------------------------------------------------------===// @@ -1297,11 +1416,11 @@ func @testShapeWrongResultElemType(%arg0: tensor<1x32x32x16xf32>) -> tensor<4xf3 // ----- -func @testShapeWrongResultDim(tensor<1x32x32x16xf32>) -> tensor<*xi32> { +func @testShapeWrongResultDim(tensor<1x32x32x16xf32>) -> tensor<3x2xi32> { ^bb0(%arg0: tensor<1x32x32x16xf32>): // expected-error @+1 {{requires 1D type for result}} - %0 = "tf.Shape"(%arg0) {T = "tfdtype$DT_FLOAT", output = "tfdtype$DT_INT32"} : (tensor<1x32x32x16xf32>) -> tensor<*xi32> - return %0 : tensor<*xi32> + %0 = "tf.Shape"(%arg0) {T = "tfdtype$DT_FLOAT", output = "tfdtype$DT_INT32"} : (tensor<1x32x32x16xf32>) -> tensor<3x2xi32> + return %0 : tensor<3x2xi32> } // ----- @@ -1317,7 +1436,7 @@ func @testShapeMismatchDim(tensor<1x32x32x16xf32>) -> tensor<2xi32> { func @testShapeWrongResultDimDynamic(tensor<*xf32>) -> tensor<2xi32> { ^bb0(%arg0: tensor<*xf32>): - // expected-error @+1 {{requires dynamic shape result for unranked operand}} + // expected-warning @+1 {{has static shape result for unranked operand}} %0 = "tf.Shape"(%arg0) {T = "tfdtype$DT_FLOAT", output = "tfdtype$DT_INT32"} : (tensor<*xf32>) -> tensor<2xi32> return %0 : tensor<2xi32> } @@ -1341,11 +1460,11 @@ func @testShapeNWrongResultElemType(%arg0: tensor<1x32x32x16xf32>) -> tensor<4xf // ----- -func @testShapeNWrongResultDim(tensor<1x32x32x16xf32>) -> tensor<*xi32> { +func @testShapeNWrongResultDim(tensor<1x32x32x16xf32>) -> tensor<2x2xi32> { ^bb0(%arg0: tensor<1x32x32x16xf32>): // expected-error @+1 {{requires 1D type for result #1}} - %0:2 = "tf.ShapeN"(%arg0, %arg0) : (tensor<1x32x32x16xf32>, tensor<1x32x32x16xf32>) -> (tensor<4xi32>, 
tensor<*xi32>) - return %0#1 : tensor<*xi32> + %0:2 = "tf.ShapeN"(%arg0, %arg0) : (tensor<1x32x32x16xf32>, tensor<1x32x32x16xf32>) -> (tensor<4xi32>, tensor<2x2xi32>) + return %0#1 : tensor<2x2xi32> } // ----- @@ -1361,7 +1480,7 @@ func @testShapeNMismatchDim(tensor<1x32x32x16xf32>) -> tensor<2xi32> { func @testShapeNWrongResultDimDynamic(tensor<*xf32>) -> tensor<2xi32> { ^bb0(%arg0: tensor<*xf32>): - // expected-error @+1 {{requires dynamic shape result #1 for unranked operand #1}} + // expected-warning @+1 {{has static shape result #1 for unranked operand #1}} %0:2 = "tf.ShapeN"(%arg0, %arg0) : (tensor<*xf32>, tensor<*xf32>) -> (tensor, tensor<2xi32>) return %0#1 : tensor<2xi32> } @@ -1402,10 +1521,10 @@ func @testVariableShapeWrongResultElemType(%arg0: tensor<*x!tf.resource>>) -> tensor<*xi32> { +func @testVariableShapeWrongResultDim(%arg0: tensor<*x!tf.resource>>) -> tensor<2x3xi32> { // expected-error @+1 {{requires 1D type for result}} - %0 = "tf.VariableShape"(%arg0) {output = "tfdtype$DT_INT32"} : (tensor<*x!tf.resource>>) -> tensor<*xi32> - return %0 : tensor<*xi32> + %0 = "tf.VariableShape"(%arg0) {output = "tfdtype$DT_INT32"} : (tensor<*x!tf.resource>>) -> tensor<2x3xi32> + return %0 : tensor<2x3xi32> } // ----- @@ -1419,7 +1538,7 @@ func @testVariableShapeMismatchDim(%arg0: tensor<*x!tf.resource>>) -> tensor<2xi32> { - // expected-error @+1 {{requires dynamic shape result for unranked operand}} + // expected-warning @+1 {{has static shape result for unranked operand}} %0 = "tf.VariableShape"(%arg0) {output = "tfdtype$DT_INT32"} : (tensor<*x!tf.resource>>) -> tensor<2xi32> return %0 : tensor<2xi32> } @@ -1768,7 +1887,7 @@ func @testOneHot(%indices: tensor<3xi32>, %depth: tensor, %on_value: tensor // ----- func @testOneHot(%indices: tensor<3xi32>, %on_value: tensor, %off_value: tensor) -> tensor<3x5xf32> { - %depth = "tf.Const"() { value = dense<-5> : tensor } : () -> tensor + %depth = "tf.Const"() { value = dense<-5> : tensor } : () -> tensor // expected-error @+1 {{depth must be non-negative}} %result = "tf.OneHot"(%indices, %depth, %on_value, %off_value) {axis = -1 : i64} : (tensor<3xi32>, tensor, tensor, tensor) -> tensor<3x5xf32> return %result : tensor<3x5xf32> @@ -2400,7 +2519,7 @@ func @tensor_scatter_update(%tensor: tensor<4xf32>, %indices: tensor<4x2xi32>, % // CHECK-LABEL: func @testParseExampleV2DenseOnlyValid func @testParseExampleV2DenseOnlyValid(%serialized: tensor<32x!tf.string>, %names : tensor<32x!tf.string>, %dense_keys : tensor<2x!tf.string>, %dense_default_0 : tensor, %dense_default_1 : tensor) -> (tensor<32xf32>) { %empty_str_vector = "tf.Const"() {dtype = !tf.string, value = opaque<"tf", "0x746674656E736F722464747970653A2044545F535452494E472074656E736F725F7368617065207B2064696D207B207D207D"> : tensor<0x!tf.string>} : () -> tensor<0x!tf.string> - %result:2 = "tf.ParseExampleV2"(%serialized, %names, %empty_str_vector, %dense_keys, %empty_str_vector, %dense_default_0, %dense_default_1) {dense_shapes = ["tfshape$", "tfshape$"], num_sparse = 0 : i64, result_segment_sizes = dense<[0, 0, 0, 2, 0, 0]> : vector<6xi32>} : (tensor<32x!tf.string>, tensor<32x!tf.string>, tensor<0x!tf.string>, tensor<2x!tf.string>, tensor<0x!tf.string>, tensor, tensor) -> (tensor<32xf32>, tensor<32xf32>) + %result:2 = "tf.ParseExampleV2"(%serialized, %names, %empty_str_vector, %dense_keys, %empty_str_vector, %dense_default_0, %dense_default_1) {dense_shapes = [#tf.shape<>, #tf.shape<>], num_sparse = 0 : i64, result_segment_sizes = dense<[0, 0, 0, 2, 0, 0]> : vector<6xi32>} : 
(tensor<32x!tf.string>, tensor<32x!tf.string>, tensor<0x!tf.string>, tensor<2x!tf.string>, tensor<0x!tf.string>, tensor, tensor) -> (tensor<32xf32>, tensor<32xf32>) return %result#0 : tensor<32xf32> } @@ -2409,7 +2528,7 @@ func @testParseExampleV2DenseOnlyValid(%serialized: tensor<32x!tf.string>, %name func @testParseExampleV2DenseMismatchedInputOutput(%serialized: tensor<32x!tf.string>, %names : tensor<32x!tf.string>, %dense_keys : tensor<2x!tf.string>, %dense_default_0 : tensor, %dense_default_1 : tensor) -> (tensor<32xf32>) { %empty_str_vector = "tf.Const"() {dtype = !tf.string, value = opaque<"tf", "0x746674656E736F722464747970653A2044545F535452494E472074656E736F725F7368617065207B2064696D207B207D207D"> : tensor<0x!tf.string>} : () -> tensor<0x!tf.string> // expected-error @+1 {{output 'dense_values' should have same length as attribute 'Tdense'}} - %result:3 = "tf.ParseExampleV2"(%serialized, %names, %empty_str_vector, %dense_keys, %empty_str_vector, %dense_default_0, %dense_default_1) {dense_shapes = ["tfshape$", "tfshape$"], num_sparse = 0 : i64, result_segment_sizes = dense<[0, 0, 0, 3, 0, 0]> : vector<6xi32>} : (tensor<32x!tf.string>, tensor<32x!tf.string>, tensor<0x!tf.string>, tensor<2x!tf.string>, tensor<0x!tf.string>, tensor, tensor) -> (tensor<32xf32>, tensor<32xf32>, tensor<32xi64>) + %result:3 = "tf.ParseExampleV2"(%serialized, %names, %empty_str_vector, %dense_keys, %empty_str_vector, %dense_default_0, %dense_default_1) {dense_shapes = [#tf.shape<>, #tf.shape<>], num_sparse = 0 : i64, result_segment_sizes = dense<[0, 0, 0, 3, 0, 0]> : vector<6xi32>} : (tensor<32x!tf.string>, tensor<32x!tf.string>, tensor<0x!tf.string>, tensor<2x!tf.string>, tensor<0x!tf.string>, tensor, tensor) -> (tensor<32xf32>, tensor<32xf32>, tensor<32xi64>) return %result#0 : tensor<32xf32> } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_data_fuse_map_and_batch.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf_data_fuse_map_and_batch.mlir new file mode 100644 index 00000000000..39f34caf259 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_data_fuse_map_and_batch.mlir @@ -0,0 +1,29 @@ +// RUN: tf-opt -tf-standard-pipeline -tf-data-optimization %s -o %t && FileCheck %s --dump-input-on-failure < %t + +module { +// CHECK-LABEL: fuse_map_and_batch +func @fuse_map_and_batch() -> tensor attributes {tf.entry_function = {control_outputs = "", inputs = "", outputs = "BatchDatasetV2"}} { + %0 = "tf.Const"() {value = dense<5> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<[0, 1, 2]> : tensor<3xi32>} : () -> tensor<3xi32> + // CHECK: %[[NPC:.*]] = "tf.Const"() {value = dense<1> : tensor} + // CHECK: %[[TSLICE:.*]] = "tf.TensorSliceDataset" + %3 = "tf.TensorSliceDataset"(%2) {device = "", output_shapes = [#tf.shape<>]} : (tensor<3xi32>) -> tensor<*x!tf.variant> + // CHECK: "tf.MapAndBatchDataset"(%[[TSLICE]], %[[BSIZE:.*]], %[[NPC]] + // CHECK-SAME: f = @"__inference_Dataset_map__80", + %4 = "tf.MapDataset"(%3) {device = "", + f = @"__inference_Dataset_map__80", + output_shapes = [#tf.shape<>], output_types = [i32], + preserve_cardinality = false, sloppy = false, + use_inter_op_parallelism = true} : (tensor<*x!tf.variant>) -> tensor + %5 = "tf.BatchDatasetV2"(%4, %0, %1) {device = "", output_shapes = [#tf.shape<>], output_types = [i32], parallel_copy = false} : (tensor, tensor, tensor) -> tensor + return %5 : tensor +} + +func @"__inference_Dataset_map__80"(%arg0: tensor<*xi32>) -> tensor<*xi32> { + %0 = 
"tf.Const"() {value = dense<2> : tensor} : () -> tensor + %1 = "tf.Mul"(%arg0, %0) {device = ""} : (tensor<*xi32>, tensor) -> tensor<*xi32> + %2 = "tf.Identity"(%1) {device = ""} : (tensor<*xi32>) -> tensor<*xi32> + return %2 : tensor<*xi32> +} +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_data_fuse_pmap_and_batch.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf_data_fuse_pmap_and_batch.mlir new file mode 100644 index 00000000000..70c5c220fe1 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_data_fuse_pmap_and_batch.mlir @@ -0,0 +1,29 @@ +// RUN: tf-opt -tf-standard-pipeline -tf-data-optimization %s -o %t && FileCheck %s --dump-input-on-failure < %t + +module { +// CHECK-LABEL: fuse_pmap_and_batch +func @fuse_pmap_and_batch() -> tensor attributes {tf.entry_function = {control_outputs = "", inputs = "", outputs = "BatchDatasetV2"}} { + %0 = "tf.Const"() {value = dense<5> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<[0, 1, 2]> : tensor<3xi32>} : () -> tensor<3xi32> + %3 = "tf.Const"() {value = dense<12> : tensor} : () -> tensor + // CHECK: %[[TSLICE:.*]] = "tf.TensorSliceDataset" + %4 = "tf.TensorSliceDataset"(%2) {device = "", output_shapes = [#tf.shape<>]} : (tensor<3xi32>) -> tensor<*x!tf.variant> + // CHECK: "tf.MapAndBatchDataset"(%[[TSLICE]], + // CHECK-SAME: f = @"__inference_Dataset_map__80", + %5 = "tf.ParallelMapDataset"(%4, %3) {device = "", + f = @"__inference_Dataset_map__80", + output_shapes = [#tf.shape<>], output_types = [i32], + preserve_cardinality = false, sloppy = false, + use_inter_op_parallelism = true} : (tensor<*x!tf.variant>, tensor) -> tensor + %6 = "tf.BatchDatasetV2"(%5, %0, %1) {device = "", output_shapes = [#tf.shape<>], output_types = [i32], parallel_copy = false} : (tensor, tensor, tensor) -> tensor + return %6 : tensor +} + +func @"__inference_Dataset_map__80"(%arg0: tensor<*xi32>) -> tensor<*xi32> { + %0 = "tf.Const"() {value = dense<2> : tensor} : () -> tensor + %1 = "tf.Mul"(%arg0, %0) {device = ""} : (tensor<*xi32>, tensor) -> tensor<*xi32> + %2 = "tf.Identity"(%1) {device = ""} : (tensor<*xi32>) -> tensor<*xi32> + return %2 : tensor<*xi32> +} +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_executor_ops.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf_executor_ops.mlir index 6282ab17f17..c048db5a5ee 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_executor_ops.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_executor_ops.mlir @@ -187,6 +187,26 @@ func @switch_with_unranked_pred(%arg0: tensor<*xf32>, %arg1: tensor<*xi1>) -> te return %result : tensor<*xf32> } +// CHECK-LABEL: func @switch_with_control_inputs( +func @switch_with_control_inputs(%arg0: tensor, %arg1: !tf_executor.control, %arg2: !tf_executor.control) -> tensor { + %result = tf_executor.graph { +// CHECK: tf_executor.Switch %{{[^%]*}}, %{{[^%]*}}, %{{[^%]*}}, %{{[^%]*}} : tensor + %1:3 = tf_executor.Switch %arg0, %arg0, %arg1, %arg2 : tensor + tf_executor.fetch %1#0 : tensor + } + return %result : tensor +} + +// CHECK-LABEL: func @switch_with_control_inputs_functional( +func @switch_with_control_inputs_functional(%arg0: tensor, %arg1: !tf_executor.control, %arg2: !tf_executor.control) -> tensor { + %result = tf_executor.graph { +// CHECK: tf_executor.Switch %{{[^%]*}}, %{{[^%]*}}, %{{[^%]*}}, %{{[^%]*}} : tensor + %1:3 = tf_executor.Switch %arg0, %arg0, %arg1, %arg2 : (tensor, tensor, !tf_executor.control, !tf_executor.control) -> (tensor, tensor, 
!tf_executor.control) + tf_executor.fetch %1#0 : tensor + } + return %result : tensor +} + // CHECK-LABEL: func @switchN( func @switchN(%arg0: tensor, %arg1: tensor<*xf32>) -> tensor<*xf32> { %fetches = tf_executor.graph { diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_executor_ops_invalid.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf_executor_ops_invalid.mlir index a249090a3cf..1fdc99d1ec8 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_executor_ops_invalid.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_executor_ops_invalid.mlir @@ -333,7 +333,7 @@ func @parent_is_graph(%arg0: tensor<*xf32>, %arg1: tensor) { // ----- -// Check that a switch always takes two arguments. +// Check that a switch always needs at least two arguments. func @invalid_switch(%arg0: tensor<*xf32>) { tf_executor.graph { %true, %false, %ctlSwitch = "tf_executor.Switch"(%arg0) : (tensor<*xf32>) -> (tensor<*xf32>, tensor<*xf32>, !tf_executor.control) @@ -344,6 +344,17 @@ func @invalid_switch(%arg0: tensor<*xf32>) { // ----- +// Check that a switch always needs at least two arguments. +func @invalid_switch(%arg0: tensor<*xf32>) { + tf_executor.graph { + %true, %false, %ctlSwitch = tf_executor.Switch %arg0 : tensor<*xf32> +// expected-error@-1 {{custom op 'tf_executor.Switch' expects a single data type and a predicate}} + } + return +} + +// ----- + // Check that a switch second argument must be a valid predicate (i1). func @invalid_switch(%arg0: tensor<*xf32>, %arg1: i1) -> tensor<*xf32> { %result = tf_executor.graph { diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/shapes_for_variables.py b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/shapes_for_variables.py deleted file mode 100644 index 37290434f10..00000000000 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/shapes_for_variables.py +++ /dev/null @@ -1,50 +0,0 @@ -# Copyright 2019 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -# RUN: %p/shapes_for_variables | FileCheck %s - -# pylint: disable=missing-docstring,line-too-long -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import tensorflow.compat.v2 as tf -from tensorflow.compiler.mlir.tensorflow.tests.tf_saved_model import common - - -class TestModule(tf.Module): - - # Check that we get shapes for variables used in the graph. - # In this case, what we are testing is that the return type of the function is - # correctly inferred, which requires understanding the shape of the variable - # (in particular, the ReadVariableOp that reads it and returns a tensor). - # - # We eventually want to move the shape inference to a pass separate from - # the initial import, in which case this test doesn't make much sense and - # will be superceded by MLIR->MLIR shape inference tests. 
- # - # CHECK: func {{@[a-zA-Z_0-9]+}}({{.*}}) -> (tensor {{.*}}) - # CHECK: tf_saved_model.exported_names = ["some_function"] - def __init__(self): - super(TestModule, self).__init__() - self.my_variable = tf.Variable(42.) - - @tf.function(input_signature=[]) - def some_function(self): - return self.my_variable - - -if __name__ == '__main__': - common.do_test(TestModule) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/structured_output.py b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/structured_output.py deleted file mode 100644 index b476df0cc25..00000000000 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/structured_output.py +++ /dev/null @@ -1,125 +0,0 @@ -# Copyright 2019 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -# RUN: %p/structured_output | FileCheck %s - -# pylint: disable=missing-docstring,line-too-long -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import tensorflow.compat.v2 as tf -from tensorflow.compiler.mlir.tensorflow.tests.tf_saved_model import common - - -class TestModule(tf.Module): - # The fNNNN name prefixes in this file are such that the sorted order of the - # functions in the resulting MLIR output match the order in the source file, - # allowing us to conveniently co-locate the CHECK's with the code they are - # checking. - # - # Note: CHECK-DAG doesn't work with CHECK-SAME/CHECK-NEXT. - - # Check index paths for results. - # - # CHECK: func {{@[a-zA-Z_0-9]+}}() -> ( - # CHECK-SAME: tensor<1xf32> {tf_saved_model.index_path = []}) - # CHECK-SAME: attributes {{.*}} tf_saved_model.exported_names = ["f0000_single_return"] - @tf.function(input_signature=[]) - def f0000_single_return(self): - return tf.constant(1.0, shape=[1]) - - # Check index paths for results with multiple return values. - # Note that semantically in Python, multiple return values are equivalent - # to returning a tuple/list. - # - # CHECK: func {{@[a-zA-Z_0-9]+}}() -> ( - # CHECK-SAME: tensor<1xf32> {tf_saved_model.index_path = [0]}, - # CHECK-SAME: tensor<2xf32> {tf_saved_model.index_path = [1]}) - # CHECK-SAME: attributes {{.*}} tf_saved_model.exported_names = ["f0001_multiple_results_no_punctuation"] - @tf.function(input_signature=[]) - def f0001_multiple_results_no_punctuation(self): - return tf.constant(1.0, shape=[1]), tf.constant(1.0, shape=[2]) - - # Check index paths for results written explicitly with parentheses. - # This is semantically equivalent to the earlier test without parentheses, - # but this test serves as documentation of this behavior for the purposes - # of tf_saved_model users. 
- # - # CHECK: func {{@[a-zA-Z_0-9]+}}() -> ( - # CHECK-SAME: tensor<1xf32> {tf_saved_model.index_path = [0]}, - # CHECK-SAME: tensor<2xf32> {tf_saved_model.index_path = [1]}) - # CHECK-SAME: attributes {{.*}} tf_saved_model.exported_names = ["f0002_multiple_results_parentheses"] - @tf.function(input_signature=[]) - def f0002_multiple_results_parentheses(self): - return (tf.constant(1.0, shape=[1]), tf.constant(1.0, shape=[2])) - - # Check index paths for results written explicitly with brackets. - # This is semantically equivalent to the earlier test without parentheses, - # but this test serves as documentation of this behavior for the purposes - # of tf_saved_model users. - # - # CHECK: func {{@[a-zA-Z_0-9]+}}() -> ( - # CHECK-SAME: tensor<1xf32> {tf_saved_model.index_path = [0]}, - # CHECK-SAME: tensor<2xf32> {tf_saved_model.index_path = [1]}) - # CHECK-SAME: attributes {{.*}} tf_saved_model.exported_names = ["f0003_multiple_results_brackets"] - @tf.function(input_signature=[]) - def f0003_multiple_results_brackets(self): - return [tf.constant(1.0, shape=[1]), tf.constant(1.0, shape=[2])] - - # Check index paths for lists. - # - # CHECK: func {{@[a-zA-Z_0-9]+}}() -> ( - # CHECK-SAME: tensor<1xf32> {tf_saved_model.index_path = [0, 0]}, - # CHECK-SAME: tensor<2xf32> {tf_saved_model.index_path = [0, 1]}) - # CHECK-SAME: attributes {{.*}} tf_saved_model.exported_names = ["f0004_list_2_elements"] - @tf.function(input_signature=[]) - def f0004_list_2_elements(self): - return [[tf.constant(1.0, shape=[1]), tf.constant(1.0, shape=[2])]] - - # Check index paths for dicts. - # Keys are linearized in sorted order, matching `tf.nest.flatten`. - # More thorough testing of this is in structured_input.py. The underlying code - # path for linearization is shared, so no need to replicate that testing here. - # - # CHECK: func {{@[a-zA-Z_0-9]+}}() -> ( - # CHECK-SAME: tensor<1xf32> {tf_saved_model.index_path = ["x"]}, - # CHECK-SAME: tensor<2xf32> {tf_saved_model.index_path = ["y"]}) - # CHECK-SAME: attributes {{.*}} tf_saved_model.exported_names = ["f0005_dict_2_keys"] - @tf.function(input_signature=[]) - def f0005_dict_2_keys(self): - return { - 'x': tf.constant(1.0, shape=[1]), - 'y': tf.constant(1.0, shape=[2]), - } - - # Check index paths for outputs are correctly handled in the presence of - # multiple return statements. 
- # - # CHECK: func {{@[a-zA-Z_0-9]+}}( - # CHECK-SAME: %arg0: tensor {tf_saved_model.index_path = [0]} - # CHECK-SAME: ) -> ( - # CHECK-SAME: tensor<1xf32> {tf_saved_model.index_path = ["x"]}) - # CHECK-SAME: attributes {{.*}} tf_saved_model.exported_names = ["f0006_multiple_return_statements"] - @tf.function(input_signature=[tf.TensorSpec([], tf.float32)]) - def f0006_multiple_return_statements(self, x): - if x > 3.: - return {'x': tf.constant(1.0, shape=[1])} - else: - return {'x': tf.constant(1.0, shape=[1])} - - -if __name__ == '__main__': - common.do_test(TestModule) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu-variable-runtime-reformatting.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu-variable-runtime-reformatting.mlir index d0ca8c09457..937178efaa2 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu-variable-runtime-reformatting.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu-variable-runtime-reformatting.mlir @@ -21,7 +21,7 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr "tfdtype$DT_RESOURCE", "tfdtype$DT_RESOURCE", "tfdtype$DT_RESOURCE"], body = @while_body_7560, cond = @while_cond_7550, device = "", is_stateless = false, - output_shapes = ["tfshape$", "tfshape$", "tfshape$", "tfshape$", "tfshape$"]} + output_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>]} : (tensor, tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, tensor<*x!tf.resource>>) -> (tensor, tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, @@ -38,7 +38,7 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr // CHECK-NEXT: device = "TPU_REPLICATED_CORE_0" return } - // CHECK: func @while_body_7560 + // CHECK-LABEL: func @while_body_7560 func @while_body_7560(%arg0: tensor, %arg1: tensor<*x!tf.resource>> {tf.device = "/device:TPU:0"}, %arg2: tensor<*x!tf.resource>> {tf.device = "/device:TPU:1"}, @@ -112,7 +112,7 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr // ----- -// Tests that the pass does not format variabls with other uses. +// Tests that the pass does not format variables with other uses. module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 268 : i32}} { // CHECK-LABEL: func @main @@ -135,7 +135,7 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr tensor<*x!tf.resource>>, tensor<*x!tf.resource>>) return } - // CHECK: func @while_body_7560 + // CHECK-LABEL: func @while_body_7560 // CHECK-NOT: TPUReshardVariables func @while_body_7560(%arg0: tensor, %arg1: tensor<*x!tf.resource>> {tf.device = "/device:TPU:0"}, @@ -198,3 +198,87 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr return %1 : tensor } } + +// ----- + +// Tests that the pass does not format variables when model parallelism is +// present. 
+ +module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 268 : i32}} { + // CHECK-LABEL: func @main + // CHECK-NOT: TPUReshardVariables + func @main(%arg0: tensor<*x!tf.resource>> {tf.device = "/device:TPU:0"}, + %arg1: tensor<*x!tf.resource>> {tf.device = "/device:TPU:1"}, + %arg2: tensor<*x!tf.resource>> {tf.device = "/device:TPU:0"}, + %arg3: tensor<*x!tf.resource>> {tf.device = "/device:TPU:1"}) { + + %0 = "tf.Const"() {value = dense<100> : tensor} : () -> tensor + %1:5 = "tf.While"(%0, %arg0, %arg1, %arg2, %arg3) + {T = ["tfdtype$DT_INT32", "tfdtype$DT_RESOURCE", + "tfdtype$DT_RESOURCE", "tfdtype$DT_RESOURCE", + "tfdtype$DT_RESOURCE"], body = @while_body_7560, + cond = @while_cond_7550, device = "", is_stateless = false, + output_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>]} + : (tensor, tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, + tensor<*x!tf.resource>>, tensor<*x!tf.resource>>) + -> (tensor, tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, + tensor<*x!tf.resource>>, tensor<*x!tf.resource>>) + return + } + // CHECK-LABEL: func @while_body_7560 + // CHECK-NOT: TPUReshardVariables + func @while_body_7560(%arg0: tensor, + %arg1: tensor<*x!tf.resource>> {tf.device = "/device:TPU:0"}, + %arg2: tensor<*x!tf.resource>> {tf.device = "/device:TPU:1"}, + %arg3: tensor<*x!tf.resource>> {tf.device = "/device:TPU:0"}, + %arg4: tensor<*x!tf.resource>> {tf.device = "/device:TPU:1"}) + -> (tensor, tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, + tensor<*x!tf.resource>>, tensor<*x!tf.resource>>) { + %0 = "tf.Const"() {value = dense<-1> : tensor} : () -> tensor + %1 = "tf.AddV2"(%arg0, %0) {T = i32, device = ""} : (tensor, tensor) -> tensor + %compile:2 = "tf_device.launch"() ( { + %2:2 = "tf._TPUCompileMlir"() { + NumDynamicShapes = 0 : i64, + // The metadata encodes 2 parameter and two return values. 
+ metadata = "\0A\0E\08\01\18\01\22\08\08\01\1A\01\01\22\01\00\0A \08\01\12\10\12\02\08\03\12\02\08\03\12\02\08\01\12\02\08 \18\01\22\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\18\02 \01", + mlir_module = "..."} : () -> (tensor, tensor) + tf_device.return %2#0, %2#1 : tensor, tensor + }) {device = "/device:CPU:0"} : () -> (tensor, tensor) + "tf_device.launch"() ( { + "tf.TPUCompileSucceededAssert"(%compile#0) : (tensor) -> () + tf_device.return + }) {device = "/device:CPU:0"} : () -> () + %rep:2 = tf_device.replicate([%arg1, %arg2] as %arg30: tensor<*x!tf.resource>>, + [%arg3, %arg4] as %arg31: tensor<*x!tf.resource>>) + {_mirrored_variable_indices = [0, 1], devices = {TPU_REPLICATED_CORE_0 = ["/device:TPU:0", "/device:TPU:1"]}, n = 2 : i32} { + %id = "tf.Identity"(%arg30) : (tensor<*x!tf.resource>>) -> tensor<*x!tf.resource>> + "tf_device.parallel_execute"() ({ + "tf_device.launch"() ( { + "tf.TPUExecuteAndUpdateVariables"(%id, %arg31, %compile#1) + {device_var_reads_indices = [0, 1], device_var_updates_indices = [0, 1]} + : (tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, tensor) -> () + tf_device.return + }) {device = "TPU_REPLICATED_CORE_0"} : () -> () + tf_device.return + }, { + tf_device.return + }) {} : () -> () + %ret = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + tf_device.return %ret : tensor + } + return %1, %arg1, %arg2, %arg3, %arg4 : tensor, tensor<*x!tf.resource>>, + tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, + tensor<*x!tf.resource>> + } + // CHECK-LABEL: func @while_cond_7550 + func @while_cond_7550(%arg0: tensor, + %arg1: tensor<*x!tf.resource>> {tf.device = "/device:TPU:0"}, + %arg2: tensor<*x!tf.resource>> {tf.device = "/device:TPU:1"}, + %arg3: tensor<*x!tf.resource>> {tf.device = "/device:TPU:0"}, + %arg4: tensor<*x!tf.resource>> {tf.device = "/device:TPU:1"}) + -> tensor { + %0 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + %1 = "tf.GreaterEqual"(%arg0, %0) {T = i32, device = ""} : (tensor, tensor) -> tensor + return %1 : tensor + } +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_cluster_formation.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_cluster_formation.mlir index fbbbf05f116..6dceb00eefa 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu_cluster_formation.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_cluster_formation.mlir @@ -2,7 +2,7 @@ // Test ops in cluster only have `_tpu_replicate` and `device` attributes -// removed when moved to a launch. +// removed when moved to a `tf_device.cluster`. // CHECK-LABEL: func @cluster_ops_removed_attrs func @cluster_ops_removed_attrs() { %0 = "tf.opA"() {_tpu_replicate = "replicate", device = "device", name = "name"} : () -> tensor @@ -18,9 +18,9 @@ func @cluster_ops_removed_attrs() { // Test TPUReplicateMetadata ops `name` and `num_replicas` attributes are not -// copied over to launch. -// CHECK-LABEL: func @launch_removed_metadata_attrs -func @launch_removed_metadata_attrs() { +// copied over to `tf_device.cluster`. +// CHECK-LABEL: func @removed_metadata_attrs +func @removed_metadata_attrs() { %0 = "tf.opA"() {_tpu_replicate = "replicate"} : () -> tensor "tf.TPUReplicateMetadata"() {_tpu_replicate = "replicate", device = "device", name = "name", num_replicas = 1, topology = "topology"} : () -> () return @@ -42,7 +42,7 @@ func @metadata_op_removed() { // Test ops in an island with the same `_tpu_replicate` attribute are merged -// under a launch. +// under a `tf_device.cluster`. 
// CHECK-LABEL: func @simple_island // CHECK-SAME: (%[[ARG_0:[a-z0-9]*]]: tensor) func @simple_island(%arg0 : tensor) -> tensor { @@ -60,19 +60,19 @@ func @simple_island(%arg0 : tensor) -> tensor { } // CHECK: "tf.opB" -// CHECK: %[[LAUNCH:[0-9]*]] = "tf_device.launch"() ( { +// CHECK: %[[CLUSTER:[0-9]*]] = "tf_device.cluster"() ( { // CHECK-NEXT: %[[OP_A:[0-9]*]] = "tf.opA"(%[[ARG_0]]) // CHECK-NEXT: %[[OP_C:[0-9]*]] = "tf.opC"(%[[OP_A]]) // CHECK-NEXT: tf_device.return %[[OP_C]] // CHECK-NEXT: _tpu_replicate = "replicate" // CHECK-SAME: device = "device" // CHECK-SAME: topology = "topology" -// CHECK: tf_executor.yield %[[LAUNCH]] +// CHECK: tf_executor.yield %[[CLUSTER]] // Test ops in an island with the same `_tpu_replicate` attribute are merged -// under a launch, even when the associated TPUReplicateMetadata op is in a -// different island. +// under a `tf_device.cluster`, even when the associated TPUReplicateMetadata op +// is in a different island. // CHECK-LABEL: func @simple_island_separate_metadata // CHECK-SAME: (%[[ARG_0:[a-z0-9]*]]: tensor) func @simple_island_separate_metadata(%arg0 : tensor) -> tensor { @@ -92,18 +92,18 @@ func @simple_island_separate_metadata(%arg0 : tensor) -> tensor { } // CHECK: "tf.opB" -// CHECK: %[[LAUNCH:[0-9]*]] = "tf_device.launch"() ( { +// CHECK: %[[CLUSTER:[0-9]*]] = "tf_device.cluster"() ( { // CHECK-NEXT: %[[OP_A:[0-9]*]] = "tf.opA"(%[[ARG_0]]) // CHECK-NEXT: %[[OP_C:[0-9]*]] = "tf.opC"(%[[OP_A]]) // CHECK-NEXT: tf_device.return %[[OP_C]] // CHECK-NEXT: _tpu_replicate = "replicate" // CHECK-SAME: device = "device" // CHECK-SAME: topology = "topology" -// CHECK: tf_executor.yield %[[LAUNCH]] +// CHECK: tf_executor.yield %[[CLUSTER]] // Test ops in multiple islands with the same `_tpu_replicate` attribute are -// merged under launch ops only within their respective island. +// merged under `tf_device.cluster` ops only within their respective island. // CHECK-LABEL: func @multiple_islands_separate_metadata // CHECK-SAME: (%[[ARG_0:[a-z0-9]*]]: tensor) func @multiple_islands_separate_metadata(%arg0 : tensor) -> (tensor, tensor) { @@ -130,28 +130,28 @@ func @multiple_islands_separate_metadata(%arg0 : tensor) -> (tensor, ten // CHECK: %[[ISLAND_1:.*]], %[[ISLAND_1_control:.*]] = tf_executor.island { // CHECK: "tf.opB" -// CHECK: %[[LAUNCH_0:[0-9]*]] = "tf_device.launch"() ( { +// CHECK: %[[CLUSTER_0:[0-9]*]] = "tf_device.cluster"() ( { // CHECK-NEXT: %[[OP_A:[0-9]*]] = "tf.opA"(%[[ARG_0]]) // CHECK-NEXT: %[[OP_C:[0-9]*]] = "tf.opC"(%[[OP_A]]) // CHECK-NEXT: tf_device.return %[[OP_C]] // CHECK-NEXT: _tpu_replicate = "replicate" // CHECK-SAME: device = "device" // CHECK-SAME: topology = "topology" -// CHECK: tf_executor.yield %[[LAUNCH_0]] +// CHECK: tf_executor.yield %[[CLUSTER_0]] // CHECK: tf_executor.island { // CHECK: "tf.opE" -// CHECK: %[[LAUNCH_1:[0-9]*]] = "tf_device.launch"() ( { +// CHECK: %[[CLUSTER_1:[0-9]*]] = "tf_device.cluster"() ( { // CHECK-NEXT: %[[OP_D:[0-9]*]] = "tf.opD"(%[[ISLAND_1]]) // CHECK-NEXT: %[[OP_F:[0-9]*]] = "tf.opF"(%[[ARG_0]]) // CHECK-NEXT: tf_device.return %[[OP_F]] // CHECK-NEXT: _tpu_replicate = "replicate" // CHECK-SAME: device = "device" // CHECK-SAME: topology = "topology" -// CHECK: tf_executor.yield %[[LAUNCH_1]] +// CHECK: tf_executor.yield %[[CLUSTER_1]] // Test ops in a function body with the same `_tpu_replicate` attribute are -// merged under a launch op. +// merged under a `tf_device.cluster` op. 
// CHECK-LABEL: func @ops_in_func_body // CHECK-SAME: (%[[ARG_0:[a-z0-9]*]]: tensor) func @ops_in_func_body(%arg0 : tensor) -> (tensor, tensor, tensor) { @@ -167,7 +167,7 @@ func @ops_in_func_body(%arg0 : tensor) -> (tensor, tensor, tensor) -> (tensor, tensor, tensor) func @nested_cluster_op_user(%arg0 : tensor) -> (tensor) { @@ -193,7 +193,7 @@ func @nested_cluster_op_user(%arg0 : tensor) -> (tensor) { return %2 : tensor } -// CHECK: %[[LAUNCH:[0-9]*]]:2 = "tf_device.launch"() ( { +// CHECK: %[[CLUSTER:[0-9]*]]:2 = "tf_device.cluster"() ( { // CHECK-NEXT: %[[OP_A:[0-9]*]] = "tf.opA"(%[[ARG_0]]) // CHECK-NEXT: %[[OP_B:[0-9]*]] = "tf.opB"(%[[OP_A]]) // CHECK-NEXT: tf_device.return %[[OP_A]], %[[OP_B]] @@ -201,8 +201,8 @@ func @nested_cluster_op_user(%arg0 : tensor) -> (tensor) { // CHECK-SAME: device = "device" // CHECK-SAME: topology = "topology" // CHECK: tf_executor.graph { -// CHECK-NEXT: tf_executor.fetch %[[LAUNCH]]#0 -// CHECK: return %[[LAUNCH]]#1 +// CHECK-NEXT: tf_executor.fetch %[[CLUSTER]]#0 +// CHECK: return %[[CLUSTER]]#1 // Test nested op of a cluster with an operand from an op of the same cluster @@ -218,7 +218,7 @@ func @nested_cluster_op(%arg0 : tensor) -> (tensor) { return %1 : tensor } -// CHECK: %[[LAUNCH:[0-9]*]] = "tf_device.launch"() ( { +// CHECK: %[[CLUSTER:[0-9]*]] = "tf_device.cluster"() ( { // CHECK-NEXT: %[[OP_A:[0-9]*]] = "tf.opA"(%[[ARG_0]]) // CHECK-NEXT: %[[OP_B:[0-9]*]] = "tf.opB"() ( { // CHECK-NEXT: "tf.opC"(%[[OP_A]]) @@ -226,7 +226,7 @@ func @nested_cluster_op(%arg0 : tensor) -> (tensor) { // CHECK-NEXT: _tpu_replicate = "replicate" // CHECK-SAME: device = "device" // CHECK-SAME: topology = "topology" -// CHECK: return %[[LAUNCH]] +// CHECK: return %[[CLUSTER]] // Test multiple clusters interleaved. @@ -242,21 +242,21 @@ func @interleaved_clusters(%arg0 : tensor) -> (tensor, tensor) { return %2, %3 : tensor, tensor } -// CHECK: %[[LAUNCH_0:[0-9]*]] = "tf_device.launch"() ( { +// CHECK: %[[CLUSTER_0:[0-9]*]] = "tf_device.cluster"() ( { // CHECK-NEXT: %[[OP_A:[0-9]*]] = "tf.opA"(%[[ARG_0]]) // CHECK-NEXT: %[[OP_C:[0-9]*]] = "tf.opC"(%[[OP_A]]) // CHECK-NEXT: tf_device.return %[[OP_C]] // CHECK-NEXT: _tpu_replicate = "replicate_0" // CHECK-SAME: device = "device_0" // CHECK-SAME: topology = "topology_0" -// CHECK: %[[LAUNCH_1:[0-9]*]] = "tf_device.launch"() ( { +// CHECK: %[[CLUSTER_1:[0-9]*]] = "tf_device.cluster"() ( { // CHECK-NEXT: %[[OP_B:[0-9]*]] = "tf.opB"(%[[ARG_0]]) // CHECK-NEXT: %[[OP_D:[0-9]*]] = "tf.opD"(%[[OP_B]]) // CHECK-NEXT: tf_device.return %[[OP_D]] // CHECK-NEXT: _tpu_replicate = "replicate_1" // CHECK-SAME: device = "device_1" // CHECK-SAME: topology = "topology_1" -// CHECK: return %[[LAUNCH_0]], %[[LAUNCH_1]] +// CHECK: return %[[CLUSTER_0]], %[[CLUSTER_1]] // Test operands and results of ops of a cluster that are interleaved between @@ -276,14 +276,14 @@ func @interleaved_cluster_operands_results() { // CHECK: %[[OP_C:[0-9]*]] = "tf.opC" // CHECK: %[[OP_E:[0-9]*]] = "tf.opE"(%[[OP_C]]) -// CHECK: %[[LAUNCH:[0-9]*]] = "tf_device.launch"() ( { +// CHECK: %[[CLUSTER:[0-9]*]] = "tf_device.cluster"() ( { // CHECK-NEXT: %[[OP_A:[0-9]*]] = "tf.opA" // CHECK-NEXT: "tf.opF"(%[[OP_E]]) // CHECK-NEXT: tf_device.return %[[OP_A]] // CHECK-NEXT: _tpu_replicate = "replicate" // CHECK-SAME: device = "device" // CHECK-SAME: topology = "topology" -// CHECK: %[[OP_B:[0-9]*]] = "tf.opB"(%[[LAUNCH]]) +// CHECK: %[[OP_B:[0-9]*]] = "tf.opB"(%[[CLUSTER]]) // CHECK: "tf.opD"(%[[OP_B]]) @@ -306,24 +306,24 @@ func @one_replica(%arg0: tensor) -> tensor { // 
CHECK: %[[OP_C:[0-9]*]] = "tf.opC" // CHECK: %[[OP_E:[0-9]*]] = "tf.opE"(%[[OP_C]]) -// CHECK: %[[LAUNCH:[0-9]*]]:2 = "tf_device.launch"() ( { +// CHECK: %[[CLUSTER:[0-9]*]]:2 = "tf_device.cluster"() ( { // CHECK-NEXT: %[[OP_A:[0-9]*]] = "tf.opA"(%[[ARG_0]]) // CHECK-NEXT: %[[OP_F:[0-9]*]] = "tf.opF"(%[[OP_E]]) // CHECK-NEXT: tf_device.return %[[OP_A]], %[[OP_F]] // CHECK-NEXT: _tpu_replicate = "replicate" // CHECK-SAME: device = "device" // CHECK-SAME: topology = "topology" -// CHECK: %[[OP_B:[0-9]*]] = "tf.opB"(%[[LAUNCH]]#0) +// CHECK: %[[OP_B:[0-9]*]] = "tf.opB"(%[[CLUSTER]]#0) // CHECK: "tf.opD"(%[[OP_B]]) -// CHECK: return %[[LAUNCH]]#1 +// CHECK: return %[[CLUSTER]]#1 // CHECK-NOT: "tf.TPUReplicatedInput" // CHECK-NOT: "tf.TPUReplicatedOutput" // Test replication with replicated operands and replicated results. The cluster -// will be wrapped in a launch first and then by a replicate. TPUReplicatedInput -// and TPUReplicatedOutput nodes will be replaced by the replicate operands and -// results. +// will be wrapped in a `tf_device.cluster` first and then by a replicate. +// TPUReplicatedInput and TPUReplicatedOutput nodes will be replaced by the +// replicate operands and results. // CHECK-LABEL: func @replication // CHECK-SAME: (%[[ARG_0:[a-z0-9]*]]: tensor, %[[ARG_1:[a-z0-9]*]]: tensor, %[[ARG_2:[a-z0-9]*]]: tensor) func @replication(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> (tensor, tensor) { @@ -347,18 +347,18 @@ func @replication(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> // CHECK-DAG: [%[[ARG_0]], %[[OP_A]]] as %[[RI_0:[a-z0-9]*]]: tensor // CHECK-DAG: [%[[OP_B]], %[[ARG_1]]] as %[[RI_1:[a-z0-9]*]]: tensor // CHECK-SAME: n = 2 : i32 -// CHECK-NEXT: %[[LAUNCH:[0-9]*]]:2 = "tf_device.launch"() ( { +// CHECK-NEXT: %[[CLUSTER:[0-9]*]]:2 = "tf_device.cluster"() ( { // CHECK: %[[OP_D:[0-9]*]] = "tf.opD"(%[[RI_0]], %[[RI_1]], %[[ARG_2]], %[[OP_C]]) // CHECK: %[[OP_E:[0-9]*]] = "tf.opE"(%[[OP_D]], %[[RI_0]], %[[RI_1]], %[[ARG_2]], %[[OP_C]]) // CHECK: tf_device.return %[[OP_D]], %[[OP_E]] // CHECK-NEXT: _tpu_replicate = "replicate" // CHECK-SAME: device = "device" // CHECK-SAME: topology = "topology" -// CHECK: tf_device.return %[[LAUNCH]]#0, %[[LAUNCH]]#1 +// CHECK: tf_device.return %[[CLUSTER]]#0, %[[CLUSTER]]#1 // CHECK: return %[[REPLICATE]]#0, %[[REPLICATE]]#3 -// Test `tf.TPUReplicatedInput` ops are sorted by their `index` attribute. +// Test TPUReplicatedInput ops are sorted by their `index` attribute. // Non-negative `index` should precede `index` of -1, and ordering of ops with // `index` of -1 does not matter. // CHECK-LABEL: func @sort_replicated_input @@ -452,7 +452,7 @@ func @mismatched_replicated_output() { // Test cluster that should be replicated where its outputs do not lead to a // TPUReplicatedOutput. func @missing_replicated_output() { - // expected-error@+1 {{requires output of tf_device.launch to lead to a 'tf.TPUReplicatedOutput' op}} + // expected-error@+1 {{requires output of tf_device.cluster to lead to a 'tf.TPUReplicatedOutput' op}} %0 = "tf.opA"() {_tpu_replicate = "replicate", device = "device", name = "name"} : () -> tensor %1 = "tf.opB"(%0) : (tensor) -> tensor "tf.TPUReplicateMetadata"() {_tpu_replicate = "replicate", device = "device", num_replicas = 2, topology = "topology"} : () -> () @@ -520,8 +520,10 @@ func @input_index_gaps(%arg0: tensor) { return } + // ----- + // Test that the `is_mirrored_variable` attribute is preserved in the // tf_device.replicate op. 
// CHECK-LABEL: func @mirrored_variables @@ -537,4 +539,3 @@ func @mirrored_variables(%arg0: tensor>>, %arg1: ten // CHECK: tf_device.replicate // CHECK-SAME: [%[[ARG_0]], %[[ARG_1]]] as %{{[a-z0-9]*}} // CHECK-SAME: _mirrored_variable_indices = [1] - diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_dynamic_padding_mapper.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_dynamic_padding_mapper.mlir index ad2ebc08c1d..8b610e45b4e 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu_dynamic_padding_mapper.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_dynamic_padding_mapper.mlir @@ -10,7 +10,7 @@ // CHECK-LABEL: func @single_arg_single_shape func @single_arg_single_shape(%arg0: tensor) { tf_device.replicate([%arg0, %arg0] as %ri_0: tensor, [%arg0, %arg0] as %ri_1: tensor) {n = 2 : i32} { - "tf_device.launch_func"(%ri_0, %ri_1) {device = "", func = @func0, padding_map = ["\10\02\18\01"]} : (tensor, tensor) -> () + "tf_device.cluster_func"(%ri_0, %ri_1) {func = @func0, padding_map = ["\10\02\18\01"]} : (tensor, tensor) -> () tf_device.return } return @@ -37,7 +37,7 @@ func @func0(%arg0: tensor, %arg1: tensor) { // CHECK-LABEL: func @single_arg_multiple_shapes func @single_arg_multiple_shapes(%arg0: tensor) { tf_device.replicate([%arg0, %arg0] as %ri_0: tensor, [%arg0, %arg0] as %ri_1: tensor, [%arg0, %arg0] as %ri_2: tensor) {n = 2 : i32} { - "tf_device.launch_func"(%ri_0, %ri_1, %ri_2) {device = "", func = @func1, padding_map = ["\10\02\18\01", "\10\03\18\02"]} : (tensor, tensor, tensor) -> () + "tf_device.cluster_func"(%ri_0, %ri_1, %ri_2) {func = @func1, padding_map = ["\10\02\18\01", "\10\03\18\02"]} : (tensor, tensor, tensor) -> () tf_device.return } return @@ -69,7 +69,7 @@ func @func1(%arg0: tensor, %arg1: tensor, %arg2: tensor) { // CHECK-LABEL: func @multiple_args func @multiple_args(%arg0: tensor) { tf_device.replicate([%arg0, %arg0] as %ri_0: tensor, [%arg0, %arg0] as %ri_1: tensor, [%arg0, %arg0] as %ri_2: tensor, [%arg0, %arg0] as %ri_3: tensor, [%arg0, %arg0] as %ri_4: tensor) {n = 2 : i32} { - "tf_device.launch_func"(%ri_0, %ri_1, %ri_2, %ri_3, %ri_4) {device = "", func = @func2, padding_map = ["\10\02\18\01", "\10\03\18\02", "\08\04\10\01\18\03"]} : (tensor, tensor, tensor, tensor, tensor) -> () + "tf_device.cluster_func"(%ri_0, %ri_1, %ri_2, %ri_3, %ri_4) {func = @func2, padding_map = ["\10\02\18\01", "\10\03\18\02", "\08\04\10\01\18\03"]} : (tensor, tensor, tensor, tensor, tensor) -> () tf_device.return } return @@ -90,7 +90,7 @@ func @func2(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tens // CHECK-LABEL: func @remap_indices func @remap_indices(%arg0: tensor) { tf_device.replicate([%arg0, %arg0] as %ri_0: tensor, [%arg0, %arg0] as %ri_1: tensor) {n = 2 : i32} { - "tf_device.launch_func"(%ri_1, %arg0, %ri_0) {device = "", func = @func3, padding_map = ["\10\02\18\01"]} : (tensor, tensor, tensor) -> () + "tf_device.cluster_func"(%ri_1, %arg0, %ri_0) {func = @func3, padding_map = ["\10\02\18\01"]} : (tensor, tensor, tensor) -> () tf_device.return } return @@ -111,7 +111,7 @@ func @func3(%arg0: tensor, %arg1: tensor, %arg2: tensor) { // padding_arg_index: 1 // CHECK-LABEL: func @no_replicate func @no_replicate(%arg0: tensor) { - "tf_device.launch_func"(%arg0, %arg0, %arg0) {device = "", func = @func4, padding_map = ["\10\02\18\01"]} : (tensor, tensor, tensor) -> () + "tf_device.cluster_func"(%arg0, %arg0, %arg0) {func = @func4, padding_map = ["\10\02\18\01"]} : (tensor, tensor, tensor) -> () return } @@ -125,7 +125,7 @@ func @func4(%arg0: 
tensor, %arg1: tensor, %arg2: tensor) { // CHECK-LABEL: func @no_padding_map func @no_padding_map(%arg0: tensor) { tf_device.replicate([%arg0, %arg0] as %ri_0: tensor, [%arg0, %arg0] as %ri_1: tensor) {n = 2 : i32} { - "tf_device.launch_func"(%ri_1, %arg0, %ri_0) {device = "", func = @func5} : (tensor, tensor, tensor) -> () + "tf_device.cluster_func"(%ri_1, %arg0, %ri_0) {func = @func5} : (tensor, tensor, tensor) -> () tf_device.return } return @@ -141,7 +141,7 @@ func @func5(%arg0: tensor, %arg1: tensor, %arg2: tensor) { // CHECK-LABEL: func @empty_padding_map func @empty_padding_map(%arg0: tensor) { tf_device.replicate([%arg0, %arg0] as %ri_0: tensor, [%arg0, %arg0] as %ri_1: tensor) {n = 2 : i32} { - "tf_device.launch_func"(%ri_1, %arg0, %ri_0) {device = "", func = @func6, padding_map = []} : (tensor, tensor, tensor) -> () + "tf_device.cluster_func"(%ri_1, %arg0, %ri_0) {func = @func6, padding_map = []} : (tensor, tensor, tensor) -> () tf_device.return } return @@ -162,7 +162,7 @@ func @func6(%arg0: tensor, %arg1: tensor, %arg2: tensor) { // CHECK-LABEL: func @unused_padding_map func @unused_padding_map(%arg0: tensor) { tf_device.replicate([%arg0, %arg0] as %ri_0: tensor, [%arg0, %arg0] as %ri_1: tensor) {n = 2 : i32} { - "tf_device.launch_func"(%ri_1) {device = "", func = @func7, padding_map = ["\10\02\18\01"]} : (tensor) -> () + "tf_device.cluster_func"(%ri_1) {func = @func7, padding_map = ["\10\02\18\01"]} : (tensor) -> () tf_device.return } return @@ -189,7 +189,7 @@ func @func7(%arg0: tensor) { func @missing_padding_arg(%arg0: tensor) { tf_device.replicate([%arg0, %arg0] as %ri_0: tensor, [%arg0, %arg0] as %ri_1: tensor, [%arg0, %arg0] as %ri_2: tensor, [%arg0, %arg0] as %ri_3: tensor) {n = 2 : i32} { // expected-warning@+1 {{bad 'padding_map' attribute at index 0, unused padding_arg_index 1}} - "tf_device.launch_func"(%ri_0, %ri_2, %ri_3) {device = "", func = @func8, padding_map = ["\10\02\18\01", "\08\02\10\02\18\03"]} : (tensor, tensor, tensor) -> () + "tf_device.cluster_func"(%ri_0, %ri_2, %ri_3) {func = @func8, padding_map = ["\10\02\18\01", "\08\02\10\02\18\03"]} : (tensor, tensor, tensor) -> () tf_device.return } return @@ -206,8 +206,8 @@ func @func8(%arg0: tensor, %arg1: tensor, %arg2: tensor) { // Test bad padding map attribute (not an array). func @bad_padding_map() { tf_device.replicate {n = 2 : i32} { - // expected-error@+1 {{'tf_device.launch_func' op requires 'padding_map' array attribute}} - "tf_device.launch_func"() {device = "", func = @_func, padding_map = 0 : i32} : () -> () + // expected-error@+1 {{'tf_device.cluster_func' op requires 'padding_map' array attribute}} + "tf_device.cluster_func"() {func = @_func, padding_map = 0 : i32} : () -> () tf_device.return } return @@ -222,8 +222,8 @@ func @_func() { // Test bad padding map attribute (element in array is not a string). func @bad_padding_map_element() { tf_device.replicate {n = 2 : i32} { - // expected-error@+1 {{'tf_device.launch_func' op bad 'padding_map' attribute at index 0, not a string}} - "tf_device.launch_func"() {device = "", func = @_func, padding_map = [0 : i32]} : () -> () + // expected-error@+1 {{'tf_device.cluster_func' op bad 'padding_map' attribute at index 0, not a string}} + "tf_device.cluster_func"() {func = @_func, padding_map = [0 : i32]} : () -> () tf_device.return } return @@ -238,8 +238,8 @@ func @_func() { // Test unparsable padding map. 
func @bad_padding_map_proto() { tf_device.replicate {n = 2 : i32} { - // expected-error@+1 {{'tf_device.launch_func' op bad 'padding_map' attribute at index 0, failed to parse 'z' as tensorflow::tpu::PaddingMap}} - "tf_device.launch_func"() {device = "", func = @_func, padding_map = ["z"]} : () -> () + // expected-error@+1 {{'tf_device.cluster_func' op bad 'padding_map' attribute at index 0, failed to parse 'z' as tensorflow::tpu::PaddingMap}} + "tf_device.cluster_func"() {func = @_func, padding_map = ["z"]} : () -> () tf_device.return } return @@ -259,8 +259,8 @@ func @_func() { // padding_arg_index: 1 func @negative_arg_index(%arg0: tensor) { tf_device.replicate([%arg0, %arg0] as %ri_0: tensor, [%arg0, %arg0] as %ri_1: tensor) {n = 2 : i32} { - // expected-error@+1 {{'tf_device.launch_func' op bad 'padding_map' attribute at index 0, arg_index must be in [0, 2), got -1}} - "tf_device.launch_func"(%ri_0, %ri_1) {device = "", func = @_func, padding_map = ["\08\FF\FF\FF\FF\FF\FF\FF\FF\FF\01\10\02\18\01"]} : (tensor, tensor) -> () + // expected-error@+1 {{'tf_device.cluster_func' op bad 'padding_map' attribute at index 0, arg_index must be in [0, 2), got -1}} + "tf_device.cluster_func"(%ri_0, %ri_1) {func = @_func, padding_map = ["\08\FF\FF\FF\FF\FF\FF\FF\FF\FF\01\10\02\18\01"]} : (tensor, tensor) -> () tf_device.return } return @@ -280,8 +280,8 @@ func @_func(%arg0: tensor, %arg1: tensor) { // padding_arg_index: 1 func @bad_arg_index(%arg0: tensor) { tf_device.replicate([%arg0, %arg0] as %ri_0: tensor, [%arg0, %arg0] as %ri_1: tensor) {n = 2 : i32} { - // expected-error@+1 {{'tf_device.launch_func' op bad 'padding_map' attribute at index 0, arg_index must be in [0, 2), got 2}} - "tf_device.launch_func"(%ri_0, %ri_1) {device = "", func = @_func, padding_map = ["\08\02\10\02\18\01"]} : (tensor, tensor) -> () + // expected-error@+1 {{'tf_device.cluster_func' op bad 'padding_map' attribute at index 0, arg_index must be in [0, 2), got 2}} + "tf_device.cluster_func"(%ri_0, %ri_1) {func = @_func, padding_map = ["\08\02\10\02\18\01"]} : (tensor, tensor) -> () tf_device.return } return @@ -301,8 +301,8 @@ func @_func(%arg0: tensor, %arg1: tensor) { // padding_arg_index: -1 func @negative_padding_arg_index(%arg0: tensor) { tf_device.replicate([%arg0, %arg0] as %ri_0: tensor, [%arg0, %arg0] as %ri_1: tensor) {n = 2 : i32} { - // expected-error@+1 {{'tf_device.launch_func' op bad 'padding_map' attribute at index 0, padding_arg_index must be in [0, 2), got -1}} - "tf_device.launch_func"(%ri_0, %ri_1) {device = "", func = @_func, padding_map = ["\08\01\10\02\18\FF\FF\FF\FF\FF\FF\FF\FF\FF\01"]} : (tensor, tensor) -> () + // expected-error@+1 {{'tf_device.cluster_func' op bad 'padding_map' attribute at index 0, padding_arg_index must be in [0, 2), got -1}} + "tf_device.cluster_func"(%ri_0, %ri_1) {func = @_func, padding_map = ["\08\01\10\02\18\FF\FF\FF\FF\FF\FF\FF\FF\FF\01"]} : (tensor, tensor) -> () tf_device.return } return @@ -322,8 +322,8 @@ func @_func(%arg0: tensor, %arg1: tensor) { // padding_arg_index: 2 func @bad_padding_arg_index(%arg0: tensor) { tf_device.replicate([%arg0, %arg0] as %ri_0: tensor, [%arg0, %arg0] as %ri_1: tensor) {n = 2 : i32} { - // expected-error@+1 {{'tf_device.launch_func' op bad 'padding_map' attribute at index 0, padding_arg_index must be in [0, 2), got 2}} - "tf_device.launch_func"(%ri_0, %ri_1) {device = "", func = @_func, padding_map = ["\08\01\10\02\18\02"]} : (tensor, tensor) -> () + // expected-error@+1 {{'tf_device.cluster_func' op bad 'padding_map' attribute at 
index 0, padding_arg_index must be in [0, 2), got 2}} + "tf_device.cluster_func"(%ri_0, %ri_1) {func = @_func, padding_map = ["\08\01\10\02\18\02"]} : (tensor, tensor) -> () tf_device.return } return diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_head_tail_outside_compilation.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_head_tail_outside_compilation.mlir new file mode 100644 index 00000000000..90fa8cff5dc --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_head_tail_outside_compilation.mlir @@ -0,0 +1,136 @@ +// RUN: tf-opt %s -split-input-file -verify-diagnostics -tf-tpu-extract-head-tail-outside-compilation | FileCheck %s --dump-input-on-failure + +// Tests extraction of a outside compiled ops at head of TPU computation. + +module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { + // CHECK-LABEL: func @single_head_outside_compilation + func @single_head_outside_compilation(%arg0 : tensor) -> () { + // CHECK: tf_device.launch + // + // CHECK: "tf.A" + // CHECK-NEXT: tf_device.return + // + // CHECK: device + // CHECK-SAME: "/job:worker/replica:0/task:0/device:CPU:0" + // + // CHECK: "tf_device.cluster" + // CHECK: "tf.C" + // CHECK-NEXT: tf_device.return + "tf_device.cluster"() ( { + "tf.A"(%arg0) {_xla_outside_compilation = "cluster1"} : (tensor) -> () + "tf.B"() : () -> () + "tf.C"() : () -> () + tf_device.return + }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> () + return + } +} + +// ----- + +module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { + // CHECK-LABEL: func @multiple_head_outside_compilation + func @multiple_head_outside_compilation(%arg0 : tensor) -> () { + // CHECK: %[[LAUNCH_OUT:.*]] = "tf_device.launch"() + // CHECK: %[[A_OUT:.*]] = "tf.A" + // CHECK: %[[B_OUT:.*]] = "tf.B"(%[[A_OUT]]) + // CHECK: "tf.C" + // CHECK-NEXT: tf_device.return %[[B_OUT]] + // CHECK: device + // CHECK-SAME: "/job:worker/replica:0/task:0/device:CPU:0" + // + // CHECK: "tf_device.cluster" + // CHECK: "tf.D"(%[[LAUNCH_OUT]]) + // CHECK-NEXT: tf_device.return + "tf_device.cluster"() ( { + %0 = "tf.A"(%arg0) {_xla_outside_compilation = "cluster1"} : (tensor) -> (tensor) + %1 = "tf.B"(%0) {_xla_outside_compilation = "cluster1"} : (tensor) -> (tensor) + "tf.C"(%1, %arg0) {_xla_outside_compilation = "cluster1"} : (tensor, tensor) -> () + "tf.D"(%1) : (tensor) -> () + tf_device.return + }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> () + return + } +} + +// ----- + +module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { + // CHECK-LABEL: func @test_do_not_outside_compiled_ops_in_middle + func @test_do_not_outside_compiled_ops_in_middle(%arg0 : tensor) -> () { + // CHECK-NOT: tf_device.launch + // CHECK: "tf_device.cluster" + // CHECK-NEXT: "tf.A" + // CHECK-NEXT: "tf.B" + // CHECK-NEXT: "tf.C" + // CHECK-NEXT: tf_device.return + "tf_device.cluster"() ( { + %0 = "tf.A"(%arg0) {} : (tensor) -> (tensor) + %1 = "tf.B"(%0) 
{_xla_outside_compilation = "cluster1"}: (tensor) -> (tensor) + "tf.C"(%1) : (tensor) -> () + tf_device.return + }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> () + return + } +} + +// ----- + +module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { + // CHECK-LABEL: func @test_ops_with_tpu_operands_not_extracted + func @test_ops_with_tpu_operands_not_extracted(%arg0 : tensor) -> () { + // CHECK: %[[LAUNCH_OUT:.*]] = "tf_device.launch"() + // CHECK: %[[A_OUT:.*]] = "tf.A" + // CHECK: %[[D_OUT:.*]] = "tf.D"(%[[A_OUT]]) + // CHECK-NEXT: tf_device.return %[[D_OUT]] + // CHECK: device + // CHECK-SAME: "/job:worker/replica:0/task:0/device:CPU:0" + // + // CHECK: "tf_device.cluster" + // CHECK: "tf.B" + // CHECK: "tf.C" + // CHECK: "tf.E" + // CHECK-NEXT: tf_device.return + "tf_device.cluster"() ( { + %0 = "tf.A"(%arg0) {_xla_outside_compilation = "cluster1"} : (tensor) -> (tensor) + %1 = "tf.B"() {} : () -> (tensor) + %2 = "tf.C"(%arg0, %1) {_xla_outside_compilation = "cluster1"} : (tensor, tensor) -> (tensor) + %3 = "tf.D"(%0) {_xla_outside_compilation = "cluster1"}: (tensor) -> (tensor) + %4 = "tf.E"(%3) {} : (tensor) -> (tensor) + tf_device.return + }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> () + return + } +} + +// ----- + +module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { + // CHECK-LABEL: func @test_replicated_head_outside_compilation + func @test_replicated_head_outside_compilation(%arg0 : tensor) -> () { + // CHECK: %[[LAUNCH_OUT:.*]] = "tf_device.launch"() + // CHECK: %[[A_OUT:.*]] = "tf.A" + // CHECK: %[[D_OUT:.*]] = "tf.D"(%[[A_OUT]]) + // CHECK-NEXT: tf_device.return %[[D_OUT]] + // CHECK: device + // CHECK-SAME: "TPU_REPLICATED_HOST" + // + // CHECK: "tf_device.cluster" + // CHECK: "tf.B" + // CHECK: "tf.C" + // CHECK: "tf.E" + // CHECK-NEXT: tf_device.return + tf_device.replicate() {n = 2 : i32} { + "tf_device.cluster"() ( { + %0 = "tf.A"(%arg0) {_xla_outside_compilation = "cluster1"} : (tensor) -> (tensor) + %1 = "tf.B"() {} : () -> (tensor) + %2 = "tf.C"(%arg0, %1) {_xla_outside_compilation = "cluster1"} : (tensor, tensor) -> (tensor) + %3 = "tf.D"(%0) {_xla_outside_compilation = "cluster1"}: (tensor) -> (tensor) + %4 = "tf.E"(%3) {} : (tensor) -> (tensor) + tf_device.return + }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> () + tf_device.return + } + return + } +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_outside_compilation.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_outside_compilation.mlir new file mode 100644 index 00000000000..3cb693ee571 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_outside_compilation.mlir @@ -0,0 +1,144 @@ +// RUN: tf-opt %s -split-input-file -verify-diagnostics -tf-tpu-extract-outside-compilation | FileCheck %s --dump-input-on-failure + +// Tests that missing `_xla_outside_compilation` attribute value results in an error. 
+
+func @missing_outside_compilation_attribute() -> () {
+  "tf_device.cluster"() ( {
+    "tf.A"() : () -> ()
+    // expected-error@+1 {{attribute '_xla_outside_compilation' is empty}}
+    "tf.B"() {_xla_outside_compilation = ""} : () -> ()
+    tf_device.return
+  }) {cluster_attr = "cluster_attr"} : () -> ()
+  return
+}
+
+// -----
+
+// Tests that a TPU cluster with no outside compilation does not generate a parallel_execute.
+
+// CHECK-LABEL: func @no_outside_compilation
+func @no_outside_compilation() -> tensor {
+  %0 = "tf_device.cluster"() ( {
+    %1 = "tf.A"() : () -> tensor
+    %2 = "tf.B"(%1) : (tensor) -> tensor
+    tf_device.return %2 : tensor
+  }) {cluster_attr = "cluster_attr"} : () -> tensor
+  return %0 : tensor
+}
+
+// CHECK-NOT: "tf_device.parallel_execute"
+
+// Tests extraction of a single outside compiled cluster with no input or output dependencies.
+
+// CHECK-LABEL: func @nodep_single_outside_compilation
+func @nodep_single_outside_compilation() -> () {
+  // CHECK: "tf_device.parallel_execute"
+  // CHECK-NEXT: "tf_device.launch"
+  // CHECK-NEXT: "tf.B"
+  // CHECK-NOT: _xla_outside_compilation
+  // CHECK: "tf_device.cluster"
+  // CHECK-NEXT: "tf.A"
+  // CHECK: cluster_attr = "cluster_attr"
+  "tf_device.cluster"() ( {
+    "tf.A"() : () -> ()
+    "tf.B"() {_xla_outside_compilation = "cluster1"} : () -> ()
+    "tf.C"() : () -> ()
+    tf_device.return
+  }) {cluster_attr = "cluster_attr"} : () -> ()
+  return
+}
+
+// Tests extraction of a single outside compiled cluster with multiple ops and no input or output dependencies.
+
+// CHECK-LABEL: func @nodep_single_cluster_multiple_ops_outside_compilation
+func @nodep_single_cluster_multiple_ops_outside_compilation() -> () {
+  // CHECK: "tf_device.parallel_execute"
+  // CHECK-NEXT: "tf_device.launch"
+  // CHECK-NEXT: "tf.B"
+  // CHECK-NEXT: "tf.C"
+  // CHECK-NEXT: "tf.D"
+  // CHECK-NOT: _xla_outside_compilation
+  // CHECK: "tf_device.cluster"
+  // CHECK-NEXT: "tf.A"
+  // CHECK-NEXT: "tf.E"
+  // CHECK: cluster_attr = "cluster_attr"
+  "tf_device.cluster"() ( {
+    "tf.A"() : () -> ()
+    "tf.B"() {_xla_outside_compilation = "cluster1"} : () -> ()
+    "tf.C"() {_xla_outside_compilation = "cluster1"} : () -> ()
+    "tf.D"() {_xla_outside_compilation = "cluster1"} : () -> ()
+    "tf.E"() : () -> ()
+    tf_device.return
+  }) {cluster_attr = "cluster_attr"} : () -> ()
+  return
+}
+
+// Tests extraction of multiple outside compiled clusters with no input or output dependencies.
+
+// CHECK-LABEL: func @nodep_multiple_outside_compilation
+func @nodep_multiple_outside_compilation() -> () {
+  // CHECK: "tf_device.parallel_execute"
+  // CHECK-COUNT-2: "tf_device.launch"
+  // CHECK: "tf_device.cluster"
+  "tf_device.cluster"() ( {
+    "tf.A"() : () -> ()
+    "tf.B"() {_xla_outside_compilation = "cluster1"} : () -> ()
+    "tf.C"() : () -> ()
+    "tf.D"() {_xla_outside_compilation = "cluster2"} : () -> ()
+    "tf.E"() : () -> ()
+    tf_device.return
+  }) {cluster_attr = "cluster_attr"} : () -> ()
+  return
+}
+
+// Tests extraction of a single outside compiled cluster with a single TPU cluster return.
+
+// CHECK-LABEL: func @single_tpu_return_single_outside_compilation
+func @single_tpu_return_single_outside_compilation(%arg0: tensor) -> tensor {
+  %0 = "tf.A"(%arg0) : (tensor) -> tensor
+  // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate
+  // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]] = "tf_device.parallel_execute"
+  // CHECK-NEXT: "tf_device.launch"
+  // CHECK: %[[TPU_CLUSTER_OUTPUT:[0-9]*]] = "tf_device.cluster"
+  // CHECK: tf_device.return
+  // CHECK: tf_device.return %[[TPU_CLUSTER_OUTPUT]]
+  // CHECK: tf_device.return %[[PARALLEL_EXECUTE_OUTPUT]]
+  %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} {
+    %2 = "tf_device.cluster"() ( {
+      "tf.A"() : () -> ()
+      "tf.B"() {_xla_outside_compilation = "cluster1"} : () -> ()
+      %3 = "tf.C"() : () -> tensor
+      tf_device.return %3 : tensor
+    }) {cluster_attr = "cluster_attr"} : () -> tensor
+    tf_device.return %2 : tensor
+  }
+
+  return %1 : tensor
+}
+
+// Tests extraction of a single outside compiled cluster with multiple TPU cluster returns.
+
+// CHECK-LABEL: func @multiple_tpu_return_single_outside_compilation
+func @multiple_tpu_return_single_outside_compilation(%arg0: tensor) -> tensor {
+  %0 = "tf.A"(%arg0) : (tensor) -> tensor
+  // CHECK: %[[REPLICATE:[0-9]*]]:4 = tf_device.replicate
+  // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]]:2 = "tf_device.parallel_execute"
+  // CHECK-NEXT: "tf_device.launch"
+  // CHECK: %[[TPU_CLUSTER_OUTPUT:[0-9]*]]:2 = "tf_device.cluster"
+  // CHECK: tf_device.return
+  // CHECK: tf_device.return %[[TPU_CLUSTER_OUTPUT]]
+  // CHECK: tf_device.return %[[PARALLEL_EXECUTE_OUTPUT]]
+  %1:4 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} {
+    %2, %3 = "tf_device.cluster"() ( {
+      %4 = "tf.A"() : () -> tensor
+      "tf.B"() {_xla_outside_compilation = "cluster1"} : () -> ()
+      %5 = "tf.C"() : () -> tensor
+      tf_device.return %4, %5 : tensor, tensor
+    }) {cluster_attr = "cluster_attr"} : () -> (tensor, tensor)
+    tf_device.return %2, %3 : tensor, tensor
+  }
+
+  return %1 : tensor
+}
+
+// TODO(b/154363171): Add test cases for when output of outside compilation is returned by parallel_execute.
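Note on the `padding_map` attributes exercised in the tpu_dynamic_padding_mapper.mlir tests above and the tpu_rewrite.mlir tests below: the escaped strings (for example "\10\02\18\01") are serialized tensorflow::tpu::PaddingMap protos, as the expected-error messages state. The following is a minimal decoding sketch, assuming standard protobuf varint wire encoding; the field names are inferred from those error messages (arg_index and padding_arg_index appear verbatim, while treating field 2 as shape_index is an assumption), and this is illustrative only, not TensorFlow's parser.

# Minimal sketch: decode the escaped `padding_map` strings used in these tests.
# Assumptions: plain protobuf varint wire format; field 2 = shape_index is a guess.
def decode_padding_map(raw: bytes) -> dict:
    names = {1: "arg_index", 2: "shape_index", 3: "padding_arg_index"}
    out, i = {}, 0
    while i < len(raw):
        field, wire_type = raw[i] >> 3, raw[i] & 0x7
        i += 1
        assert wire_type == 0, "PaddingMap only uses varint fields"
        value = shift = 0
        while True:
            byte = raw[i]
            i += 1
            value |= (byte & 0x7F) << shift
            shift += 7
            if not byte & 0x80:
                break
        if value >= 1 << 63:  # int32 negatives arrive as 64-bit varints
            value -= 1 << 64
        out[names.get(field, field)] = value
    return out

print(decode_padding_map(b"\x10\x02\x18\x01"))
# {'shape_index': 2, 'padding_arg_index': 1}
print(decode_padding_map(b"\x08\xff\xff\xff\xff\xff\xff\xff\xff\xff\x01\x10\x02\x18\x01"))
# {'arg_index': -1, 'shape_index': 2, 'padding_arg_index': 1}  (the negative_arg_index case)

This makes it easy to cross-check, for example, that the bad_padding_arg_index test's "\08\01\10\02\18\02" really does carry padding_arg_index = 2, matching the emitted diagnostic.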
diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_rewrite.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_rewrite.mlir index 06d6c35e0a8..332b46f427f 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu_rewrite.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_rewrite.mlir @@ -5,7 +5,7 @@ // expected-error@+1 {{requires attribute 'tf.versions'}} module attributes {tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { func @missing_tf_versions() { - "tf_device.launch_func"() {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = [], output_sharding_configuration = []} : () -> () + "tf_device.cluster_func"() {_tpu_replicate = "cluster0", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = [], output_sharding_configuration = []} : () -> () return } func @empty_func() { @@ -20,7 +20,7 @@ module attributes {tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { func @bad_devices() { // expected-error@+1 {{error in fetching TPU compilation/execution devices: no TPU_SYSTEM devices found}} - "tf_device.launch_func"() {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = [], output_sharding_configuration = []} : () -> () + "tf_device.cluster_func"() {_tpu_replicate = "cluster0", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = [], output_sharding_configuration = []} : () -> () return } func @empty_func() { @@ -30,13 +30,13 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // ----- -// Tests `tf_device.launch_func` with missing `num_cores_per_replicas` +// Tests `tf_device.cluster_func` with missing `num_cores_per_replicas` // attribute. module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { func @missing_num_cores_per_replica() { // expected-error@+1 {{requires attribute 'num_cores_per_replica'}} - "tf_device.launch_func"() {_tpu_replicate = "cluster0", device = "", func = @empty_func, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = [], output_sharding_configuration = []} : () -> () + "tf_device.cluster_func"() {_tpu_replicate = "cluster0", func = @empty_func, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = [], output_sharding_configuration = []} : () -> () return } func @empty_func() { @@ -46,12 +46,12 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // ----- -// Tests `tf_device.launch_func` with bad `num_cores_per_replicas` attribute. 
+// Tests `tf_device.cluster_func` with bad `num_cores_per_replicas` attribute. module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { func @bad_num_cores_per_replica() { // expected-error@+1 {{requires attribute 'num_cores_per_replica'}} - "tf_device.launch_func"() {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = "", step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = [], output_sharding_configuration = []} : () -> () + "tf_device.cluster_func"() {_tpu_replicate = "cluster0", func = @empty_func, num_cores_per_replica = "", step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = [], output_sharding_configuration = []} : () -> () return } func @empty_func() { @@ -61,12 +61,12 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // ----- -// Tests `tf_device.launch_func` with missing `step_marker_location` attribute. +// Tests `tf_device.cluster_func` with missing `step_marker_location` attribute. module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { func @bad_num_cores_per_replica() { // expected-error@+1 {{requires attribute 'step_marker_location'}} - "tf_device.launch_func"() {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = 1, padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = [], output_sharding_configuration = []} : () -> () + "tf_device.cluster_func"() {_tpu_replicate = "cluster0", func = @empty_func, num_cores_per_replica = 1, padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = [], output_sharding_configuration = []} : () -> () return } func @empty_func() { @@ -76,12 +76,12 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // ----- -// Tests `tf_device.launch_func` with bad `step_marker_location` attribute. +// Tests `tf_device.cluster_func` with bad `step_marker_location` attribute. module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { func @bad_step_marker_location() { // expected-error@+1 {{requires attribute 'step_marker_location'}} - "tf_device.launch_func"() {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = 1, step_marker_location = 1, padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = [], output_sharding_configuration = []} : () -> () + "tf_device.cluster_func"() {_tpu_replicate = "cluster0", func = @empty_func, num_cores_per_replica = 1, step_marker_location = 1, padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = [], output_sharding_configuration = []} : () -> () return } func @empty_func() { @@ -91,12 +91,12 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // ----- -// Tests `tf_device.launch_func` with unparsable `step_marker_location` attribute. 
+// Tests `tf_device.cluster_func` with unparsable `step_marker_location` attribute. module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { func @unparsable_step_marker_location() { // expected-error@+1 {{bad 'step_marker_location' attribute with value 'test'}} - "tf_device.launch_func"() {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "test", padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = [], output_sharding_configuration = []} : () -> () + "tf_device.cluster_func"() {_tpu_replicate = "cluster0", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "test", padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = [], output_sharding_configuration = []} : () -> () return } func @empty_func() { @@ -106,12 +106,12 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // ----- -// Tests `tf_device.launch_func` with missing `padding_map` attribute. +// Tests `tf_device.cluster_func` with missing `padding_map` attribute. module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { func @missing_padding_map() { // expected-error@+1 {{requires attribute 'padding_map'}} - "tf_device.launch_func"() {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", topology = "", device_assignment = [], input_sharding_configuration = [], output_sharding_configuration = []} : () -> () + "tf_device.cluster_func"() {_tpu_replicate = "cluster0", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", topology = "", device_assignment = [], input_sharding_configuration = [], output_sharding_configuration = []} : () -> () return } func @empty_func() { @@ -121,12 +121,12 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // ----- -// Tests `tf_device.launch_func` with bad `padding_map` attribute. +// Tests `tf_device.cluster_func` with bad `padding_map` attribute. 
module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { func @bad_padding_map() { // expected-error@+1 {{requires attribute 'padding_map'}} - "tf_device.launch_func"() {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = "", topology = "", device_assignment = [], input_sharding_configuration = [], output_sharding_configuration = []} : () -> () + "tf_device.cluster_func"() {_tpu_replicate = "cluster0", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = "", topology = "", device_assignment = [], input_sharding_configuration = [], output_sharding_configuration = []} : () -> () return } func @empty_func() { @@ -136,12 +136,12 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // ----- -// Tests `tf_device.launch_func` with bad element in `padding_map` attribute. +// Tests `tf_device.cluster_func` with bad element in `padding_map` attribute. module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { func @bad_element_padding_map() { // expected-error@+1 {{bad 'padding_map' attribute at index 0, not a string}} - "tf_device.launch_func"() {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = [1], topology = "", device_assignment = [], input_sharding_configuration = [], output_sharding_configuration = []} : () -> () + "tf_device.cluster_func"() {_tpu_replicate = "cluster0", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = [1], topology = "", device_assignment = [], input_sharding_configuration = [], output_sharding_configuration = []} : () -> () return } func @empty_func() { @@ -151,12 +151,12 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // ----- -// Tests `tf_device.launch_func` with unparsable element in `padding_map` attribute. +// Tests `tf_device.cluster_func` with unparsable element in `padding_map` attribute. 
module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { func @unparsable_element_padding_map() { // expected-error@+1 {{bad 'padding_map' attribute at index 0 with value 'test': failed to parse to tpu::PaddingMap}} - "tf_device.launch_func"() {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["test"], topology = "", device_assignment = [], input_sharding_configuration = [], output_sharding_configuration = []} : () -> () + "tf_device.cluster_func"() {_tpu_replicate = "cluster0", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["test"], topology = "", device_assignment = [], input_sharding_configuration = [], output_sharding_configuration = []} : () -> () return } func @empty_func() { @@ -166,12 +166,12 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // ----- -// Tests `tf_device.launch_func` with missing `topology` attribute. +// Tests `tf_device.cluster_func` with missing `topology` attribute. module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { func @missing_topology() { // expected-error@+1 {{requires attribute 'topology'}} - "tf_device.launch_func"() {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = [], device_assignment = [], input_sharding_configuration = [], output_sharding_configuration = []} : () -> () + "tf_device.cluster_func"() {_tpu_replicate = "cluster0", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = [], device_assignment = [], input_sharding_configuration = [], output_sharding_configuration = []} : () -> () return } func @empty_func() { @@ -181,12 +181,12 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // ----- -// Tests `tf_device.launch_func` with bad `topology` attribute. +// Tests `tf_device.cluster_func` with bad `topology` attribute. 
module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { func @bad_topology() { // expected-error@+1 {{requires attribute 'topology'}} - "tf_device.launch_func"() {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = [], topology = 1 : i32, device_assignment = [], input_sharding_configuration = [], output_sharding_configuration = []} : () -> () + "tf_device.cluster_func"() {_tpu_replicate = "cluster0", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = [], topology = 1 : i32, device_assignment = [], input_sharding_configuration = [], output_sharding_configuration = []} : () -> () return } func @empty_func() { @@ -196,12 +196,12 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // ----- -// Tests `tf_device.launch_func` with `topology` attribute resulting in device assignment error. +// Tests `tf_device.cluster_func` with `topology` attribute resulting in device assignment error. module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { func @invalid_topology() { // expected-error@+1 {{error in fetching TPU compilation/execution devices}} - "tf_device.launch_func"() {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = [], topology = "test", device_assignment = [], input_sharding_configuration = [], output_sharding_configuration = []} : () -> () + "tf_device.cluster_func"() {_tpu_replicate = "cluster0", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = [], topology = "test", device_assignment = [], input_sharding_configuration = [], output_sharding_configuration = []} : () -> () return } func @empty_func() { @@ -211,12 +211,12 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // ----- -// Tests `tf_device.launch_func` with missing `device_assignment` attribute. +// Tests `tf_device.cluster_func` with missing `device_assignment` attribute. 
module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { func @missing_device_assignment() { // expected-error@+1 {{requires attribute 'device_assignment'}} - "tf_device.launch_func"() {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = [], topology = "", input_sharding_configuration = [], output_sharding_configuration = []} : () -> () + "tf_device.cluster_func"() {_tpu_replicate = "cluster0", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = [], topology = "", input_sharding_configuration = [], output_sharding_configuration = []} : () -> () return } func @empty_func() { @@ -226,12 +226,12 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // ----- -// Tests `tf_device.launch_func` with bad `device_assignment` attribute. +// Tests `tf_device.cluster_func` with bad `device_assignment` attribute. module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { func @bad_device_assignment() { // expected-error@+1 {{requires attribute 'device_assignment'}} - "tf_device.launch_func"() {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = [], topology = "", device_assignment = "", input_sharding_configuration = [], output_sharding_configuration = []} : () -> () + "tf_device.cluster_func"() {_tpu_replicate = "cluster0", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = [], topology = "", device_assignment = "", input_sharding_configuration = [], output_sharding_configuration = []} : () -> () return } func @empty_func() { @@ -241,12 +241,12 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // ----- -// Tests `tf_device.launch_func` with bad element in `device_assignment` attribute. +// Tests `tf_device.cluster_func` with bad element in `device_assignment` attribute. 
module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { func @bad_element_device_assignment() { // expected-error@+1 {{bad 'device_assignment' attribute at index 0, not an int}} - "tf_device.launch_func"() {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = [], topology = "", device_assignment = [""], input_sharding_configuration = [], output_sharding_configuration = []} : () -> () + "tf_device.cluster_func"() {_tpu_replicate = "cluster0", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = [], topology = "", device_assignment = [""], input_sharding_configuration = [], output_sharding_configuration = []} : () -> () return } func @empty_func() { @@ -277,12 +277,12 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // ----- -// Tests `tf_device.launch_func` with `device_assignment` attribute resulting in device assignment error. +// Tests `tf_device.cluster_func` with `device_assignment` attribute resulting in device assignment error. module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { func @invalid_device_assignment() { // expected-error@+1 {{error in fetching TPU compilation/execution devices}} - "tf_device.launch_func"() {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = [], topology = "\0A\03\01\01\02\10\01\18\02\22\06\00\00\00\00\00\01", device_assignment = [], input_sharding_configuration = [], output_sharding_configuration = []} : () -> () + "tf_device.cluster_func"() {_tpu_replicate = "cluster0", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = [], topology = "\0A\03\01\01\02\10\01\18\02\22\06\00\00\00\00\00\01", device_assignment = [], input_sharding_configuration = [], output_sharding_configuration = []} : () -> () return } func @empty_func() { @@ -292,12 +292,12 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // ----- -// Tests `tf_device.launch_func` with missing `input_sharding_configuration` attribute. +// Tests `tf_device.cluster_func` with missing `input_sharding_configuration` attribute. 
module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { func @missing_input_sharding_configuration(%arg0: tensor) { // expected-error@+1 {{requires attribute 'input_sharding_configuration'}} - %0 = "tf_device.launch_func"(%arg0) {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_ENTRY", padding_map = [], topology = "", device_assignment = [], output_sharding_configuration = []} : (tensor) -> tensor + %0 = "tf_device.cluster_func"(%arg0) {_tpu_replicate = "cluster0", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_ENTRY", padding_map = [], topology = "", device_assignment = [], output_sharding_configuration = []} : (tensor) -> tensor return } func @empty_func(%arg0: tensor) -> tensor { @@ -317,12 +317,12 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // ----- -// Tests `tf_device.launch_func` with bad `input_sharding_configuration` attribute. +// Tests `tf_device.cluster_func` with bad `input_sharding_configuration` attribute. module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { func @bad_input_sharding_configuration(%arg0: tensor) { // expected-error@+1 {{requires attribute 'input_sharding_configuration'}} - %0 = "tf_device.launch_func"(%arg0) {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = "", output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor) -> tensor + %0 = "tf_device.cluster_func"(%arg0) {_tpu_replicate = "cluster0", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = "", output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor) -> tensor return } func @empty_func(%arg0: tensor) -> tensor { @@ -332,12 +332,12 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // ----- -// Tests `tf_device.launch_func` with mismatched `input_sharding_configuration` attribute size. +// Tests `tf_device.cluster_func` with mismatched `input_sharding_configuration` attribute size. 
module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { func @mismatched_size_input_sharding_configuration(%arg0: tensor) { // expected-error@+1 {{bad 'input_sharding_configuration' attribute, expected array attribute of size 1, got size 0}} - %0 = "tf_device.launch_func"(%arg0) {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = [], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor) -> tensor + %0 = "tf_device.cluster_func"(%arg0) {_tpu_replicate = "cluster0", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = [], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor) -> tensor return } func @empty_func(%arg0: tensor) -> tensor { @@ -347,12 +347,12 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // ----- -// Tests `tf_device.launch_func` with unsupported operand type. +// Tests `tf_device.cluster_func` with unsupported operand type. module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { func @unsupported_operand_type(%arg0: tensor) { // expected-error@+1 {{failed to determine operand type at index 0: Converting i2 to DataType}} - %0 = "tf_device.launch_func"(%arg0) {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_ENTRY", padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor) -> tensor + %0 = "tf_device.cluster_func"(%arg0) {_tpu_replicate = "cluster0", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_ENTRY", padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor) -> tensor return } func @empty_func(%arg0: tensor) -> tensor { @@ -362,12 +362,12 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // ----- -// Tests `tf_device.launch_func` with bad element in `input_sharding_configuration` attribute. +// Tests `tf_device.cluster_func` with bad element in `input_sharding_configuration` attribute. 
module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { func @bad_element_input_sharding_configuration(%arg0: tensor) { // expected-error@+1 {{bad 'input_sharding_configuration' attribute at index 0, not a string}} - %0 = "tf_device.launch_func"(%arg0) {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = [1], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor) -> tensor + %0 = "tf_device.cluster_func"(%arg0) {_tpu_replicate = "cluster0", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = [1], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor) -> tensor return } func @empty_func(%arg0: tensor) -> tensor { @@ -377,12 +377,12 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // ----- -// Tests `tf_device.launch_func` with unparsable element in `input_sharding_configuration` attribute. +// Tests `tf_device.cluster_func` with unparsable element in `input_sharding_configuration` attribute. module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { func @unparsable_element_input_sharding_configuration(%arg0: tensor) { // expected-error@+1 {{bad 'input_sharding_configuration' attribute at index 0 with value 'test': failed to parse to xla::OpSharding}} - %0 = "tf_device.launch_func"(%arg0) {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = ["test"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor) -> tensor + %0 = "tf_device.cluster_func"(%arg0) {_tpu_replicate = "cluster0", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = ["test"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor) -> tensor return } func @empty_func(%arg0: tensor) -> tensor { @@ -392,12 +392,12 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // ----- -// Tests `tf_device.launch_func` with missing `output_sharding_configuration` attribute. +// Tests `tf_device.cluster_func` with missing `output_sharding_configuration` attribute. 
module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { func @missing_output_sharding_configuration(%arg0: tensor) { // expected-error@+1 {{requires attribute 'output_sharding_configuration'}} - %0 = "tf_device.launch_func"(%arg0) {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_ENTRY", padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor) -> tensor + %0 = "tf_device.cluster_func"(%arg0) {_tpu_replicate = "cluster0", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_ENTRY", padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor) -> tensor return } func @empty_func(%arg0: tensor) -> tensor { @@ -407,12 +407,12 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // ----- -// Tests `tf_device.launch_func` with bad `output_sharding_configuration` attribute. +// Tests `tf_device.cluster_func` with bad `output_sharding_configuration` attribute. module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { func @bad_output_sharding_configuration(%arg0: tensor) { // expected-error@+1 {{requires attribute 'output_sharding_configuration'}} - %0 = "tf_device.launch_func"(%arg0) {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ""} : (tensor) -> tensor + %0 = "tf_device.cluster_func"(%arg0) {_tpu_replicate = "cluster0", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ""} : (tensor) -> tensor return } func @empty_func(%arg0: tensor) -> tensor { @@ -422,12 +422,12 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // ----- -// Tests `tf_device.launch_func` with mismatched `output_sharding_configuration` attribute size. +// Tests `tf_device.cluster_func` with mismatched `output_sharding_configuration` attribute size. 
module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { func @mismatched_size_output_sharding_configuration(%arg0: tensor) { // expected-error@+1 {{bad 'output_sharding_configuration' attribute, expected array attribute of size 1, got size 0}} - %0 = "tf_device.launch_func"(%arg0) {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = []} : (tensor) -> tensor + %0 = "tf_device.cluster_func"(%arg0) {_tpu_replicate = "cluster0", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = []} : (tensor) -> tensor return } func @empty_func(%arg0: tensor) -> tensor { @@ -438,12 +438,12 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // ----- -// Tests `tf_device.launch_func` with bad element in `output_sharding_configuration` attribute. +// Tests `tf_device.cluster_func` with bad element in `output_sharding_configuration` attribute. module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { func @bad_element_output_sharding_configuration(%arg0: tensor) { // expected-error@+1 {{bad 'output_sharding_configuration' attribute at index 0, not a string}} - %0 = "tf_device.launch_func"(%arg0) {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = [1]} : (tensor) -> tensor + %0 = "tf_device.cluster_func"(%arg0) {_tpu_replicate = "cluster0", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = [1]} : (tensor) -> tensor return } func @empty_func(%arg0: tensor) -> tensor { @@ -453,12 +453,12 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // ----- -// Tests `tf_device.launch_func` with unparsable element in `output_sharding_configuration` attribute. +// Tests `tf_device.cluster_func` with unparsable element in `output_sharding_configuration` attribute. 
module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { func @unparsable_element_output_sharding_configuration(%arg0: tensor) { // expected-error@+1 {{bad 'output_sharding_configuration' attribute at index 0 with value 'test': failed to parse to xla::OpSharding}} - %0 = "tf_device.launch_func"(%arg0) {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["test"]} : (tensor) -> tensor + %0 = "tf_device.cluster_func"(%arg0) {_tpu_replicate = "cluster0", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["test"]} : (tensor) -> tensor return } func @empty_func(%arg0: tensor) -> tensor { @@ -468,7 +468,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // ----- -// Tests `tf_device.launch_func` with empty `step_marker_location` attribute +// Tests `tf_device.cluster_func` with empty `step_marker_location` attribute // defaults to `STEP_MARK_AT_ENTRY`. // // The expected TPUCompileMetadataProto is: @@ -478,7 +478,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { // CHECK-LABEL: func @default_step_marker_location func @default_step_marker_location() { - "tf_device.launch_func"() {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = [], output_sharding_configuration = []} : () -> () + "tf_device.cluster_func"() {_tpu_replicate = "cluster0", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = [], output_sharding_configuration = []} : () -> () // CHECK: metadata // CHECK-SAME: num_replicas: 1 // CHECK-SAME: num_cores_per_replica: 1 @@ -497,7 +497,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { // CHECK-LABEL: func @unranked_shape_arg func @unranked_shape_arg(%arg0: tensor<*xi32>) -> tensor<*xi32> { - %0 = "tf_device.launch_func"(%arg0) {_tpu_replicate = "cluster0", device = "", func = @_func, num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor<*xi32>) -> tensor<*xi32> + %0 = "tf_device.cluster_func"(%arg0) {_tpu_replicate = "cluster0", func = @_func, num_cores_per_replica = 1, step_marker_location = "", padding_map = [], 
topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor<*xi32>) -> tensor<*xi32> // CHECK: metadata // CHECK-SAME: shape {\0A unknown_rank: true @@ -515,7 +515,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { // CHECK-LABEL: func @partial_shape_arg func @partial_shape_arg(%arg0: tensor) -> tensor { - %0 = "tf_device.launch_func"(%arg0) {_tpu_replicate = "cluster0", device = "", func = @_func, num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor) -> tensor + %0 = "tf_device.cluster_func"(%arg0) {_tpu_replicate = "cluster0", func = @_func, num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor) -> tensor // CHECK: metadata // CHECK-SAME: args // CHECK-SAME: shape {\0A dim {\0A size: -1\0A }\0A dim {\0A size: -1\0A }\0A dim {\0A size: 3\0A }\0A } @@ -546,7 +546,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { // CHECK-LABEL: func @static_shape_arg func @static_shape_arg(%arg0: tensor<1x2x3xi32>) -> tensor<1x2x3xi32> { - %0 = "tf_device.launch_func"(%arg0) {_tpu_replicate = "cluster0", device = "", func = @_func, num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor<1x2x3xi32>) -> tensor<1x2x3xi32> + %0 = "tf_device.cluster_func"(%arg0) {_tpu_replicate = "cluster0", func = @_func, num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor<1x2x3xi32>) -> tensor<1x2x3xi32> // CHECK: metadata // CHECK-SAME: args // CHECK-SAME: shape @@ -571,7 +571,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { // CHECK-LABEL: func @resource_arg func @resource_arg(%arg0: tensor<*x!tf.resource>) -> tensor<*x!tf.resource> { - %0 = "tf_device.launch_func"(%arg0) {_tpu_replicate = "cluster0", device = "", func = @_func, num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor<*x!tf.resource>) -> tensor<*x!tf.resource> + %0 = 
"tf_device.cluster_func"(%arg0) {_tpu_replicate = "cluster0", func = @_func, num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor<*x!tf.resource>) -> tensor<*x!tf.resource> // CHECK: metadata // CHECK: dtype: DT_RESOURCE // CHECK-SAME: kind: VARIABLE @@ -590,7 +590,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { // CHECK-LABEL: func @parameter_arg func @parameter_arg(%arg0: tensor<*xf32>) -> tensor<*xf32> { - %0 = "tf_device.launch_func"(%arg0) {_tpu_replicate = "cluster0", device = "", func = @_func, num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor<*xf32>) -> tensor<*xf32> + %0 = "tf_device.cluster_func"(%arg0) {_tpu_replicate = "cluster0", func = @_func, num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor<*xf32>) -> tensor<*xf32> // CHECK: metadata // CHECK: dtype: DT_FLOAT // CHECK-SAME: kind: PARAMETER @@ -614,7 +614,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // ----- -// Tests metadata is populated correctly based on launch_func op and attributes. +// Tests metadata is populated correctly based on cluster_func op and attributes. 
// // The expected TPUCompileMetadataProto is: // args { @@ -650,7 +650,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { // CHECK-LABEL: func @metadata func @metadata(%arg0: tensor<8xi32>) -> tensor<8xi32> { - %0 = "tf_device.launch_func"(%arg0) {_tpu_replicate = "cluster0", device = "", func = @tpu0_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor<8xi32>) -> tensor<8xi32> + %0 = "tf_device.cluster_func"(%arg0) {_tpu_replicate = "cluster0", func = @tpu0_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor<8xi32>) -> tensor<8xi32> // CHECK: metadata // CHECK-SAME: args // CHECK-SAME: dtype: DT_INT32 @@ -694,7 +694,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // CHECK-NOT: "tf.Shape"(%[[ARG_3]]) // CHECK: %[[ARG_0_SHAPE:[0-9]*]] = "tf.Shape"(%[[ARG_0]]) // CHECK: %[[ARG_2_SHAPE:[0-9]*]] = "tf.Shape"(%[[ARG_2]]) - %0 = "tf_device.launch_func"(%arg0, %arg1, %arg2, %arg3) {_tpu_replicate = "cluster0", device = "", func = @_func, num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00", "\08\01\1A\01\01\22\01\00", "\08\01\1A\01\01\22\01\00", "\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor<*xi32>, tensor<8xi32>, tensor<*xi32>, tensor<8xi32>) -> tensor<8xi32> + %0 = "tf_device.cluster_func"(%arg0, %arg1, %arg2, %arg3) {_tpu_replicate = "cluster0", func = @_func, num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00", "\08\01\1A\01\01\22\01\00", "\08\01\1A\01\01\22\01\00", "\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor<*xi32>, tensor<8xi32>, tensor<*xi32>, tensor<8xi32>) -> tensor<8xi32> // CHECK: "tf._TPUCompileMlir"(%[[ARG_0_SHAPE]], %[[ARG_2_SHAPE]]) return %0: tensor<8xi32> @@ -706,16 +706,16 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // ----- -// Tests simple case of `tf_device.launch_func` on TPU with single input and +// Tests simple case of `tf_device.cluster_func` on TPU with single input and // single output. 
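// As a minimal sketch (types, operand lists and attributes abbreviated; only
// the structure is meant to match the CHECK lines below), the single-core,
// non-replicated rewrite is expected to look roughly like:
//
//   %shape = "tf.Shape"(%input) ...
//   %compile:2 = "tf_device.launch"() ( {
//     %c:2 = "tf._TPUCompileMlir"(%shape) {metadata = "...", mlir_module = "..."} ...
//     tf_device.return %c#0, %c#1 ...
//   }) {device = "/job:worker/replica:0/task:0/device:CPU:0"}
//   "tf_device.launch"() ( {
//     "tf.TPUCompileSucceededAssert"(%compile#0) ...
//     tf_device.return
//   }) {device = "/job:worker/replica:0/task:0/device:CPU:0"}
//   %execute = "tf_device.launch"() ( {
//     %e = "tf.TPUExecute"(%input, %compile#1) ...
//     tf_device.return %e ...
//   }) {device = "/job:worker/replica:0/task:0/device:TPU:0"}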
module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { - // CHECK-LABEL: func @single_tpu_launch_func - func @single_tpu_launch_func(%arg0: tensor) -> tensor { + // CHECK-LABEL: func @single_tpu_cluster_func + func @single_tpu_cluster_func(%arg0: tensor) -> tensor { %0 = "tf.A"(%arg0) : (tensor) -> tensor // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" - %1 = "tf_device.launch_func"(%0) {_tpu_replicate = "cluster0", device = "", func = @tpu0_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor) -> tensor + %1 = "tf_device.cluster_func"(%0) {_tpu_replicate = "cluster0", func = @tpu0_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor) -> tensor // CHECK: %[[A_SHAPE_OUTPUT:[0-9]*]] = "tf.Shape"(%[[A_OUTPUT]]) // CHECK: %[[COMPILE_OUTPUT:[0-9]*]]:2 = "tf_device.launch" // CHECK-NEXT: "tf._TPUCompileMlir"(%[[A_SHAPE_OUTPUT]]) @@ -747,18 +747,20 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // ----- -// Tests simple case of `tf_device.launch_func` on TPU with replication. +// Tests simple case of `tf_device.cluster_func` on TPU with replication. Under +// data parallelism replicated host devices are also added to the +// tf_device.replicate module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0", "/job:worker/replica:0/task:0/device:TPU:1"]} { - // CHECK-LABEL: func @replicated_tpu_launch_func + // CHECK-LABEL: func @replicated_tpu_cluster_func // CHECK-SAME: (%[[ARG_0:[a-z0-9]*]]: tensor) - func @replicated_tpu_launch_func(%arg0: tensor) -> tensor { + func @replicated_tpu_cluster_func(%arg0: tensor) -> tensor { // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" %0 = "tf.A"(%arg0) : (tensor) -> tensor // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate // CHECK-SAME: ([%[[A_OUTPUT]], %[[ARG_0]]] as %[[RI_0:[a-z0-9]*]]: tensor) - // CHECK-SAME: devices = {TPU_REPLICATED_CORE_0 = ["/job:worker/replica:0/task:0/device:TPU:0", "/job:worker/replica:0/task:0/device:TPU:1"]} + // CHECK-SAME: devices = {TPU_REPLICATED_CORE_0 = ["/job:worker/replica:0/task:0/device:TPU:0", "/job:worker/replica:0/task:0/device:TPU:1"], TPU_REPLICATED_HOST = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:CPU:0"]} // CHECK-SAME: n = 2 %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { // CHECK: %[[A_SHAPE_OUTPUT:[0-9]*]] = "tf.Shape"(%[[RI_0]]) @@ -775,7 +777,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // CHECK: device = "/job:worker/replica:0/task:0/device:CPU:0" // CHECK: %[[EXECUTE_OUTPUT:[0-9]*]] = "tf_device.launch" // CHECK-NEXT: "tf.TPUExecute"(%[[RI_0]], %[[COMPILE_OUTPUT]]#1) - %2 = "tf_device.launch_func"(%ri_0) {_tpu_replicate = "cluster0", device = "", func = @tpu0_func, 
num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor) -> tensor + %2 = "tf_device.cluster_func"(%ri_0) {_tpu_replicate = "cluster0", func = @tpu0_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor) -> tensor // CHECK: tf_device.return %[[EXECUTE_OUTPUT]] tf_device.return %2 : tensor @@ -796,15 +798,15 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // ----- -// Tests that launch_func without _tpu_replicate attribute is ignored. +// Tests that cluster_func without _tpu_replicate attribute is ignored. module attributes {tf.versions = {producer = 888 : i32}} { - // CHECK-LABEL: func @single_gpu_launch_func - func @single_gpu_launch_func(%arg0: tensor) -> tensor { + // CHECK-LABEL: func @single_gpu_cluster_func + func @single_gpu_cluster_func(%arg0: tensor) -> tensor { %0 = "tf.A"(%arg0) : (tensor) -> tensor - %1 = "tf_device.launch_func"(%0) {device = "gpu0", func = @gpu0_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor) -> tensor - // CHECK: tf_device.launch_func + %1 = "tf_device.cluster_func"(%0) {device = "gpu0", func = @gpu0_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor) -> tensor + // CHECK: tf_device.cluster_func // CHECK-SAME: device = "gpu0" // CHECK-SAME: func = @gpu0_func // CHECK-SAME: num_cores_per_replica = 1 @@ -823,7 +825,7 @@ module attributes {tf.versions = {producer = 888 : i32}} { // ----- -// Tests of `tf_device.launch_func` on TPU with nested function calls. +// Tests of `tf_device.cluster_func` on TPU with nested function calls. 
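// Functions reachable from the `func` attribute (here through a standard call)
// are expected to be serialized, together with the entry function, into the
// `mlir_module` handed to "tf._TPUCompileMlir"; only the cluster_func itself is
// rewritten. An illustrative shape of the callee chain assumed by this test
// (body and names are a sketch, not copied from the test):
//
//   func @tpu0_func(%arg0: tensor<?xi32>) -> tensor<?xi32> {
//     %0 = call @nested_func(%arg0) : (tensor<?xi32>) -> tensor<?xi32>
//     return %0 : tensor<?xi32>
//   }
//   func @nested_func(%arg0: tensor<?xi32>) -> tensor<?xi32> {
//     %0 = "tf.B"(%arg0) : (tensor<?xi32>) -> tensor<?xi32>
//     return %0 : tensor<?xi32>
//   }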
module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { // CHECK-LABEL: func @with_nested_func @@ -831,7 +833,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor %0 = "tf.A"(%arg0) : (tensor) -> tensor // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" - %1 = "tf_device.launch_func"(%0) {_tpu_replicate = "cluster0", device = "", func = @tpu0_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor) -> tensor + %1 = "tf_device.cluster_func"(%0) {_tpu_replicate = "cluster0", func = @tpu0_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor) -> tensor // CHECK: %[[A_SHAPE_OUTPUT:[0-9]*]] = "tf.Shape"(%[[A_OUTPUT]]) // CHECK: %[[COMPILE_OUTPUT:[0-9]*]]:2 = "tf_device.launch" // CHECK-NEXT: "tf._TPUCompileMlir"(%[[A_SHAPE_OUTPUT]]) @@ -871,7 +873,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // ----- -// Tests of `tf_device.launch_func` on TPU with referenced function that's not +// Tests of `tf_device.cluster_func` on TPU with referenced function that's not // via a standard call op. module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { @@ -880,7 +882,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor %0 = "tf.A"(%arg0) : (tensor) -> tensor // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" - %1 = "tf_device.launch_func"(%0) {_tpu_replicate = "cluster0", device = "", func = @tpu0_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor) -> tensor + %1 = "tf_device.cluster_func"(%0) {_tpu_replicate = "cluster0", func = @tpu0_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor) -> tensor // CHECK: %[[A_SHAPE_OUTPUT:[0-9]*]] = "tf.Shape"(%[[A_OUTPUT]]) // CHECK: %[[COMPILE_OUTPUT:[0-9]*]]:2 = "tf_device.launch" // CHECK-NEXT: "tf._TPUCompileMlir"(%[[A_SHAPE_OUTPUT]]) @@ -916,7 +918,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // ----- -// Tests rewriting `tf_device.launch_func` on TPU with a chain of referenced +// Tests rewriting `tf_device.cluster_func` on TPU with a chain of referenced // functions. 
module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { @@ -925,7 +927,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor %0 = "tf.A"(%arg0) : (tensor) -> tensor // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" - %1 = "tf_device.launch_func"(%0) {_tpu_replicate = "cluster0", device = "", func = @tpu0_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor) -> tensor + %1 = "tf_device.cluster_func"(%0) {_tpu_replicate = "cluster0", func = @tpu0_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor) -> tensor // CHECK: %[[A_SHAPE_OUTPUT:[0-9]*]] = "tf.Shape"(%[[A_OUTPUT]]) // CHECK: %[[COMPILE_OUTPUT:[0-9]*]]:2 = "tf_device.launch" // CHECK-NEXT: "tf._TPUCompileMlir"(%[[A_SHAPE_OUTPUT]]) @@ -969,7 +971,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // ----- -// Tests rewriting `tf_device.launch_func` on TPU with multiple calls to same +// Tests rewriting `tf_device.cluster_func` on TPU with multiple calls to same // function. module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { @@ -978,7 +980,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor %0 = "tf.A"(%arg0) : (tensor) -> tensor // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" - %1 = "tf_device.launch_func"(%0) {_tpu_replicate = "cluster0", device = "", func = @tpu0_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor) -> tensor + %1 = "tf_device.cluster_func"(%0) {_tpu_replicate = "cluster0", func = @tpu0_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor) -> tensor // CHECK: %[[A_SHAPE_OUTPUT:[0-9]*]] = "tf.Shape"(%[[A_OUTPUT]]) // CHECK: %[[COMPILE_OUTPUT:[0-9]*]]:2 = "tf_device.launch" // CHECK-NEXT: "tf._TPUCompileMlir"(%[[A_SHAPE_OUTPUT]]) @@ -1017,15 +1019,15 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // ----- -// Tests multiple `tf_device.launch_func` on TPU with different computation. +// Tests multiple `tf_device.cluster_func` on TPU with different computation. 
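// Each cluster is expected to be compiled and executed independently, with the
// second cluster consuming the first cluster's execute result. Schematically
// (a sketch of the structure the CHECK lines below look for, not verbatim
// output):
//
//   %compile0:2 = "tf_device.launch"() ({ "tf._TPUCompileMlir"(%a_shape) ... })
//   %execute0   = "tf_device.launch"() ({ "tf.TPUExecute"(%a, %compile0#1) ... })
//   %compile1:2 = "tf_device.launch"() ({ "tf._TPUCompileMlir"(%execute0_shape) ... })
//   %execute1   = "tf_device.launch"() ({ "tf.TPUExecute"(%execute0, %compile1#1) ... })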
module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { - // CHECK-LABEL: func @multiple_launch_different_func - func @multiple_launch_different_func(%arg0: tensor) -> tensor { + // CHECK-LABEL: func @multiple_cluster_different_func + func @multiple_cluster_different_func(%arg0: tensor) -> tensor { %0 = "tf.A"(%arg0) : (tensor) -> tensor // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" - %1 = "tf_device.launch_func"(%0) {_tpu_replicate = "cluster0", device = "", func = @tpu0_func0, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor) -> tensor + %1 = "tf_device.cluster_func"(%0) {_tpu_replicate = "cluster0", func = @tpu0_func0, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor) -> tensor // CHECK: %[[A_SHAPE_OUTPUT:[0-9]*]] = "tf.Shape"(%[[A_OUTPUT]]) // CHECK: %[[COMPILE0_OUTPUT:[0-9]*]]:2 = "tf_device.launch" // CHECK-NEXT: "tf._TPUCompileMlir"(%[[A_SHAPE_OUTPUT]]) @@ -1039,7 +1041,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // CHECK: %[[EXECUTE0_OUTPUT:[0-9]*]] = "tf_device.launch" // CHECK-NEXT: "tf.TPUExecute"(%[[A_OUTPUT]], %[[COMPILE0_OUTPUT]]#1) - %2 = "tf_device.launch_func"(%1) {_tpu_replicate = "cluster1", device = "", func = @tpu0_func1, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor) -> tensor + %2 = "tf_device.cluster_func"(%1) {_tpu_replicate = "cluster1", func = @tpu0_func1, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor) -> tensor // CHECK: %[[EXECUTE0_SHAPE_OUTPUT:[0-9]*]] = "tf.Shape"(%[[EXECUTE0_OUTPUT]]) // CHECK: %[[COMPILE1_OUTPUT:[0-9]*]]:2 = "tf_device.launch" // CHECK-NEXT: "tf._TPUCompileMlir"(%[[EXECUTE0_SHAPE_OUTPUT]]) @@ -1073,15 +1075,15 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // ----- -// Tests multiple `tf_device.launch_func` on TPU with same computation. +// Tests multiple `tf_device.cluster_func` on TPU with same computation. 
module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { - // CHECK-LABEL: func @multiple_launch_same_func - func @multiple_launch_same_func(%arg0: tensor) -> tensor { + // CHECK-LABEL: func @multiple_cluster_same_func + func @multiple_cluster_same_func(%arg0: tensor) -> tensor { %0 = "tf.A"(%arg0) : (tensor) -> tensor // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" - %1 = "tf_device.launch_func"(%0) {_tpu_replicate = "cluster0", device = "", func = @tpu0_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor) -> tensor + %1 = "tf_device.cluster_func"(%0) {_tpu_replicate = "cluster0", func = @tpu0_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor) -> tensor // CHECK: %[[A_SHAPE_OUTPUT:[0-9]*]] = "tf.Shape"(%[[A_OUTPUT]]) // CHECK: %[[COMPILE0_OUTPUT:[0-9]*]]:2 = "tf_device.launch" // CHECK-NEXT: "tf._TPUCompileMlir"(%[[A_SHAPE_OUTPUT]]) @@ -1095,7 +1097,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // CHECK: %[[EXECUTE0_OUTPUT:[0-9]*]] = "tf_device.launch" // CHECK-NEXT: "tf.TPUExecute"(%[[A_OUTPUT]], %[[COMPILE0_OUTPUT]]#1) - %2 = "tf_device.launch_func"(%1) {_tpu_replicate = "cluster1", device = "", func = @tpu0_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor) -> tensor + %2 = "tf_device.cluster_func"(%1) {_tpu_replicate = "cluster1", func = @tpu0_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor) -> tensor // CHECK: %[[EXECUTE0_SHAPE_OUTPUT:[0-9]*]] = "tf.Shape"(%[[EXECUTE0_OUTPUT]]) // CHECK: %[[COMPILE1_OUTPUT:[0-9]*]]:2 = "tf_device.launch" // CHECK-NEXT: "tf._TPUCompileMlir"(%[[EXECUTE0_SHAPE_OUTPUT]]) @@ -1128,12 +1130,12 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // ArrayAttr and DictionaryAttr. 
module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { - // CHECK-LABEL: func @single_tpu_launch_func - func @single_tpu_launch_func(%arg0: tensor) -> tensor { + // CHECK-LABEL: func @single_tpu_cluster_func + func @single_tpu_cluster_func(%arg0: tensor) -> tensor { %0 = "tf.A"(%arg0) : (tensor) -> tensor // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" - %1 = "tf_device.launch_func"(%0) {_tpu_replicate = "cluster0", device = "", func = @tpu0_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor) -> tensor + %1 = "tf_device.cluster_func"(%0) {_tpu_replicate = "cluster0", func = @tpu0_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor) -> tensor // CHECK: %[[A_SHAPE_OUTPUT:[0-9]*]] = "tf.Shape"(%[[A_OUTPUT]]) // CHECK: %[[COMPILE_OUTPUT:[0-9]*]]:2 = "tf_device.launch" // CHECK-NEXT: "tf._TPUCompileMlir"(%[[A_SHAPE_OUTPUT]]) @@ -1203,7 +1205,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // CHECK-NEXT: "tf.TPUCompileSucceededAssert" // CHECK: %[[EXECUTE_OUTPUT:[0-9]*]] = "tf_device.launch" // CHECK-NEXT: "tf.TPUExecute" - %1 = "tf_device.launch_func"(%arg0) {_tpu_replicate = "cluster0", device = "", func = @tpu0_func, num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor) -> tensor + %1 = "tf_device.cluster_func"(%arg0) {_tpu_replicate = "cluster0", func = @tpu0_func, num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor) -> tensor %compile_result = "tf.TPUCompilationResult"() {_tpu_replicate = "cluster0"} : () -> tensor %compile_result2 = "tf.TPUCompilationResult"() {_tpu_replicate = "cluster0"} : () -> tensor @@ -1222,6 +1224,41 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // ----- +// Tests simple case of `tf_device.cluster_func` on TPU with replication and +// parallel_execute. 
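// Judging from the CHECK lines below, when the cluster_func already sits in a
// region of a tf_device.parallel_execute, the compile and the compile-succeeded
// assert are emitted before the parallel_execute, while the TPUExecute replaces
// the cluster_func inside its region. Roughly (a sketch, host device and types
// abbreviated):
//
//   %compile:2 = "tf_device.launch"() ({ "tf._TPUCompileMlir"() ... })
//   "tf_device.launch"() ({ "tf.TPUCompileSucceededAssert"(%compile#0) ... })
//   "tf_device.parallel_execute"() ({
//     "tf.D"() : () -> ()
//     tf_device.return
//   }, {
//     %e = "tf_device.launch"() ({ "tf.TPUExecute"(%ri, %compile#1) ... })
//     tf_device.return %e ...
//   })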
+ +module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0", "/job:worker/replica:0/task:0/device:TPU:1"]} { + // CHECK-LABEL: func @replicated_parallel_tpu_cluster_func + func @replicated_parallel_tpu_cluster_func(%arg0: tensor) -> tensor { + // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" + %0 = "tf.A"(%arg0) : (tensor) -> tensor + // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate + %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { + // CHECK: "tf._TPUCompileMlir" + // CHECK: "tf.TPUCompileSucceededAssert" + // CHECK: "tf_device.parallel_execute" + // CHECK: "tf.TPUExecute" + %3 = "tf_device.parallel_execute"() ( { + "tf.D"() : () -> () + tf_device.return + }, { + %4 = "tf_device.cluster_func"(%ri_0) {_tpu_replicate = "cluster0", func = @tpu0_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor) -> tensor + tf_device.return %4 : tensor + }) : () -> (tensor) + tf_device.return %3 : tensor + } + %2 = "tf.C"(%1#1) : (tensor) -> tensor + return %2 : tensor + } + + func @tpu0_func(%arg0: tensor) -> tensor { + %0 = "tf.B"(%arg0) : (tensor) -> tensor + return %0 : tensor + } +} + +// ----- + // Tests devices are set properly for non replicated model parallelism. module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:localhost/replica:0/task:0/device:CPU:0", "/job:localhost/replica:0/task:0/device:TPU:0", "/job:localhost/replica:0/task:0/device:TPU:1", "/job:localhost/replica:0/task:0/device:TPU_SYSTEM:0"]} { @@ -1244,7 +1281,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:loc // CHECK-NEXT: "tf.TPUExecute" // CHECK-NEXT: tf_device.return // CHECK-NEXT: device = "/job:localhost/replica:0/task:0/device:TPU:1" - %0 = "tf_device.launch_func"(%arg0) {_tpu_replicate = "cluster0", device = "", func = @tpu0_func, num_cores_per_replica = 2, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"], topology = "\0A\04\01\01\01\02\10\01\18\02\22\08\00\00\00\00\00\00\00\01", device_assignment = [0, 0, 0, 0, 0, 0, 0, 1], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor<8xi32>) -> tensor<8xi32> + %0 = "tf_device.cluster_func"(%arg0) {_tpu_replicate = "cluster0", func = @tpu0_func, num_cores_per_replica = 2, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"], topology = "\0A\04\01\01\01\02\10\01\18\02\22\08\00\00\00\00\00\00\00\01", device_assignment = [0, 0, 0, 0, 0, 0, 0, 1], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor<8xi32>) -> tensor<8xi32> return %0 : tensor<8xi32> } func @tpu0_func(%arg0: tensor<8xi32>) -> tensor<8xi32> { @@ -1282,15 +1319,14 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:loc // "\0A\04\01\02\01\02\10\02\18\02\22\10\00\00\00\00\00\00\00\01\00\01\00\00\00\01\00\01" // ----- -// Tests devices are set properly for replicated model parallelism. +// Tests devices are set properly for replicated model parallelism. 
No +// replicated host device should be present. module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:localhost/replica:0/task:0/device:CPU:0", "/job:localhost/replica:0/task:0/device:TPU:0", "/job:localhost/replica:0/task:0/device:TPU:1", "/job:localhost/replica:0/task:0/device:TPU_SYSTEM:0", "/job:localhost/replica:0/task:1/device:CPU:0", "/job:localhost/replica:0/task:1/device:TPU:0", "/job:localhost/replica:0/task:1/device:TPU:1", "/job:localhost/replica:0/task:1/device:TPU_SYSTEM:0"]} { // CHECK-LABEL: func @replicated_parallel_execute func @replicated_parallel_execute(%arg0: tensor<8xi32>, %arg1: tensor<8xi32>) -> (tensor<8xi32>, tensor<8xi32>) { // CHECK: tf_device.replicate - // CHECK-SAME: devices = - // CHECK-SAME: TPU_REPLICATED_CORE_0 = ["/job:localhost/replica:0/task:0/device:TPU:0", "/job:localhost/replica:0/task:1/device:TPU:1"] - // CHECK-SAME: TPU_REPLICATED_CORE_1 = ["/job:localhost/replica:0/task:0/device:TPU:1", "/job:localhost/replica:0/task:1/device:TPU:0"] + // CHECK-SAME: devices = {TPU_REPLICATED_CORE_0 = ["/job:localhost/replica:0/task:0/device:TPU:0", "/job:localhost/replica:0/task:1/device:TPU:1"], TPU_REPLICATED_CORE_1 = ["/job:localhost/replica:0/task:0/device:TPU:1", "/job:localhost/replica:0/task:1/device:TPU:0"]} %0:2 = tf_device.replicate([%arg0, %arg1] as %ri: tensor<8xi32>) {n = 2 : i32} { // CHECK-NEXT: %[[COMPILE:[a-z0-9]+]]:3 = "tf_device.launch" // CHECK-NEXT: "tf._TPUCompileMlir"() @@ -1309,7 +1345,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:loc // CHECK-NEXT: "tf.TPUExecute" // CHECK-NEXT: tf_device.return // CHECK-NEXT: device = "TPU_REPLICATED_CORE_1" - %1 = "tf_device.launch_func"(%ri) {_tpu_replicate = "cluster0", device = "", func = @tpu0_func, num_cores_per_replica = 2, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"], topology = "\0A\04\01\02\01\02\10\02\18\02\22\10\00\00\00\00\00\00\00\01\00\01\00\00\00\01\00\01", device_assignment = [0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor<8xi32>) -> tensor<8xi32> + %1 = "tf_device.cluster_func"(%ri) {_tpu_replicate = "cluster0", func = @tpu0_func, num_cores_per_replica = 2, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"], topology = "\0A\04\01\02\01\02\10\02\18\02\22\10\00\00\00\00\00\00\00\01\00\01\00\00\00\01\00\01", device_assignment = [0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor<8xi32>) -> tensor<8xi32> tf_device.return %1 : tensor<8xi32> } return %0#0, %0#1 : tensor<8xi32>, tensor<8xi32> @@ -1322,8 +1358,8 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:loc // ----- -// Tests that inputs are inputs with maximal and replicate sharding are set properly -// for replicated model parallelism. +// Tests that inputs are inputs with maximal and replicate sharding are set +// properly for replicated model parallelism. 
module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:localhost/replica:0/task:0/device:CPU:0", "/job:localhost/replica:0/task:0/device:TPU:0", "/job:localhost/replica:0/task:0/device:TPU:1", "/job:localhost/replica:0/task:0/device:TPU_SYSTEM:0", "/job:localhost/replica:0/task:1/device:CPU:0", "/job:localhost/replica:0/task:1/device:TPU:0", "/job:localhost/replica:0/task:1/device:TPU:1", "/job:localhost/replica:0/task:1/device:TPU_SYSTEM:0"]} { // CHECK-LABEL: func @parallel_execute_with_input_with_sharding_configurations @@ -1344,7 +1380,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:loc // CHECK: "tf_device.launch" // CHECK-NEXT: "tf.TPUExecute"(%[[RI_1]], %[[RI_2]], %[[COMPILE]]#2) // CHECK: device = "TPU_REPLICATED_CORE_1" - %1 = "tf_device.launch_func"(%ri, %ri2, %ri3) {_tpu_replicate = "cluster0", device = "", func = @tpu0_func, num_cores_per_replica = 2, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = [""], topology = "\0A\04\01\02\01\02\10\02\18\02\22\10\00\00\00\00\00\00\00\01\00\01\00\00\00\01\00\01", device_assignment = [0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00", "", ""], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor<8xi32>, tensor<*xi1>, tensor<*xi32>) -> tensor<8xi32> + %1 = "tf_device.cluster_func"(%ri, %ri2, %ri3) {_tpu_replicate = "cluster0", func = @tpu0_func, num_cores_per_replica = 2, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = [""], topology = "\0A\04\01\02\01\02\10\02\18\02\22\10\00\00\00\00\00\00\00\01\00\01\00\00\00\01\00\01", device_assignment = [0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00", "", ""], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor<8xi32>, tensor<*xi1>, tensor<*xi32>) -> tensor<8xi32> tf_device.return %1 : tensor<8xi32> } return %0#0, %0#1 : tensor<8xi32>, tensor<8xi32> @@ -1357,8 +1393,8 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:loc // ----- -// Tests devices are set properly for replicated model parallelism with -// outputs to TPU computation placed on logical device 0. +// Tests devices are set properly for replicated model parallelism with outputs +// to TPU computation placed on logical device 0. 
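// With num_cores_per_replica = 2 the pass emits one TPUExecute per logical core
// inside a tf_device.parallel_execute, and an output whose sharding is maximal
// on core 0 is returned only from the core-0 region; the core-1 region still
// runs the program but yields nothing for that result. A rough sketch of the
// structure the CHECK lines below look for:
//
//   %pe = "tf_device.parallel_execute"() ({
//     %e0 = "tf_device.launch"() ({ "tf.TPUExecute"(...) ... }) {device = "TPU_REPLICATED_CORE_0"}
//     tf_device.return %e0 ...
//   }, {
//     "tf_device.launch"() ({ "tf.TPUExecute"(...) ... }) {device = "TPU_REPLICATED_CORE_1"}
//     tf_device.return
//   })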
module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:localhost/replica:0/task:0/device:CPU:0", "/job:localhost/replica:0/task:0/device:TPU:0", "/job:localhost/replica:0/task:0/device:TPU:1", "/job:localhost/replica:0/task:0/device:TPU_SYSTEM:0", "/job:localhost/replica:0/task:1/device:CPU:0", "/job:localhost/replica:0/task:1/device:TPU:0", "/job:localhost/replica:0/task:1/device:TPU:1", "/job:localhost/replica:0/task:1/device:TPU_SYSTEM:0"]} { // CHECK-LABEL: func @parallel_execute_with_different_outputs @@ -1382,7 +1418,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:loc // CHECK: "tf_device.launch" // CHECK-NEXT: "tf.TPUExecute" // CHECK: device = "TPU_REPLICATED_CORE_1" - %1 = "tf_device.launch_func"(%ri) {_tpu_replicate = "cluster0", device = "", func = @tpu0_func, num_cores_per_replica = 2, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"], topology = "\0A\04\01\02\01\02\10\02\18\02\22\10\00\00\00\00\00\00\00\01\00\01\00\00\00\01\00\01", device_assignment = [0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor<8xi32>) -> tensor<8xi32> + %1 = "tf_device.cluster_func"(%ri) {_tpu_replicate = "cluster0", func = @tpu0_func, num_cores_per_replica = 2, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"], topology = "\0A\04\01\02\01\02\10\02\18\02\22\10\00\00\00\00\00\00\00\01\00\01\00\00\00\01\00\01", device_assignment = [0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor<8xi32>) -> tensor<8xi32> tf_device.return %1 : tensor<8xi32> } return %0#0, %0#1 : tensor<8xi32>, tensor<8xi32> @@ -1420,7 +1456,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:loc // CHECK-NEXT: %[[EXECUTE_1_OUTPUT:[0-9]*]] = "tf.TPUExecute" // CHECK-NEXT: tf_device.return %[[EXECUTE_1_OUTPUT]] // CHECK: device = "TPU_REPLICATED_CORE_1" - %1, %2 = "tf_device.launch_func"(%ri) {_tpu_replicate = "cluster0", device = "", func = @tpu0_func, num_cores_per_replica = 2, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"], topology = "\0A\04\01\02\01\02\10\02\18\02\22\10\00\00\00\00\00\00\00\01\00\01\00\00\00\01\00\01", device_assignment = [0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00", ""]} : (tensor<8xi32>) -> (tensor<*xi32>, tensor<*xi1>) + %1, %2 = "tf_device.cluster_func"(%ri) {_tpu_replicate = "cluster0", func = @tpu0_func, num_cores_per_replica = 2, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"], topology = "\0A\04\01\02\01\02\10\02\18\02\22\10\00\00\00\00\00\00\00\01\00\01\00\00\00\01\00\01", device_assignment = [0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00", ""]} : (tensor<8xi32>) -> (tensor<*xi32>, tensor<*xi1>) tf_device.return %1, %2 : tensor<*xi32>, tensor<*xi1> } return %0#0, %1#0 : tensor<*xi32>, tensor<*xi1> @@ -1434,8 +1470,8 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:loc // ----- -// Tests inputs are 
correctly split and fed into TPU computation for -// tiled input sharding. +// Tests inputs are correctly split and fed into TPU computation for tiled input +// sharding. // The following OpSharding is used for TPU computation inputs in below test: // Proto debug string: @@ -1487,7 +1523,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:loc // CHECK-NEXT: %[[EXECUTE_1_OUTPUT:[0-9]*]] = "tf.TPUExecute"(%[[SPLIT_OUT]]#1, %[[RI_1]], %[[COMPILE]]#2) // CHECK-NEXT: tf_device.return %[[EXECUTE_1_OUTPUT]] // CHECK: device = "TPU_REPLICATED_CORE_1" - %1, %2 = "tf_device.launch_func"(%ri_1, %ri_2) {_tpu_replicate = "cluster0", device = "", func = @tpu0_func, num_cores_per_replica = 2, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"], topology = "\0A\04\01\02\01\02\10\02\18\02\22\10\00\00\00\00\00\00\00\01\00\01\00\00\00\01\00\01", device_assignment = [0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0], input_sharding_configuration = ["\08\03\1A\02\01\02\22\02\00\01", "\08\01\1A\01\01\22\01\01"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00", ""]} : (tensor<128x10xf32>, tensor<*xi32>) -> (tensor<*xi32>, tensor<*xi1>) + %1, %2 = "tf_device.cluster_func"(%ri_1, %ri_2) {_tpu_replicate = "cluster0", func = @tpu0_func, num_cores_per_replica = 2, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"], topology = "\0A\04\01\02\01\02\10\02\18\02\22\10\00\00\00\00\00\00\00\01\00\01\00\00\00\01\00\01", device_assignment = [0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0], input_sharding_configuration = ["\08\03\1A\02\01\02\22\02\00\01", "\08\01\1A\01\01\22\01\01"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00", ""]} : (tensor<128x10xf32>, tensor<*xi32>) -> (tensor<*xi32>, tensor<*xi1>) tf_device.return %1, %2 : tensor<*xi32>, tensor<*xi1> } return %0#0, %1#0 : tensor<*xi32>, tensor<*xi1> @@ -1555,7 +1591,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:loc // CHECK: %[[CONST_CONCAT_DIM:[0-9]*]] = "tf.Const"() // CHECK: %[[CONCAT_OUTPUT:[0-9]*]] = "tf.Concat"(%[[CONST_CONCAT_DIM]], %[[PARALLEL_EXECUTE_OUTPUT]]#0, %[[PARALLEL_EXECUTE_OUTPUT]]#2 - %1, %2 = "tf_device.launch_func"(%ri_1, %ri_2) {_tpu_replicate = "cluster0", device = "", func = @tpu0_func, num_cores_per_replica = 2, step_marker_location = "", padding_map = [""], topology = "\0A\04\01\02\01\02\10\02\18\02\22\10\00\00\00\00\00\00\00\01\00\01\00\00\00\01\00\01", device_assignment = [0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0], input_sharding_configuration = ["\08\03\1A\02\01\02\22\02\00\01", "\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\03\1A\02\01\02\22\02\00\01", "\08\01\1A\01\01\22\01\00"]} : (tensor<128x10xf32>, tensor<*xi32>) -> (tensor<*xi32>, tensor<*xi1>) + %1, %2 = "tf_device.cluster_func"(%ri_1, %ri_2) {_tpu_replicate = "cluster0", func = @tpu0_func, num_cores_per_replica = 2, step_marker_location = "", padding_map = [""], topology = "\0A\04\01\02\01\02\10\02\18\02\22\10\00\00\00\00\00\00\00\01\00\01\00\00\00\01\00\01", device_assignment = [0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0], input_sharding_configuration = ["\08\03\1A\02\01\02\22\02\00\01", "\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\03\1A\02\01\02\22\02\00\01", "\08\01\1A\01\01\22\01\00"]} : (tensor<128x10xf32>, tensor<*xi32>) -> (tensor<*xi32>, tensor<*xi1>) tf_device.return %1, %2 : tensor<*xi32>, tensor<*xi1> } return %0#0, %1#0 : tensor<*xi32>, 
tensor<*xi1> @@ -1598,7 +1634,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:loc func @uneven_input_sharding_disallowed(%arg0: tensor<128x10xf32>, %arg1: tensor<128x10xf32>, %arg2: tensor<*xi32>, %arg3: tensor<*xi32>) -> (tensor<*xi32>, tensor<*xi1>) { %0:2, %1:2 = tf_device.replicate([%arg0, %arg1] as %ri_1: tensor<128x10xf32>, [%arg2, %arg3] as %ri_2: tensor<*xi32>) {n = 2 : i32} { // expected-error@+1 {{incorrect input sharding configuration received. 1-th dimension of the input must be evenly divisible by 4}} - %1, %2 = "tf_device.launch_func"(%ri_1, %ri_2) {_tpu_replicate = "cluster0", device = "", func = @tpu0_func, num_cores_per_replica = 2, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = [""], topology = "\0A\04\01\02\01\02\10\02\18\02\22\10\00\00\00\00\00\00\00\01\00\01\00\00\00\01\00\01", device_assignment = [0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0], input_sharding_configuration = ["\08\03\12\12\10\0b\1a\02\01\04\2a\06\0a\02\01\00\20\01\32\02\00\00\1a\02\01\04\22\04\00\01\02\03", "\08\01\1A\01\01\22\01\01"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00", ""]} : (tensor<128x10xf32>, tensor<*xi32>) -> (tensor<*xi32>, tensor<*xi1>) + %1, %2 = "tf_device.cluster_func"(%ri_1, %ri_2) {_tpu_replicate = "cluster0", func = @tpu0_func, num_cores_per_replica = 2, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = [""], topology = "\0A\04\01\02\01\02\10\02\18\02\22\10\00\00\00\00\00\00\00\01\00\01\00\00\00\01\00\01", device_assignment = [0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0], input_sharding_configuration = ["\08\03\12\12\10\0b\1a\02\01\04\2a\06\0a\02\01\00\20\01\32\02\00\00\1a\02\01\04\22\04\00\01\02\03", "\08\01\1A\01\01\22\01\01"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00", ""]} : (tensor<128x10xf32>, tensor<*xi32>) -> (tensor<*xi32>, tensor<*xi1>) tf_device.return %1, %2 : tensor<*xi32>, tensor<*xi1> } return %0#0, %1#0 : tensor<*xi32>, tensor<*xi1> @@ -1638,7 +1674,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:loc func @uneven_output_sharding_disallowed(%arg0: tensor<128x10xf32>, %arg1: tensor<128x10xf32>, %arg2: tensor<*xi32>, %arg3: tensor<*xi32>) -> (tensor<*xi32>, tensor<*xi1>) { %0:2, %1:2 = tf_device.replicate([%arg0, %arg1] as %ri_1: tensor<128x10xf32>, [%arg2, %arg3] as %ri_2: tensor<*xi32>) {n = 2 : i32} { // expected-error@+1 {{incorrect sharding format for outputs. 
Number of tiled outputs(4) must match the number of logical devices(2)}} - %1, %2 = "tf_device.launch_func"(%ri_1, %ri_2) {_tpu_replicate = "cluster0", device = "", func = @tpu0_func, num_cores_per_replica = 2, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = [""], topology = "\0A\04\01\02\01\02\10\02\18\02\22\10\00\00\00\00\00\00\00\01\00\01\00\00\00\01\00\01", device_assignment = [0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0], input_sharding_configuration = ["", ""], output_sharding_configuration = ["\08\03\12\12\10\0b\1a\02\01\04\2a\06\0a\02\01\00\20\01\32\02\00\00\1a\02\01\04\22\04\00\01\02\03", ""]} : (tensor<128x10xf32>, tensor<*xi32>) -> (tensor<*xi32>, tensor<*xi1>) + %1, %2 = "tf_device.cluster_func"(%ri_1, %ri_2) {_tpu_replicate = "cluster0", func = @tpu0_func, num_cores_per_replica = 2, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = [""], topology = "\0A\04\01\02\01\02\10\02\18\02\22\10\00\00\00\00\00\00\00\01\00\01\00\00\00\01\00\01", device_assignment = [0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0], input_sharding_configuration = ["", ""], output_sharding_configuration = ["\08\03\12\12\10\0b\1a\02\01\04\2a\06\0a\02\01\00\20\01\32\02\00\00\1a\02\01\04\22\04\00\01\02\03", ""]} : (tensor<128x10xf32>, tensor<*xi32>) -> (tensor<*xi32>, tensor<*xi1>) tf_device.return %1, %2 : tensor<*xi32>, tensor<*xi1> } return %0#0, %1#0 : tensor<*xi32>, tensor<*xi1> @@ -1744,7 +1780,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:loc // CHECK: %[[LAUNCH_3_OUTPUT:[0-9]*]] = "tf_device.launch" // CHECK-NEXT: %[[EXECUTE_3_OUTPUT:[0-9]*]] = "tf.TPUExecute"(%[[SPLIT_2_OUT]]#1, %[[COMPILE]]#4) // CHECK: tf_device.return %[[EXECUTE_3_OUTPUT]] - %1, %2 = "tf_device.launch_func"(%ri_1, %ri_2) {_tpu_replicate = "cluster0", device = "", func = @tpu0_func, num_cores_per_replica = 4, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"], topology = "\0A\04\02\02\01\02\10\01\18\08\22 \00\00\00\00\00\00\00\01\01\00\00\00\01\00\00\01\00\01\00\00\00\01\00\01\01\01\00\00\01\01\00\01", device_assignment = [0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1], input_sharding_configuration = ["\08\03\12\12\10\0b\1a\02\02\02\2a\06\0a\02\01\00\20\01\32\02\00\00\1a\02\02\02\22\04\00\01\02\03", "\08\01\1A\01\01\22\01\01"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00", ""]} : (tensor<128x10xf32>, tensor<*xi32>) -> (tensor<*xi32>, tensor<*xi1>) + %1, %2 = "tf_device.cluster_func"(%ri_1, %ri_2) {_tpu_replicate = "cluster0", func = @tpu0_func, num_cores_per_replica = 4, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"], topology = "\0A\04\02\02\01\02\10\01\18\08\22 \00\00\00\00\00\00\00\01\01\00\00\00\01\00\00\01\00\01\00\00\00\01\00\01\01\01\00\00\01\01\00\01", device_assignment = [0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1], input_sharding_configuration = ["\08\03\12\12\10\0b\1a\02\02\02\2a\06\0a\02\01\00\20\01\32\02\00\00\1a\02\02\02\22\04\00\01\02\03", "\08\01\1A\01\01\22\01\01"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00", ""]} : (tensor<128x10xf32>, tensor<*xi32>) -> (tensor<*xi32>, tensor<*xi1>) tf_device.return %1, %2 : tensor<*xi32>, tensor<*xi1> } return %0#0, %1#0 : tensor<*xi32>, tensor<*xi1> @@ -1851,7 +1887,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:loc // CHECK: 
%[[LAUNCH_3_OUTPUT:[0-9]*]] = "tf_device.launch" // CHECK-NEXT: %[[EXECUTE_3_OUTPUT:[0-9]*]] = "tf.TPUExecute"(%[[SPLIT_2_OUT]]#1, %[[COMPILE]]#4) // CHECK: tf_device.return %[[EXECUTE_3_OUTPUT]] - %1, %2 = "tf_device.launch_func"(%ri_1, %ri_2) {_tpu_replicate = "cluster0", device = "", func = @tpu0_func, num_cores_per_replica = 4, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"], topology = "\0A\04\02\02\01\02\10\01\18\08\22 \00\00\00\00\00\00\00\01\01\00\00\00\01\00\00\01\00\01\00\00\00\01\00\01\01\01\00\00\01\01\00\01", device_assignment = [0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1], input_sharding_configuration = ["\08\03\12\12\10\0b\1a\02\02\02\2a\06\0a\02\01\00\20\01\32\02\00\00\1a\02\02\02\22\04\00\01\02\03", "\08\01\1A\01\01\22\01\01"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00", ""]} : (tensor<128x10xf32>, tensor<*xi32>) -> (tensor<*xi32>, tensor<*xi1>) + %1, %2 = "tf_device.cluster_func"(%ri_1, %ri_2) {_tpu_replicate = "cluster0", func = @tpu0_func, num_cores_per_replica = 4, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"], topology = "\0A\04\02\02\01\02\10\01\18\08\22 \00\00\00\00\00\00\00\01\01\00\00\00\01\00\00\01\00\01\00\00\00\01\00\01\01\01\00\00\01\01\00\01", device_assignment = [0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1], input_sharding_configuration = ["\08\03\12\12\10\0b\1a\02\02\02\2a\06\0a\02\01\00\20\01\32\02\00\00\1a\02\02\02\22\04\00\01\02\03", "\08\01\1A\01\01\22\01\01"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00", ""]} : (tensor<128x10xf32>, tensor<*xi32>) -> (tensor<*xi32>, tensor<*xi1>) tf_device.return %1, %2 : tensor<*xi32>, tensor<*xi1> } return %0#0, %1#0 : tensor<*xi32>, tensor<*xi1> @@ -1935,7 +1971,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:loc // CHECK: %[[CONCAT2_OUTPUT:[0-9]*]] = "tf.Concat"(%[[CONST_CONCAT2_DIM]], %[[PARALLEL_EXECUTE_OUTPUT]]#3, %[[PARALLEL_EXECUTE_OUTPUT]]#4 // CHECK: %[[CONST_CONCAT3_DIM:[0-9]*]] = "tf.Const"() // CHECK: %[[CONCAT3_OUTPUT:[0-9]*]] = "tf.Concat"(%[[CONST_CONCAT3_DIM]], %[[CONCAT_OUTPUT]], %[[CONCAT2_OUTPUT]] - %1, %2 = "tf_device.launch_func"(%ri_1, %ri_2) {_tpu_replicate = "cluster0", device = "", func = @tpu0_func, num_cores_per_replica = 4, step_marker_location = "", padding_map = [""], topology = "\0A\04\02\02\01\02\10\01\18\08\22 \00\00\00\00\00\00\00\01\01\00\00\00\01\00\00\01\00\01\00\00\00\01\00\01\01\01\00\00\01\01\00\01", device_assignment = [0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00", "\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\03\12\12\10\0b\1a\02\02\02\2a\06\0a\02\01\00\20\01\32\02\00\00\1a\02\02\02\22\04\00\01\02\03", "\08\01\1A\01\01\22\01\00"]} : (tensor<128x10xf32>, tensor<*xi32>) -> (tensor<*xi32>, tensor<*xi1>) + %1, %2 = "tf_device.cluster_func"(%ri_1, %ri_2) {_tpu_replicate = "cluster0", func = @tpu0_func, num_cores_per_replica = 4, step_marker_location = "", padding_map = [""], topology = "\0A\04\02\02\01\02\10\01\18\08\22 \00\00\00\00\00\00\00\01\01\00\00\00\01\00\00\01\00\01\00\00\00\01\00\01\01\01\00\00\01\01\00\01", device_assignment = [0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1], input_sharding_configuration = 
["\08\01\1A\01\01\22\01\00", "\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\03\12\12\10\0b\1a\02\02\02\2a\06\0a\02\01\00\20\01\32\02\00\00\1a\02\02\02\22\04\00\01\02\03", "\08\01\1A\01\01\22\01\00"]} : (tensor<128x10xf32>, tensor<*xi32>) -> (tensor<*xi32>, tensor<*xi1>) tf_device.return %1, %2 : tensor<*xi32>, tensor<*xi1> } return %0#0, %1#0 : tensor<*xi32>, tensor<*xi1> @@ -2020,7 +2056,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:loc // CHECK: %[[LAUNCH_3_OUTPUT:[0-9]*]] = "tf_device.launch" // CHECK-NEXT: %[[EXECUTE_3_OUTPUT:[0-9]*]] = "tf.TPUExecute"(%[[SPLIT_1_OUT]]#0, %[[COMPILE]]#4) // CHECK: tf_device.return %[[EXECUTE_3_OUTPUT]] - %1, %2 = "tf_device.launch_func"(%ri_1, %ri_2) {_tpu_replicate = "cluster0", device = "", func = @tpu0_func, num_cores_per_replica = 4, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"], topology = "\0A\04\02\02\01\02\10\01\18\08\22 \00\00\00\00\00\00\00\01\01\00\00\00\01\00\00\01\00\01\00\00\00\01\00\01\01\01\00\00\01\01\00\01", device_assignment = [0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1], input_sharding_configuration = ["\08\03\12\12\10\0b\1a\02\02\02\2a\06\0a\02\01\00\20\01\32\02\00\00\1a\02\02\02\22\04\03\02\01\00", "\08\01\1A\01\01\22\01\01"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00", ""]} : (tensor<128x10xf32>, tensor<*xi32>) -> (tensor<*xi32>, tensor<*xi1>) + %1, %2 = "tf_device.cluster_func"(%ri_1, %ri_2) {_tpu_replicate = "cluster0", func = @tpu0_func, num_cores_per_replica = 4, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"], topology = "\0A\04\02\02\01\02\10\01\18\08\22 \00\00\00\00\00\00\00\01\01\00\00\00\01\00\00\01\00\01\00\00\00\01\00\01\01\01\00\00\01\01\00\01", device_assignment = [0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1], input_sharding_configuration = ["\08\03\12\12\10\0b\1a\02\02\02\2a\06\0a\02\01\00\20\01\32\02\00\00\1a\02\02\02\22\04\03\02\01\00", "\08\01\1A\01\01\22\01\01"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00", ""]} : (tensor<128x10xf32>, tensor<*xi32>) -> (tensor<*xi32>, tensor<*xi1>) tf_device.return %1, %2 : tensor<*xi32>, tensor<*xi1> } return %0#0, %1#0 : tensor<*xi32>, tensor<*xi1> @@ -2104,7 +2140,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:loc // CHECK: %[[CONCAT2_OUTPUT:[0-9]*]] = "tf.Concat"(%[[CONST_CONCAT2_DIM]], %[[PARALLEL_EXECUTE_OUTPUT]]#2, %[[PARALLEL_EXECUTE_OUTPUT]]#0 // CHECK: %[[CONST_CONCAT3_DIM:[0-9]*]] = "tf.Const"() // CHECK: %[[CONCAT3_OUTPUT:[0-9]*]] = "tf.Concat"(%[[CONST_CONCAT3_DIM]], %[[CONCAT_OUTPUT]], %[[CONCAT2_OUTPUT]] - %1, %2 = "tf_device.launch_func"(%ri_1, %ri_2) {_tpu_replicate = "cluster0", device = "", func = @tpu0_func, num_cores_per_replica = 4, step_marker_location = "", padding_map = [""], topology = "\0A\04\02\02\01\02\10\01\18\08\22 \00\00\00\00\00\00\00\01\01\00\00\00\01\00\00\01\00\01\00\00\00\01\00\01\01\01\00\00\01\01\00\01", device_assignment = [0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00", "\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\03\12\12\10\0b\1a\02\02\02\2a\06\0a\02\01\00\20\01\32\02\00\00\1a\02\02\02\22\04\03\02\01\00", "\08\01\1A\01\01\22\01\00"]} : (tensor<128x10xf32>, tensor<*xi32>) -> 
(tensor<*xi32>, tensor<*xi1>) + %1, %2 = "tf_device.cluster_func"(%ri_1, %ri_2) {_tpu_replicate = "cluster0", func = @tpu0_func, num_cores_per_replica = 4, step_marker_location = "", padding_map = [""], topology = "\0A\04\02\02\01\02\10\01\18\08\22 \00\00\00\00\00\00\00\01\01\00\00\00\01\00\00\01\00\01\00\00\00\01\00\01\01\01\00\00\01\01\00\01", device_assignment = [0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00", "\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\03\12\12\10\0b\1a\02\02\02\2a\06\0a\02\01\00\20\01\32\02\00\00\1a\02\02\02\22\04\03\02\01\00", "\08\01\1A\01\01\22\01\00"]} : (tensor<128x10xf32>, tensor<*xi32>) -> (tensor<*xi32>, tensor<*xi1>) tf_device.return %1, %2 : tensor<*xi32>, tensor<*xi1> } return %0#0, %1#0 : tensor<*xi32>, tensor<*xi1> diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_sharding_identification.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_sharding_identification.mlir index 17180490270..fff1240a121 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu_sharding_identification.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_sharding_identification.mlir @@ -1,10 +1,10 @@ // RUN: tf-opt %s -split-input-file -verify-diagnostics -tf-tpu-sharding-identification | FileCheck %s --dump-input=fail -// Tests empty launch func. Empty input/output sharding configuration +// Tests empty cluster func. Empty input/output sharding configuration // attributes must be added. -// CHECK-LABEL: func @check_sharding_attrs_exists_for_empty_launch_func -func @check_sharding_attrs_exists_for_empty_launch_func() { - "tf_device.launch_func"() {device = "", func = @empty_func, step_marker_location = ""} : () -> () +// CHECK-LABEL: func @check_sharding_attrs_exists_for_empty_cluster_func +func @check_sharding_attrs_exists_for_empty_cluster_func() { + "tf_device.cluster_func"() {func = @empty_func, step_marker_location = ""} : () -> () // CHECK: input_sharding_configuration = [] // CHECK: output_sharding_configuration = [] return @@ -21,7 +21,7 @@ func @empty_func() { // gets default maximal(0) sharding configuration. // CHECK-LABEL: func @check_default_sharding_for_block_arg_inputs_outputs func @check_default_sharding_for_block_arg_inputs_outputs(%arg0: tensor<*xi32>) { - "tf_device.launch_func"(%arg0) {device = "", func = @func_without_sharding, step_marker_location = ""} : (tensor<*xi32>) -> () + "tf_device.cluster_func"(%arg0) {func = @func_without_sharding, step_marker_location = ""} : (tensor<*xi32>) -> () // CHECK: input_sharding_configuration // CHECK-SAME: ["\08\01\1A\01\01\22\01\00"] // CHECK: output_sharding_configuration @@ -42,7 +42,7 @@ func @func_without_sharding(%arg0: tensor<*xi32>) -> tensor<*xi32> { // default maximal(0) sharding configuration. // CHECK-LABEL: func @check_default_sharding_for_inputs_outputs func @check_default_sharding_for_inputs_outputs(%arg0: tensor<*xi32>) { - "tf_device.launch_func"(%arg0) {device = "", func = @func_without_sharding, step_marker_location = ""} : (tensor<*xi32>) -> () + "tf_device.cluster_func"(%arg0) {func = @func_without_sharding, step_marker_location = ""} : (tensor<*xi32>) -> () // CHECK: input_sharding_configuration // CHECK-SAME: ["\08\01\1A\01\01\22\01\00"] // CHECK: output_sharding_configuration @@ -63,7 +63,7 @@ func @func_without_sharding(%arg0: tensor<*xi32>) -> tensor<*xi32> { // Tests with a input arg connected to XlaSharding op. 
// CHECK-LABEL: func @check_sharding_for_input_correctly_identified func @check_sharding_for_input_correctly_identified(%arg0: tensor<*xi32>) { - "tf_device.launch_func"(%arg0) {device = "", func = @inputs_with_sharding_func, step_marker_location = ""} : (tensor<*xi32>) -> () + "tf_device.cluster_func"(%arg0) {func = @inputs_with_sharding_func, step_marker_location = ""} : (tensor<*xi32>) -> () // CHECK: input_sharding_configuration // CHECK-SAME: ["\01\02\03"] // CHECK: output_sharding_configuration @@ -85,7 +85,7 @@ func @inputs_with_sharding_func(%arg0: tensor<*xi32>) -> tensor<*xi32> { // Tests with sharding is correctly parsed for multiple inputs/outputs. // CHECK-LABEL: func @check_sharding_for_multiple_inputs_outputs func @check_sharding_for_multiple_inputs_outputs(%arg0: tensor<*xi32>, %arg1: tensor<*xi1>) { - "tf_device.launch_func"(%arg0, %arg1) {device = "", func = @func_with_sharding, step_marker_location = ""} : (tensor<*xi32>, tensor<*xi1>) -> (tensor<*xi32>, tensor<*xi1>) + "tf_device.cluster_func"(%arg0, %arg1) {func = @func_with_sharding, step_marker_location = ""} : (tensor<*xi32>, tensor<*xi1>) -> (tensor<*xi32>, tensor<*xi1>) // CHECK: input_sharding_configuration // CHECK-SAME: ["\01\02\03", "\04\05\06"] // CHECK: output_sharding_configuration @@ -110,7 +110,7 @@ func @func_with_sharding(%arg0: tensor<*xi32>, %arg1: tensor<*xi1>) -> (tensor<* // Tests with input sharding following an identity op. // CHECK-LABEL: func @check_sharding_after_identity func @check_sharding_after_identity(%arg0: tensor<*xi32>, %arg1: tensor<*xi1>) { - "tf_device.launch_func"(%arg0, %arg1) {device = "", func = @func_with_sharding_after_identity, step_marker_location = ""} : (tensor<*xi32>, tensor<*xi1>) -> (tensor<*xi32>, tensor<*xi1>) + "tf_device.cluster_func"(%arg0, %arg1) {func = @func_with_sharding_after_identity, step_marker_location = ""} : (tensor<*xi32>, tensor<*xi1>) -> (tensor<*xi32>, tensor<*xi1>) // CHECK: input_sharding_configuration // CHECK-SAME: ["\01\02\03", "\04\05\06"] // CHECK: output_sharding_configuration @@ -136,7 +136,7 @@ func @func_with_sharding_after_identity(%arg0: tensor<*xi32>, %arg1: tensor<*xi1 // Tests with input sharding following a ReadVariable op. 
// CHECK-LABEL: func @check_sharding_after_read_variable func @check_sharding_after_read_variable(%arg0: tensor<*xi32>, %arg1: tensor<*xi1>) { - "tf_device.launch_func"(%arg0, %arg1) {device = "", func = @func_with_sharding_after_read_variable, step_marker_location = ""} : (tensor<*xi32>, tensor<*xi1>) -> (tensor<*xi32>, tensor<*xi1>) + "tf_device.cluster_func"(%arg0, %arg1) {func = @func_with_sharding_after_read_variable, step_marker_location = ""} : (tensor<*xi32>, tensor<*xi1>) -> (tensor<*xi32>, tensor<*xi1>) // CHECK: input_sharding_configuration // CHECK-SAME: ["\01\02\03", "\04\05\06"] // CHECK: output_sharding_configuration @@ -164,7 +164,7 @@ func @func_with_sharding_after_read_variable(%arg0: tensor<*x!tf.resource, %arg1: tensor<*xi1>) { - "tf_device.launch_func"(%arg0, %arg1) {device = "", func = @func_with_sharding_after_cast, step_marker_location = ""} : (tensor<*xi32>, tensor<*xi1>) -> (tensor<*xi32>, tensor<*xi1>) + "tf_device.cluster_func"(%arg0, %arg1) {func = @func_with_sharding_after_cast, step_marker_location = ""} : (tensor<*xi32>, tensor<*xi1>) -> (tensor<*xi32>, tensor<*xi1>) // CHECK: input_sharding_configuration // CHECK-SAME: ["\01\02\03", "\04\05\06"] // CHECK: output_sharding_configuration @@ -185,3 +185,45 @@ func @func_with_sharding_after_cast(%arg0: tensor<*xi32>, %arg1: tensor<*xi1>) - %7 = "tf.XlaSharding"(%5) { _XlaSharding = "\0D\0E\0F" } : (tensor<*xi1>) -> tensor<*xi1> return %6, %7 : tensor<*xi32> , tensor<*xi1> } + +// ----- + +// Tests that input sharding inside a functional op is parsed correctly. +// CHECK-LABEL: func @check_sharding_inside_functional_op +func @check_sharding_inside_functional_op(%arg0: tensor<*xi32>, %arg1: tensor<*xi1>) { + "tf_device.cluster_func"(%arg0, %arg1) {func = @func_with_device_training_loop, step_marker_location = ""} : (tensor<*xi32>, tensor<*xi1>) -> (tensor<*xi32>, tensor<*xi1>) + // CHECK: input_sharding_configuration + // CHECK-SAME: ["\01\02\03", "\04\05\06"] + // CHECK: output_sharding_configuration + // CHECK-SAME: ["\0A\0B\0C", "\0D\0E\0F"] + return +} + +// CHECK-LABEL: func @func_with_device_training_loop +// CHECK-SAME: (%{{[a-z0-9]+}}: tensor<*xi32> {xla_hlo.sharding = "\01\02\03"}, %{{[a-z0-9]+}}: tensor<*xi1> {xla_hlo.sharding = "\04\05\06"}) +// CHECK-SAME: -> (tensor<*xi32> {xla_hlo.sharding = "\0A\0B\0C"}, tensor<*xi1> {xla_hlo.sharding = "\0D\0E\0F"}) +func @func_with_device_training_loop(%arg0: tensor<*xi32>, %arg1: tensor<*xi1>) -> (tensor<*xi32>, tensor<*xi1>) { + %1:2 = "tf.StatefulPartitionedCall"(%arg0){f= @func_body, config="", config_proto="", executor_type=""} + : (tensor<*xi32>) -> (tensor<*xi32>, tensor<*xi1>) + %2 = "tf.PartitionedCall"(%arg1) {config = "", config_proto = "", executor_type = "", f = @pcall_func_body} : (tensor<*xi1>) -> (tensor) + %3, %4 = "tf.A"(%1#0, %2) : (tensor<*xi32>, tensor) -> (tensor<*xi32>, tensor<*xi1>) + + %5 = "tf.XlaSharding"(%3) { _XlaSharding = "\0A\0B\0C" } : (tensor<*xi32>) -> tensor<*xi32> + %6 = "tf.XlaSharding"(%4) { _XlaSharding = "\0D\0E\0F" } : (tensor<*xi1>) -> tensor<*xi1> + + return %5, %6 : tensor<*xi32> , tensor<*xi1> +} + +// CHECK-LABEL: func @func_body +func @func_body(%arg0: tensor<*xi32>)-> (tensor<*xi32>, tensor<*xi1>) { + %1 = "tf.XlaSharding"(%arg0) { _XlaSharding = "\01\02\03" } : (tensor<*xi32>) -> tensor<*xi32> + %2, %3 = "tf.C"(%1) : (tensor<*xi32>) -> (tensor<*xi32>, tensor<*xi1>) + return %2, %3 : tensor<*xi32> , tensor<*xi1> +} + +// CHECK-LABEL: func @pcall_func_body +func @pcall_func_body(%arg0: tensor<*xi1>) -> tensor { + %1 
= "tf.XlaSharding"(%arg0) { _XlaSharding = "\04\05\06" } : (tensor<*xi1>) -> tensor<*xi1> + %2 = "tf.D"(%1) : (tensor<*xi1>) -> (tensor) + return %2 : tensor +} diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/annotate_parameter_replication.cc b/tensorflow/compiler/mlir/tensorflow/transforms/annotate_parameter_replication.cc index 01c30eabd35..fb3ecfde771 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/annotate_parameter_replication.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/annotate_parameter_replication.cc @@ -36,7 +36,7 @@ namespace { constexpr char kReplicationAttr[] = "tf_device.is_same_data_across_replicas"; constexpr char kMirroredVariableIndicesAttr[] = "_mirrored_variable_indices"; -// Analyzes the inputs to LaunchFuncOps in the module, and annotates their +// Analyzes the inputs to ClusterFuncOps in the module, and annotates their // invoked functions whether each input has the same data across replicas. struct AnnotateParameterReplication : public PassWrapper(); + m.walk([&](tf_device::ClusterFuncOp cluster_func) { + auto replicate = cluster_func.getParentOfType(); if (!replicate) return; auto mirrored_variable_indices_attr = replicate.getAttrOfType(kMirroredVariableIndicesAttr); @@ -69,8 +69,8 @@ void AnnotateParameterReplication::runOnOperation() { mirrored_index.cast().getInt()); } } - auto func = llvm::cast(m.lookupSymbol(launch_func.func())); - for (auto entry : llvm::enumerate(launch_func.getOperands())) { + auto func = llvm::cast(m.lookupSymbol(cluster_func.func())); + for (auto entry : llvm::enumerate(cluster_func.getOperands())) { auto operand = SkipIdentityAndReadVariable(entry.value()); auto block_arg = operand.dyn_cast(); if (block_arg && block_arg.getOwner() == &replicate.GetBody()) { @@ -98,7 +98,7 @@ CreateAnnotateParameterReplicationPass() { static PassRegistration pass( "tf-annotate-parameter-replication", - "Annotate whether a LaunchFuncOp's parameters have the same data across " + "Annotate whether a ClusterFuncOp's parameters have the same data across " "replicas."); } // namespace TFDevice diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/batchmatmul_to_einsum.cc b/tensorflow/compiler/mlir/tensorflow/transforms/batchmatmul_to_einsum.cc index 727b13bc959..de73dff8b0b 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/batchmatmul_to_einsum.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/batchmatmul_to_einsum.cc @@ -32,7 +32,6 @@ limitations under the License. #include "mlir/IR/PatternMatch.h" // from @llvm-project #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project -#include "mlir/Support/Functional.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/bridge.cc b/tensorflow/compiler/mlir/tensorflow/transforms/bridge.cc index 73130640d1b..a01769bc395 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/bridge.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/bridge.cc @@ -30,9 +30,10 @@ namespace { void EnableLogging(PassManager *pm) { // Print the whole module after each pass, which requires disabling // multi-threading as well. 
- pm->disableMultithreading(); + pm->getContext()->disableMultithreading(); pm->enableIRPrinting(std::make_unique( /*print_module_scope=*/true)); + pm->enableTiming(std::make_unique()); } } // namespace @@ -46,6 +47,7 @@ void AddGraphExportLoweringPasses(OpPassManager &pm) { pm.addNestedPass(TFDevice::CreateParallelExecuteToIslandsPass()); pm.addNestedPass(CreateBreakUpIslandsPass()); pm.addNestedPass(TFDevice::CreateLaunchToDeviceAttributePass()); + pm.addNestedPass(CreateBreakUpIslandsPass()); } tensorflow::Status RunTPUBridge( diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/canonicalize.td b/tensorflow/compiler/mlir/tensorflow/transforms/canonicalize.td index ccc3e83a2a2..cf09f8d64fb 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/canonicalize.td +++ b/tensorflow/compiler/mlir/tensorflow/transforms/canonicalize.td @@ -152,6 +152,23 @@ def RealDivWithSqrtDivisor : Pat<(TF_RealDivOp $arg0, (TF_SqrtOp $arg1)), def ReciprocalNested : Pat<(TF_ReciprocalOp (TF_ReciprocalOp $arg)), (replaceWithValue $arg)>; +//===----------------------------------------------------------------------===// +// Select op patterns. +//===----------------------------------------------------------------------===// + +def ReshapeSelectPredIfNecessary : NativeCodeCall< + "ReshapeSelectPredIfNecessary(&($_builder), $0.getOwner()->getLoc(), $1, " + "$2.getType().cast().getRank())">; + +// Select supports tensor `condition` where the shape is equal to the first +// dimension of t and e. SelectV2 op supports normal broadcasting, so in these +// cases the condition needs to be reshaped. +def SelectToSelectV2 : Pat< + (TF_SelectOp:$op StaticShapeTensorOf<[AnyType]>:$cond, + StaticShapeTensorOf<[AnyType]>:$t, + StaticShapeTensorOf<[AnyType]>:$e), + (TF_SelectV2Op (ReshapeSelectPredIfNecessary $op, $cond, $t), $t, $e)>; + //===----------------------------------------------------------------------===// // Square op patterns. //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/cluster_outlining.cc b/tensorflow/compiler/mlir/tensorflow/transforms/cluster_outlining.cc index aa4c071abdf..886bd5b5b65 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/cluster_outlining.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/cluster_outlining.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -// This pass outlines regions of `tf_device.launch` into functions and replaces -// `tf_device.launch` with equivalent `tf_device.launch_func` operations. +// This pass outlines regions of `tf_device.cluster` into functions and replaces +// `tf_device.cluster` with equivalent `tf_device.cluster_func` operations. 
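A note on the SelectToSelectV2 pattern added to canonicalize.td above: tf.Select with a rank-1 predicate selects whole rows of t/e, while tf.SelectV2 applies ordinary broadcasting, so the predicate must first be reshaped (via the ReshapeSelectPredIfNecessary helper) for the swap to preserve semantics. A rough sketch of the intended rewrite, with illustrative shapes and an explicit reshape standing in for whatever IR the helper actually emits:

// Before canonicalization: rank-1 predicate picks rows of %t / %e.
%0 = "tf.Select"(%cond, %t, %e) : (tensor<3xi1>, tensor<3x2xf32>, tensor<3x2xf32>) -> tensor<3x2xf32>

// After: predicate reshaped to 3x1 so broadcasting reproduces row selection.
%shape = "tf.Const"() {value = dense<[3, 1]> : tensor<2xi64>} : () -> tensor<2xi64>
%pred = "tf.Reshape"(%cond, %shape) : (tensor<3xi1>, tensor<2xi64>) -> tensor<3x1xi1>
%1 = "tf.SelectV2"(%pred, %t, %e) : (tensor<3x1xi1>, tensor<3x2xf32>, tensor<3x2xf32>) -> tensor<3x2xf32>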
#include "llvm/ADT/SmallVector.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project @@ -35,7 +35,6 @@ namespace TFDevice { namespace { -constexpr char kDeviceAttr[] = "device"; constexpr char kFuncAttr[] = "func"; struct ClusterOutliningPass @@ -43,28 +42,29 @@ struct ClusterOutliningPass void runOnOperation() override; }; -void ReplaceLaunchReturnWithReturn(tf_device::ReturnOp launch_return_op, - OpBuilder* builder) { - builder->create(launch_return_op.getLoc(), - launch_return_op.getOperands()); - launch_return_op.erase(); +void ReplaceClusterReturnWithReturn(tf_device::ReturnOp cluster_return_op, + OpBuilder* builder) { + builder->create(cluster_return_op.getLoc(), + cluster_return_op.getOperands()); + cluster_return_op.erase(); } -// Builds a function that outlines region attached to launch_op and inserts +// Builds a function that outlines region attached to cluster_op and inserts // built function into given module. -FuncOp BuildFunction(StringRef device, llvm::ArrayRef live_ins, - tf_device::LaunchOp launch_op, SymbolTable* symbol_table, +FuncOp BuildFunction(llvm::ArrayRef live_ins, + tf_device::ClusterOp cluster_op, SymbolTable* symbol_table, OpBuilder* builder) { llvm::SmallVector operand_types; operand_types.reserve(live_ins.size()); for (Value v : live_ins) operand_types.emplace_back(v.getType()); - auto func_type = FunctionType::get(operand_types, launch_op.getResultTypes(), + auto func_type = FunctionType::get(operand_types, cluster_op.getResultTypes(), builder->getContext()); - std::string func_name_prefix = Twine(device, "_func").str(); + // TODO(lyandy): Define better name for outlined function. Potentially some + // name can be added during cluster formation. FuncOp outlined_func = - FuncOp::create(launch_op.getLoc(), func_name_prefix, func_type); + FuncOp::create(cluster_op.getLoc(), "_func", func_type); // This function is not externally visible and marking it private would allow // symbol-dce pass to remove it when it is not referenced anymore. @@ -73,64 +73,59 @@ FuncOp BuildFunction(StringRef device, llvm::ArrayRef live_ins, // Create function body. Block* outlined_func_block = outlined_func.addEntryBlock(); - // Replace uses of live-in values within launch_op region with function + // Replace uses of live-in values within cluster_op region with function // arguments. - Region& launch_op_region = launch_op.body(); - for (const auto& p : - llvm::zip(live_ins, outlined_func_block->getArguments())) { + Region& cluster_op_region = cluster_op.body(); + for (auto p : llvm::zip(live_ins, outlined_func_block->getArguments())) { replaceAllUsesInRegionWith(std::get<0>(p), std::get<1>(p), - launch_op_region); + cluster_op_region); } - // Move all instructions in launch_op into outlined_function's only block. - auto& launch_op_body = launch_op_region.front().getOperations(); + // Move all instructions in cluster_op into outlined_function's only block. + auto& cluster_op_body = cluster_op.GetBody().getOperations(); outlined_func_block->getOperations().splice( - outlined_func_block->end(), launch_op_body, launch_op_body.begin(), - launch_op_body.end()); + outlined_func_block->end(), cluster_op_body, cluster_op_body.begin(), + cluster_op_body.end()); - // Replace `tf_device.launch_return` terminator with `std.return` in function + // Replace `tf_device.return` terminator with `std.return` in function // body. 
- auto launch_return_op = + auto cluster_return_op = cast(outlined_func_block->getTerminator()); - builder->setInsertionPoint(launch_return_op); - ReplaceLaunchReturnWithReturn(launch_return_op, builder); + builder->setInsertionPoint(cluster_return_op); + ReplaceClusterReturnWithReturn(cluster_return_op, builder); symbol_table->insert(outlined_func); return outlined_func; } -// Outlines body of `tf_device.launch` into a function and create a -// `tf_device.launch_func` to invoke that function. `tf_device.launch` is +// Outlines body of `tf_device.cluster` into a function and create a +// `tf_device.cluster_func` to invoke that function. `tf_device.cluster` is // removed afterwards.` -void OutlineLaunch(tf_device::LaunchOp launch_op, SymbolTable* symbol_table, - OpBuilder* builder) { +void OutlineCluster(tf_device::ClusterOp cluster_op, SymbolTable* symbol_table, + OpBuilder* builder) { llvm::SetVector live_ins; - getUsedValuesDefinedAbove(launch_op.body(), launch_op.body(), live_ins); + getUsedValuesDefinedAbove(cluster_op.body(), cluster_op.body(), live_ins); - StringRef device = - launch_op.getAttrOfType(kDeviceAttr).getValue(); + FuncOp outlined_func = + BuildFunction(live_ins.getArrayRef(), cluster_op, symbol_table, builder); + cluster_op.setAttr(builder->getIdentifier(kFuncAttr), + builder->getSymbolRefAttr(outlined_func.getName())); - FuncOp outlined_func = BuildFunction(device, live_ins.getArrayRef(), - launch_op, symbol_table, builder); - launch_op.setAttr(builder->getIdentifier(kFuncAttr), - builder->getSymbolRefAttr(outlined_func.getName())); + builder->setInsertionPoint(cluster_op); + auto cluster_func_op = builder->create( + cluster_op.getLoc(), outlined_func.getType().getResults(), + live_ins.getArrayRef(), cluster_op.getAttrs()); - builder->setInsertionPoint(launch_op); - tf_device::LaunchFuncOp launch_func_op = - builder->create( - launch_op.getLoc(), outlined_func.getType().getResults(), - live_ins.getArrayRef(), launch_op.getAttrs()); - - launch_op.replaceAllUsesWith(launch_func_op); - launch_op.erase(); + cluster_op.replaceAllUsesWith(cluster_func_op); + cluster_op.erase(); } void ClusterOutliningPass::runOnOperation() { - ModuleOp m = getOperation(); - SymbolTable symbol_table(m); - OpBuilder builder(m.getContext()); - m.walk([&](tf_device::LaunchOp launch) { - OutlineLaunch(launch, &symbol_table, &builder); + ModuleOp module = getOperation(); + SymbolTable symbol_table(module); + OpBuilder builder(module.getContext()); + module.walk([&](tf_device::ClusterOp cluster) { + OutlineCluster(cluster, &symbol_table, &builder); }); } @@ -142,7 +137,7 @@ std::unique_ptr> CreateClusterOutliningPass() { static PassRegistration pass( "tf-device-cluster-outlining", - "Outline regions of tf_device.launch operations."); + "Outline regions of tf_device.cluster operations."); } // namespace TFDevice } // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold.cc b/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold.cc index 2269b4c55c8..55a0b5c3fd3 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold.cc @@ -17,9 +17,11 @@ limitations under the License. 
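For reference, a minimal sketch of what the reworked tf-device-cluster-outlining pass above produces; the body op and types are illustrative, and "_func" is the placeholder name from BuildFunction noted in the TODO (the symbol table may uniquify it):

// Before outlining:
%0 = "tf_device.cluster"() ( {
  %1 = "tf.Const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  tf_device.return %1 : tensor<f32>
}) : () -> tensor<f32>

// After outlining: the body moves into a function and the cluster becomes a
// cluster_func carrying the cluster's remaining attributes plus `func`.
%0 = "tf_device.cluster_func"() {func = @_func} : () -> tensor<f32>

func @_func() -> tensor<f32> {
  %0 = "tf.Const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  return %0 : tensor<f32>
}

Unlike the old launch outlining, no device attribute is consulted, so the outlined function is no longer named after a device string.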
#include -#include "mlir/Interfaces/SideEffects.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/c/eager/c_api.h" #include "tensorflow/c/tf_status.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" #include "tensorflow/compiler/mlir/tensorflow/utils/eval_util.h" #include "tensorflow/core/platform/mutex.h" @@ -46,6 +48,12 @@ LogicalResult ConstantFoldFallbackHook( } } + // Do not execute function calls. + if (llvm::isa(inst) || llvm::isa(inst) || + llvm::isa(inst)) { + return failure(); + } + // TODO(jpienaar): Currently this persists the entire program execution. This // should instead be per module/set from the Graph being executed in TF (if // any) so that the value of variables in the context could be read. diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/einsum.cc b/tensorflow/compiler/mlir/tensorflow/transforms/einsum.cc index 3610fb36cf3..d9af88bfbae 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/einsum.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/einsum.cc @@ -33,7 +33,6 @@ limitations under the License. #include "mlir/IR/PatternMatch.h" // from @llvm-project #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project -#include "mlir/Support/Functional.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/fold_switch.cc b/tensorflow/compiler/mlir/tensorflow/transforms/fold_switch.cc index fe9c10781fd..f44c0fed709 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/fold_switch.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/fold_switch.cc @@ -45,7 +45,6 @@ limitations under the License. #include "mlir/IR/Visitors.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassRegistry.h" // from @llvm-project -#include "mlir/Support/Functional.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/control_flow_ops.h" diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/legalize_hlo.cc b/tensorflow/compiler/mlir/tensorflow/transforms/legalize_hlo.cc index 50f77cd9c3d..b1cbc41a03e 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/legalize_hlo.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/legalize_hlo.cc @@ -25,6 +25,7 @@ limitations under the License. #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/DialectConversion.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/xla/ir/chlo_ops.h" #include "tensorflow/compiler/mlir/xla/ir/hlo_ops.h" namespace mlir { diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/legalize_hlo_patterns.td b/tensorflow/compiler/mlir/tensorflow/transforms/legalize_hlo_patterns.td index 853fd806c5f..6fd7556084d 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/legalize_hlo_patterns.td +++ b/tensorflow/compiler/mlir/tensorflow/transforms/legalize_hlo_patterns.td @@ -18,12 +18,16 @@ limitations under the License. 
include "mlir/IR/OpBase.td" include "mlir/Dialect/StandardOps/IR/Ops.td" include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td" +include "tensorflow/compiler/mlir/xla/ir/chlo_ops.td" include "tensorflow/compiler/mlir/xla/ir/hlo_ops.td" def : Pat<(HLO_ConstOp $value), (TF_ConstOp $value)>; //===----------------------------------------------------------------------===// // Binary op patterns. +// Note that these are legalized from chlo.broadcast_* ops, since those are +// semantically compatible with the corresponding TF ops. Depending on +// context, getting to these ops may require some raising. //===----------------------------------------------------------------------===// // Check that two values can be broadcasted together @@ -31,37 +35,45 @@ def : Pat<(HLO_ConstOp $value), (TF_ConstOp $value)>; def AreBroadcastCompatible : Constraint, "types must be broadcastable">; -foreach fromToBinPair = [[HLO_AddOp, TF_AddV2Op], - [HLO_DivOp, TF_DivOp], - [HLO_ShiftLeftOp, TF_LeftShiftOp], - [HLO_MaxOp, TF_MaximumOp], - [HLO_MinOp, TF_MinimumOp], - [HLO_MulOp, TF_MulOp], - [HLO_PowOp, TF_PowOp], - [HLO_DivOp, TF_RealDivOp], - [HLO_SubOp, TF_SubOp], - [HLO_Atan2Op, TF_Atan2Op], - [HLO_RemOp, TF_ModOp]] in - def : Pat<(fromToBinPair[0] $l, $r, $_), (fromToBinPair[1] $l, $r), +foreach fromToBinPair = [[HLO_AddOp, HLOClient_BroadcastAddOp, TF_AddV2Op], + [HLO_DivOp, HLOClient_BroadcastDivOp, TF_DivOp], + [HLO_ShiftLeftOp, HLOClient_BroadcastShiftLeftOp, TF_LeftShiftOp], + [HLO_MaxOp, HLOClient_BroadcastMaxOp, TF_MaximumOp], + [HLO_MinOp, HLOClient_BroadcastMinOp, TF_MinimumOp], + [HLO_MulOp, HLOClient_BroadcastMulOp, TF_MulOp], + [HLO_PowOp, HLOClient_BroadcastPowOp, TF_PowOp], + [HLO_SubOp, HLOClient_BroadcastSubOp, TF_SubOp], + [HLO_Atan2Op, HLOClient_BroadcastAtan2Op, TF_Atan2Op], + [HLO_RemOp, HLOClient_BroadcastRemOp, TF_ModOp]] in { + def : Pat<(fromToBinPair[0] $l, $r), (fromToBinPair[2] $l, $r)>; + def : Pat<(fromToBinPair[1] $l, $r, $_), (fromToBinPair[2] $l, $r), [(AreBroadcastCompatible $l, $r)]>; +} -foreach pair = [[HLO_AndOp, TF_BitwiseAndOp], - [HLO_OrOp, TF_BitwiseOrOp], - [HLO_XorOp, TF_BitwiseXorOp]] in - def : Pat<(pair[0] TF_IntTensor:$l, TF_IntTensor:$r, $_), (pair[1] $l, $r), +foreach pair = [[HLO_AndOp, HLOClient_BroadcastAndOp, TF_BitwiseAndOp], + [HLO_OrOp, HLOClient_BroadcastOrOp, TF_BitwiseOrOp], + [HLO_XorOp, HLOClient_BroadcastXorOp, TF_BitwiseXorOp]] in { + def : Pat<(pair[0] TF_IntTensor:$l, TF_IntTensor:$r), (pair[2] $l, $r)>; + def : Pat<(pair[1] TF_IntTensor:$l, TF_IntTensor:$r, $_), (pair[2] $l, $r), [(AreBroadcastCompatible $l, $r)]>; +} -foreach pair = [[HLO_AndOp, TF_LogicalAndOp], - [HLO_OrOp, TF_LogicalOrOp]] in - def : Pat<(pair[0] I1Tensor:$l, I1Tensor:$r, $_), (pair[1] $l, $r), +foreach pair = [[HLO_AndOp, HLOClient_BroadcastAndOp, TF_LogicalAndOp], + [HLO_OrOp, HLOClient_BroadcastOrOp, TF_LogicalOrOp]] in { + def : Pat<(pair[0] I1Tensor:$l, I1Tensor:$r), (pair[2] $l, $r)>; + def : Pat<(pair[1] I1Tensor:$l, I1Tensor:$r, $_), (pair[2] $l, $r), [(AreBroadcastCompatible $l, $r)]>; +} -def : Pat<(HLO_ShiftRightArithmeticOp $l, $r, $_), (TF_RightShiftOp $l, $r), +def : Pat<(HLO_ShiftRightArithmeticOp $l, $r), (TF_RightShiftOp $l, $r)>; +def : Pat<(HLOClient_BroadcastShiftRightArithmeticOp $l, $r, $_), (TF_RightShiftOp $l, $r), [(AreBroadcastCompatible $l, $r)]>; -def : Pat<(HLO_ShiftRightLogicalOp $l, $r, $_), (TF_RightShiftOp $l, $r), +def : Pat<(HLO_ShiftRightLogicalOp $l, $r), (TF_RightShiftOp $l, $r)>; +def : Pat<(HLOClient_BroadcastShiftRightLogicalOp $l, 
$r, $_), (TF_RightShiftOp $l, $r), [(AreBroadcastCompatible $l, $r)]>; -def : Pat<(HLO_FloorOp (HLO_DivOp $l, $r, $_)), (TF_FloorDivOp $l, $r), +def : Pat<(HLO_FloorOp (HLO_DivOp $l, $r)), (TF_FloorDivOp $l, $r)>; +def : Pat<(HLO_FloorOp (HLOClient_BroadcastDivOp $l, $r, $_)), (TF_FloorDivOp $l, $r), [(AreBroadcastCompatible $l, $r)]>; def : Pat<(HLO_ComplexOp $r, $i), (TF_ComplexOp $r, $i)>; @@ -69,6 +81,9 @@ def : Pat<(HLO_ComplexOp $r, $i), (TF_ComplexOp $r, $i)>; // Unary op patterns. //===----------------------------------------------------------------------===// +def : Pat<(HLO_ConvertOp HLO_Tensor:$operand), + (TF_CastOp $operand, ConstBoolAttrFalse)>; + foreach Mapping = [[HLO_AbsOp, TF_AbsOp], [HLO_BitcastConvertOp, TF_BitcastOp], [HLO_CeilOp, TF_CeilOp], @@ -115,16 +130,23 @@ def : Pat<(HLO_ConcatenateOp $inputs, $dim), //===----------------------------------------------------------------------===// // Compare op patterns. +// Note that these are legalized from chlo.broadcast_* ops, since those are +// semantically compatible with the corresponding TF ops. Depending on +// context, getting to these ops may require some raising. //===----------------------------------------------------------------------===// foreach p = [[TF_EqualOp, HLO_COMPARISON_DIRECTION_EQ], - [TF_NotEqualOp, HLO_COMPARISON_DIRECTION_NE]] in - def : Pat<(HLO_CompareOp $l, $r, $_, p[1]), (p[0] $l, $r, ConstBoolAttrTrue), + [TF_NotEqualOp, HLO_COMPARISON_DIRECTION_NE]] in { + def : Pat<(HLOClient_BroadcastCompareOp $l, $r, $_, p[1]), (p[0] $l, $r, ConstBoolAttrTrue), [(AreBroadcastCompatible $l, $r)]>; + def : Pat<(HLO_CompareOp $l, $r, p[1]), (p[0] $l, $r, ConstBoolAttrTrue)>; +} foreach pair = [[TF_GreaterEqualOp, HLO_COMPARISON_DIRECTION_GE], [TF_GreaterOp, HLO_COMPARISON_DIRECTION_GT], [TF_LessEqualOp, HLO_COMPARISON_DIRECTION_LE], - [TF_LessOp, HLO_COMPARISON_DIRECTION_LT]] in - def : Pat<(HLO_CompareOp $l, $r, $_, pair[1]), (pair[0] $l, $r), + [TF_LessOp, HLO_COMPARISON_DIRECTION_LT]] in { + def : Pat<(HLOClient_BroadcastCompareOp $l, $r, $_, pair[1]), (pair[0] $l, $r), [(AreBroadcastCompatible $l, $r)]>; + def : Pat<(HLO_CompareOp $l, $r, pair[1]), (pair[0] $l, $r)>; +} diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.cc b/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.cc index f934e2ac169..c0de6f557ab 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.cc @@ -253,8 +253,8 @@ class LowerDynamicStitchOp : public OpRewritePattern { // %delta = "tf.Const"() {value = dense<1> : tensor} // %updates = "tf.Range"(%start, %limit, %delta) : // (tensor, tensor, tensor) -> tensor<5xi32> -// %perm = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi32>} -// %indices = "tf.Transpose"(%x, %perm) : (tensor<5xi32, tensor<2xi32) -> +// %shape = "tf.Const"() {value = dense<[5, 1]> : tensor<2xi32>} +// %indices = "tf.Reshape"(%x, %shape) : (tensor<5xi32, tensor<2xi32) -> // tensor<5x1xi32> // "tf.TensorScatterUpdate"(%x, %indices, %updates) : // (tensor<5xi32>, tensor<5x1xi32>, tensor<5xi32>) -> tensor<5xi32> @@ -268,13 +268,12 @@ class LowerInvertPermutationOp LogicalResult matchAndRewrite(TF::InvertPermutationOp op, PatternRewriter &rewriter) const override { Location loc = op.getLoc(); - auto x_type = op.x().getType().cast(); - Type int_type = x_type.getElementType(); // Could be i32 or i64. - + auto x_type = op.x().getType().dyn_cast(); // x input must have static shape. 
- if (!x_type.hasStaticShape()) { + if (!x_type || !x_type.hasStaticShape()) { return failure(); } + Type int_type = x_type.getElementType(); // Could be i32 or i64. auto result_type = x_type; auto start = @@ -287,13 +286,11 @@ class LowerInvertPermutationOp auto updates = rewriter.create(loc, result_type, start, limit, delta); - auto perm_type = RankedTensorType::get({2}, int_type); - auto perm = rewriter.create( - loc, DenseElementsAttr::get(perm_type, {1, 0})); - auto transposed_x_type = - RankedTensorType::get({x_type.getShape()[0], 1}, int_type); - auto indices = - rewriter.create(loc, transposed_x_type, op.x(), perm); + auto shape_type = RankedTensorType::get({2}, rewriter.getIntegerType(32)); + auto shape = rewriter.create( + loc, DenseElementsAttr::get( + shape_type, {static_cast(x_type.getDimSize(0)), 1})); + auto indices = rewriter.create(loc, op.x(), shape); rewriter.replaceOpWithNewOp( op, result_type, op.x(), indices, updates); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.td b/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.td index 1074f9e1926..acf9cd27b47 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.td +++ b/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.td @@ -227,3 +227,10 @@ def LowerZerosLikeOp : Pat<(TF_ZerosLikeOp:$src_op TensorOf<[AnySignlessInteger, AnyFloat]>:$input), (TF_BroadcastToOp (TF_ConstOp (GetScalarOfType<0> $input)), (CreateTFShapeOp $src_op, $input, /*use 32bit*/ConstBoolAttrFalse))>; + +def LowerScatterNdOp : + Pat<(TF_ScatterNdOp $indices, + TensorOf<[AnySignlessInteger, AnyFloat]>:$updates, $shape), + (TF_TensorScatterUpdateOp + (TF_FillOp $shape, (TF_ConstOp (GetScalarOfType<0> $updates))), + $indices, $updates)>; diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/mark_function_visibility.cc b/tensorflow/compiler/mlir/tensorflow/transforms/mark_function_visibility.cc index 02e1c994986..31a80a4ecdb 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/mark_function_visibility.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/mark_function_visibility.cc @@ -97,6 +97,36 @@ CreateMarkFunctionVisibilityUsingEntryFunctionSpecificationPass() { MarkFunctionVisibilityUsingEntryFunctionSpecificationPass>(); } +// Marks the main function with public visibility, while other functions are +// marked with private visibility. +LogicalResult MarkOnlyMainFunctionWithPublicVisibility(ModuleOp module) { + for (auto func : module.getOps()) { + if (func.getName() == "main") { + func.setVisibility(FuncOp::Visibility::Public); + } else { + func.setVisibility(FuncOp::Visibility::Private); + } + } + return success(); +} + +namespace { +struct MarkOnlyMainFunctionWithPublicVisibilityPass + : public PassWrapper> { + void runOnOperation() override { + if (failed(MarkOnlyMainFunctionWithPublicVisibility(getOperation()))) { + signalPassFailure(); + } + } +}; +} // namespace + +std::unique_ptr> +CreateMarkOnlyMainFunctionWithPublicVisibilityPass() { + return std::make_unique(); +} + } // namespace TF namespace tf_saved_model { diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/optimize.cc b/tensorflow/compiler/mlir/tensorflow/transforms/optimize.cc index 6c7a47623e2..849f1487c6e 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/optimize.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/optimize.cc @@ -62,11 +62,11 @@ void CreateTFStandardPipeline(OpPassManager &pm, // Hopefully there is a single island left, or there wasn't any to begin with. 
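A note on the LowerScatterNdOp pattern added to lower_tf.td above: tf.ScatterNd is lowered to a tf.TensorScatterUpdate into a zero-filled tensor of the requested shape. A rough sketch with illustrative shapes:

// Before lowering:
%0 = "tf.ScatterNd"(%indices, %updates, %shape) : (tensor<4x1xi32>, tensor<4xf32>, tensor<1xi32>) -> tensor<8xf32>

// After lowering: fill a tensor of zeros of the requested shape, then scatter
// the updates into it.
%zero = "tf.Const"() {value = dense<0.000000e+00> : tensor<f32>} : () -> tensor<f32>
%fill = "tf.Fill"(%shape, %zero) : (tensor<1xi32>, tensor<f32>) -> tensor<8xf32>
%0 = "tf.TensorScatterUpdate"(%fill, %indices, %updates) : (tensor<8xf32>, tensor<4x1xi32>, tensor<4xf32>) -> tensor<8xf32>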
// We now run the optimizer which operates mostly inside islands. func_pm.addPass(createCanonicalizerPass()); + pm.addPass(CreateTFShapeInferencePass()); if (options.enable_inliner) { pm.addPass(createInlinerPass()); } pm.addPass(createSymbolDCEPass()); - pm.addPass(CreateTFShapeInferencePass()); pm.addNestedPass(CreateTFOptimizePass()); pm.addNestedPass(createCSEPass()); } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/parallel_execute_to_islands.cc b/tensorflow/compiler/mlir/tensorflow/transforms/parallel_execute_to_islands.cc index 693d6d964db..c13d7de754e 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/parallel_execute_to_islands.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/parallel_execute_to_islands.cc @@ -237,7 +237,7 @@ LogicalResult CreateIslandsFromParallelExecute( // individual islands per region of parallel_execute. void LowerSingleIslandParallelExecuteToIslands( tf_executor::IslandOp island_op) { - if (!has_single_element(island_op.GetBody().without_terminator())) return; + if (!hasSingleElement(island_op.GetBody().without_terminator())) return; if (auto parallel_execute_op = llvm::dyn_cast( &island_op.GetBody().front())) diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/passes.h b/tensorflow/compiler/mlir/tensorflow/transforms/passes.h index d6da961eb0e..81d0259d2d6 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/passes.h +++ b/tensorflow/compiler/mlir/tensorflow/transforms/passes.h @@ -91,6 +91,10 @@ std::unique_ptr> CreateResourceDeviceInferencePass(); // of their aliasing output arguments. std::unique_ptr> CreatePromoteResourcesToArgsPass(); +// Creates a pass that promotes tf.VarHandleOp to resource arguments for all +// functions. +std::unique_ptr> CreatePromoteVarHandlesToArgsPass(); + // Marks function visibility using tf.entry_function specification. That is, // functions with tf.entry_function attributes are marked with public // visibility while the other functions are marked with private visibility. @@ -101,6 +105,11 @@ LogicalResult MarkFunctionVisibilityUsingEntryFunctionSpecification( std::unique_ptr> CreateMarkFunctionVisibilityUsingEntryFunctionSpecificationPass(); +// Creates a pass that marks the main function with public visibility, while +// other functions are marked with private visibility. +std::unique_ptr> +CreateMarkOnlyMainFunctionWithPublicVisibilityPass(); + // Creates a simple device assignment pass on TF dialect for CoreRT use case. std::unique_ptr> CreateSimpleTFDeviceAssignmentPass( llvm::StringRef default_device); @@ -251,6 +260,15 @@ std::unique_ptr> CreateTPUMergeVariablesWithExecutePass(); // run-time according to compilation result. std::unique_ptr> CreateTPUVariableReformattingPass(); +// Creates a pass that extracts outside compilation (CPU ops inside TPU cluster) +// at head/tail of TPU cluster to run before/after TPU computation. +std::unique_ptr> +CreateTPUExtractHeadTailOutsideCompilationPass(); + +// Creates a pass that extract outside compilation (CPU ops inside TPU cluster) +// ops to a separate parallel_execute region to run on CPU. 
+std::unique_ptr> CreateTPUExtractOutsideCompilationPass(); + // Populates the supplied passmanager with the passes required to run the void CreateTPUBridgePipeline(OpPassManager& pm); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/promote_resources_to_args.cc b/tensorflow/compiler/mlir/tensorflow/transforms/promote_resources_to_args.cc index db8ecbd86ee..cece23b4750 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/promote_resources_to_args.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/promote_resources_to_args.cc @@ -13,31 +13,48 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -// This pass promotes resource reads in the main function to input arguments -// of the function. It also promotes resource writes in the main function to -// outputs of the main function. If a resource may be updated by the main -// function, the corresponding input and output arguments are alias. +// This pass promotes resource accesses in the main function to input arguments +// and outputs of the main function. +// +// Two types of resources are supported: +// (1) A function argument of TF::ResourceType type. +// (2) A VarHandleOp in the function. +// +// After the pass, +// +// . The function will have an input argument for each resource that is +// already provided as an input argument or is read. The type of the input +// argument will become the shape of the value represented by the resource. +// +// . The function will have an output for each resource that is written. The +// type of the output will become the shape of the resource. // // The information of variable identification and input-output alising is -// recorded as named attributes of the input arguments: +// recorded as named attributes of the input argument or output: // // . 'tf.resource_name' matches 'shared_name' of VarHandleOp, which represents -// the identifier of the resource corresponding to the input argument. +// the identifier of the corresponding resource. This attribute is added to +// an input argument if the initial value of the resource is read, or to the +// output if the initial value is not read. // // . 'tf.aliasing_output' is the index of the function output that is an alias -// of the input argument. This attribute is not added if there is no output -// alias for the input argument. +// of the input argument. This attribute is added only to the input argument +// when the initial value of the corresponding resource is read, and the +// resource is written later. // // Assumption of this pass: // . Compound resource operations have already been decomposed. // . Dead functions have already been removed, as resource arguments in dead // functions can cause the pass to fail. 
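To make the revised contract above concrete, here is a rough before/after sketch for tf-promote-resources-to-args on a variable that is read and then written (shapes, names, and attribute value types are illustrative). The separately registered tf-promote-var-handles-to-args pass performs only the handle-to-argument step, leaving the argument typed as a resource.

// Before promotion:
func @main() -> tensor<f32> {
  %0 = "tf.VarHandleOp"() {container = "", shared_name = "x"} : () -> tensor<!tf.resource<tensor<f32>>>
  %1 = "tf.ReadVariableOp"(%0) : (tensor<!tf.resource<tensor<f32>>>) -> tensor<f32>
  %2 = "tf.AddV2"(%1, %1) : (tensor<f32>, tensor<f32>) -> tensor<f32>
  "tf.AssignVariableOp"(%0, %2) : (tensor<!tf.resource<tensor<f32>>>, tensor<f32>) -> ()
  return %2 : tensor<f32>
}

// After promotion: the read becomes an input argument carrying tf.resource_name,
// the write becomes an extra result, and tf.aliasing_output ties the two together.
func @main(%arg0: tensor<f32> {tf.aliasing_output = 1 : i64, tf.resource_name = "x"}) -> (tensor<f32>, tensor<f32>) {
  %0 = "tf.AddV2"(%arg0, %arg0) : (tensor<f32>, tensor<f32>) -> tensor<f32>
  return %0, %0 : tensor<f32>, tensor<f32>
}

A write-only variable would instead surface as an extra result annotated with tf.resource_name, per the output-only handling above.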
+#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/PointerUnion.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" #include "llvm/Support/Casting.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project @@ -59,74 +76,174 @@ constexpr char kResourceFunctionMsg[] = "expects function level resource argument"; constexpr char kInvalidResourceMsg[] = "expects resource to be a VarHandleOp or function argument"; +constexpr char kResourceNameArgAttr[] = "tf.resource_name"; -// Records the input argument index and the current live value for a resource -// variable. +// Checks if a function has only one block. +mlir::LogicalResult CheckSingleBlockFunction(FuncOp function) { + if (!hasSingleElement(function.getBlocks())) + return function.emitError() + << "expects function '" << function.getName() + << "' to have 1 block, got " << function.getBlocks().size(); + + return success(); +} + +// Collects names of users of a resource that are not `tf.ReadVariableOp` and +// not `tf.AssignVariableOp`. +llvm::SmallSet GetCompositeResourceUserNames( + Value resource) { + // SmallSet will use a vector when there is only one element and use std::set + // when there are more than one elements. This ensures that the operations in + // the error message are ordered. + llvm::SmallSet composite_users; + for (Operation* user : resource.getUsers()) + if (!llvm::isa(user) && + !llvm::isa(user)) + composite_users.insert(user->getName().getStringRef()); + + return composite_users; +} + +// Checks if `tf.VarHandleOp` has a valid resource subtype and its users are of +// `tf.ReadVariableOp` and `tf.AssignVariableOp` only. +mlir::LogicalResult ValidateVarHandle(TF::VarHandleOp var_handle_op) { + auto resource_type = + getElementTypeOrSelf(var_handle_op.getType()).cast(); + if (resource_type.getSubtypes().size() != 1) + return var_handle_op.emitOpError() + << "expects resource type to have one subtype, got " + << resource_type; + + auto composite_ops = GetCompositeResourceUserNames(var_handle_op); + if (!composite_ops.empty()) + return var_handle_op.emitOpError() + << "expects users to be 'tf.ReadVariableOp' or " + "'tf.AssignVariableOp', got [" + << llvm::join(composite_ops.begin(), composite_ops.end(), ", ") + << "]"; + + return success(); +} + +// Checks if resource argument has a valid resource subtype and its users are of +// `tf.ReadVariableOp` and `tf.AssignVariableOp` only. +mlir::LogicalResult ValidateResourceArgument(FuncOp function, + BlockArgument resource_arg, + TF::ResourceType resource_type) { + if (resource_type.getSubtypes().size() != 1) + return function.emitError() + << "expects resource type of argument " + << resource_arg.getArgNumber() << " to have one subtype, got " + << resource_type; + + auto composite_ops = GetCompositeResourceUserNames(resource_arg); + if (!composite_ops.empty()) + return function.emitError() + << "expects users of resource argument " + << resource_arg.getArgNumber() + << " to be 'tf.ReadVariableOp' or 'tf.AssignVariableOp', got [" + << llvm::join(composite_ops.begin(), composite_ops.end(), ", ") + << "]"; + + return success(); +} + +// Adds resource arguments for every unique (name) variable handle. Associated +// `tf.VarHandleOp` are removed from the function. 
Variable shared names are +// returned in `var_handle_shared_names` based on the ordering of added resource +// arguments. +mlir::LogicalResult PromoteVarHandlesToArguments( + FuncOp function, bool add_validation, + llvm::SmallVectorImpl* var_handle_shared_names) { + Block& block = function.front(); + auto func_type = function.getType(); + + auto func_arg_types = llvm::to_vector<4>(func_type.getInputs()); + llvm::SmallDenseMap var_arg_index_by_name; + for (auto var_handle_op : + llvm::make_early_inc_range(block.getOps())) { + if (add_validation && failed(ValidateVarHandle(var_handle_op))) + return failure(); + + llvm::StringRef name = var_handle_op.shared_nameAttr().getValue(); + auto it = var_arg_index_by_name.insert({name, func_arg_types.size()}); + if (it.second) { + var_handle_shared_names->emplace_back(name); + auto resource_type = var_handle_op.resource().getType(); + func_arg_types.push_back(resource_type); + var_handle_op.resource().replaceAllUsesWith( + block.addArgument(resource_type)); + } else { + var_handle_op.resource().replaceAllUsesWith( + block.getArgument(it.first->getSecond())); + } + var_handle_op.erase(); + } + + if (!var_handle_shared_names->empty()) + function.setType(FunctionType::get(func_arg_types, func_type.getResults(), + function.getContext())); + + return success(); +} + +// Records the current live value for a resource variable and whether a read or +// write on the variable occurred. struct ResourceInfo { - int64_t input_index; - Value live_value; + Value live_value = nullptr; + bool read = false; + bool write = false; }; -using ArgOrName = llvm::PointerUnion; -using ResourceMap = llvm::SmallDenseMap; - -LogicalResult PromoteResourcesToArguments(FuncOp function) { +LogicalResult PromoteResourcesToArguments( + FuncOp function, llvm::ArrayRef var_handle_shared_names) { Block& block = function.front(); auto return_op = llvm::dyn_cast_or_null(block.getTerminator()); if (!return_op) - return function.emitError( - "expects 'main' function to have a MLIR ReturnOp"); + return function.emitError() << "expects function '" << function.getName() + << "' to have a MLIR ReturnOp"; - ResourceMap resource_map; + llvm::SmallVector resources(function.getNumArguments()); auto argument_types = llvm::to_vector<4>(function.getType().getInputs()); + bool has_resources = false; + auto add_resource_argument = [&](BlockArgument arg, + TF::ResourceType resource_type) { + Type arg_type = resource_type.getSubtypes().front(); + arg.setType(arg_type); + resources[arg.getArgNumber()].live_value = arg; + argument_types[arg.getArgNumber()] = arg_type; + has_resources = true; + }; - // Loop through the resource arguments in the function and store a mapping - // from that argument to its index and itself as the current live value. - for (BlockArgument& func_arg : function.getArguments()) { + // Loop through the non `tf.VarHandleOp` resource arguments in the function, + // validate its uses and subtype, and store a mapping from that argument to + // itself as the current live value. 
+ auto func_args = function.getArguments().take_front( + function.getNumArguments() - var_handle_shared_names.size()); + for (BlockArgument& func_arg : func_args) { auto resource_type = getElementTypeOrSelf(func_arg.getType()).dyn_cast(); if (!resource_type) continue; - if (resource_type.getSubtypes().size() != 1) - return function.emitError() - << "expects resource type of argument " << func_arg.getArgNumber() - << " to have one subtype, got " << resource_type; + if (failed(ValidateResourceArgument(function, func_arg, resource_type))) + return failure(); - for (auto* user : func_arg.getUsers()) - if (!llvm::isa(user) && - !llvm::isa(user)) - return function.emitError() - << "expects users of resource argument " - << func_arg.getArgNumber() - << " to be 'tf.ReadVariableOp' or 'tf.AssignVariableOp'"; - - Type arg_type = resource_type.getSubtypes().front(); - func_arg.setType(arg_type); - resource_map[func_arg] = {func_arg.getArgNumber(), func_arg}; - argument_types[func_arg.getArgNumber()] = arg_type; + add_resource_argument(func_arg, resource_type); } - // Loop through the VarHandleOp in the function. When the first VarHandleOp - // for a resource variable is encountered, create a new function argument and - // add an entry to the resource_map to record the information. - for (auto var_handle_op : block.getOps()) { - if (resource_map.count(var_handle_op.shared_nameAttr())) continue; - + // Loop through `tf.VarHandleOp` resource arguments in the function and store + // a mapping from that argument to itself as the current live value. No + // validations are necessary here as these arguments were validated prior to + // being added. + auto var_handle_args = + function.getArguments().take_back(var_handle_shared_names.size()); + for (BlockArgument& var_handle_arg : var_handle_args) { auto resource_type = - getElementTypeOrSelf(var_handle_op.getType()).cast(); - if (resource_type.getSubtypes().size() != 1) - return var_handle_op.emitOpError() - << "expects resource type to have one subtype, got " - << resource_type; - - Type arg_type = resource_type.getSubtypes().front(); - BlockArgument arg = block.addArgument(arg_type); - resource_map[var_handle_op.shared_nameAttr()] = { - static_cast(argument_types.size()), arg}; - argument_types.push_back(arg_type); + getElementTypeOrSelf(var_handle_arg.getType()).cast(); + add_resource_argument(var_handle_arg, resource_type); } - if (resource_map.empty()) return success(); + if (!has_resources) return success(); // We initially assign the argument for a resource as the live value for the // resource. 
We then walk through the operations in the function in their @@ -139,11 +256,9 @@ LogicalResult PromoteResourcesToArguments(FuncOp function) { if (func_arg.getOwner() != &block) return read_op.emitOpError(kResourceFunctionMsg); - read_op.value().replaceAllUsesWith(resource_map[func_arg].live_value); - } else if (auto var_handle_op = llvm::dyn_cast( - read_op.resource().getDefiningOp())) { - read_op.value().replaceAllUsesWith( - resource_map[var_handle_op.shared_nameAttr()].live_value); + ResourceInfo& resource_info = resources[func_arg.getArgNumber()]; + resource_info.read = true; + read_op.value().replaceAllUsesWith(resource_info.live_value); } else { return read_op.emitOpError(kInvalidResourceMsg); } @@ -154,11 +269,9 @@ LogicalResult PromoteResourcesToArguments(FuncOp function) { if (func_arg.getOwner() != &block) return write_op.emitOpError(kResourceFunctionMsg); - resource_map[func_arg].live_value = write_op.value(); - } else if (auto var_handle_op = llvm::dyn_cast( - write_op.resource().getDefiningOp())) { - resource_map[var_handle_op.shared_nameAttr()].live_value = - write_op.value(); + ResourceInfo& resource_info = resources[func_arg.getArgNumber()]; + resource_info.write = true; + resource_info.live_value = write_op.value(); } else { return read_op.emitOpError(kInvalidResourceMsg); } @@ -169,55 +282,68 @@ LogicalResult PromoteResourcesToArguments(FuncOp function) { const int64_t num_results_before = function.getNumResults(); auto return_operands = llvm::to_vector<4>(return_op.getOperands()); - return_operands.reserve(num_results_before + resource_map.size()); auto result_types = llvm::to_vector<4>(return_op.getOperandTypes()); - result_types.reserve(num_results_before + resource_map.size()); + llvm::SmallVector, 4> + output_only_resources; llvm::SmallVector, 4> input_output_alias; - input_output_alias.reserve(resource_map.size()); - // Collect new return values and mapping from resource input index to output - // alias. If the last live value is itself (argument), then that live value - // will not be returned as the resource is unmodified. - for (auto& resource : resource_map) { - int64_t input_index = resource.getSecond().input_index; - Value live_value = resource.getSecond().live_value; - auto live_arg = live_value.dyn_cast(); - if (live_arg && live_arg.getOwner() == &block && - live_arg.getArgNumber() == input_index) + // Collect new return values for variable writes and either (a) output-only + // resource attributes (if the resource is not promoted to an argument) or (b) + // mapping from resource input index to output alias (if the resource has been + // promoted to an argument). Resource arguments that were originally + // `tf.VarHandleOp` but not read are collected and then removed. + OpBuilder builder(return_op); + const int var_handles_start_idx = + function.getNumArguments() - var_handle_shared_names.size(); + int new_argument_index = 0; + llvm::SmallVector argument_indices_to_remove; + for (auto resource_and_index : llvm::enumerate(resources)) { + const auto& resource = resource_and_index.value(); + if (!resource.live_value) { + // Ignore non resource arguments. + ++new_argument_index; continue; - - return_operands.push_back(live_value); - result_types.push_back(live_value.getType()); - input_output_alias.push_back( - {input_index, num_results_before + input_output_alias.size()}); - } - - // Erase all VarHandleOp. 
- for (Operation& op : llvm::make_early_inc_range(function.front())) { - auto var_handle_op = llvm::dyn_cast(op); - if (!var_handle_op) continue; - if (!var_handle_op.use_empty()) { - // SmallSet will use a vector when there is only one element and use - // std::set when there are more than one elements. This ensures that - // the operations in the error message are ordered. - llvm::SmallSet unique_operations; - llvm::for_each( - var_handle_op.getOperation()->getUsers(), [&](Operation* user) { - unique_operations.insert(user->getName().getStringRef().str()); - }); - - return var_handle_op.emitOpError( - "expects no uses but used by operations: ") - << llvm::join(unique_operations.begin(), unique_operations.end(), - ", "); } - op.erase(); + const auto index = resource_and_index.index(); + const bool is_var_handle = index >= var_handles_start_idx; + if (resource.write) { + if (!is_var_handle || resource.read) { + input_output_alias.push_back( + {new_argument_index, return_operands.size()}); + } else if (is_var_handle) { + output_only_resources.push_back( + {return_operands.size(), + var_handle_shared_names[index - var_handles_start_idx]}); + } + return_operands.push_back(resource.live_value); + result_types.push_back(resource.live_value.getType()); + } + + if (is_var_handle && !resource.read) { + assert(block.getArgument(index).getUses().empty()); + argument_indices_to_remove.push_back(index); + } else { + if (is_var_handle) { + // Add resource_name attribute to VarHandleOp read. + function.setArgAttr( + new_argument_index, kResourceNameArgAttr, + builder.getStringAttr( + var_handle_shared_names[index - var_handles_start_idx])); + } + ++new_argument_index; + } } - // Rewrite return if more results need to be returned by the function. - OpBuilder builder(return_op); - if (!input_output_alias.empty()) { + // Remove unread var handle arguments. + for (int argument_index_to_remove : + llvm::reverse(argument_indices_to_remove)) { + block.eraseArgument(argument_index_to_remove); + argument_types.erase(argument_types.begin() + argument_index_to_remove); + } + + // Rewrite return if there are variable writes. + if (return_operands.size() > num_results_before) { builder.create(return_op.getLoc(), return_operands); return_op.erase(); } @@ -225,13 +351,10 @@ LogicalResult PromoteResourcesToArguments(FuncOp function) { // Update function argument and result types with new resource subtypes. function.setType(builder.getFunctionType(argument_types, result_types)); - // Add resource_name attribute to the input argument for the resources. - for (auto& resource : resource_map) { - if (auto attr = resource.getFirst().dyn_cast()) { - function.setArgAttr(resource.getSecond().input_index, "tf.resource_name", - attr); - } - } + // Add resource_name attribute to the output for the resources. + for (auto& resource : output_only_resources) + function.setResultAttr(resource.first, kResourceNameArgAttr, + builder.getStringAttr(resource.second)); // Add aliasing_output attribute to the input argument for the resources that // are updated by the function. @@ -256,26 +379,60 @@ void PromoteResourcesToArgsPass::runOnOperation() { // This routine should only be called when control flow operations are still // represented with TF IfOp and WhileOp operations. In this case, there should // be only one basic blocks in the MLIR representation. 
- if (!has_single_element(main_func.getBlocks())) { - main_func.emitError() << "expects 'main' function to have 1 block, got " - << main_func.getBlocks().size(); - return signalPassFailure(); - } + if (failed(CheckSingleBlockFunction(main_func))) return signalPassFailure(); + llvm::SmallVector var_handle_shared_names; if (failed(ResourceLiftingForFunctionalControlFlow(main_func)) || - failed(PromoteResourcesToArguments(main_func))) + failed(PromoteVarHandlesToArguments(main_func, /*add_validation=*/true, + &var_handle_shared_names)) || + failed(PromoteResourcesToArguments(main_func, var_handle_shared_names))) return signalPassFailure(); } +class PromoteVarHandlesToArgsPass + : public PassWrapper> { + public: + void runOnOperation() override; +}; + +void PromoteVarHandlesToArgsPass::runOnOperation() { + ModuleOp module = getOperation(); + MLIRContext* context = module.getContext(); + for (auto function : module.getOps()) { + if (failed(CheckSingleBlockFunction(function))) return signalPassFailure(); + + llvm::SmallVector var_handle_shared_names; + PromoteVarHandlesToArguments(function, /*add_validation=*/false, + &var_handle_shared_names); + + // Add resource names for each `tf.VarHandleOp` that were promoted to + // resource arguments. + const int var_handle_args_offset = + function.getNumArguments() - var_handle_shared_names.size(); + for (auto var_name_and_index : llvm::enumerate(var_handle_shared_names)) + function.setArgAttr(var_name_and_index.index() + var_handle_args_offset, + kResourceNameArgAttr, + StringAttr::get(var_name_and_index.value(), context)); + } +} + } // namespace std::unique_ptr> CreatePromoteResourcesToArgsPass() { return std::make_unique(); } +std::unique_ptr> CreatePromoteVarHandlesToArgsPass() { + return std::make_unique(); +} + static PassRegistration pass( "tf-promote-resources-to-args", "Promote resources reads/writes to function inputs/outputs."); +static PassRegistration var_handle_pass( + "tf-promote-var-handles-to-args", + "Promote tf.VarHandleOps to function arguments."); + } // namespace TF } // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/replicate_to_island.cc b/tensorflow/compiler/mlir/tensorflow/transforms/replicate_to_island.cc index a781f054755..2fd230005d0 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/replicate_to_island.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/replicate_to_island.cc @@ -19,6 +19,7 @@ limitations under the License. #include #include +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Sequence.h" @@ -107,10 +108,9 @@ llvm::SmallVector ExpandReplicateIntoReplicas( // Creates islands per replica from `tf_device.replicate` region and remap // replicate results with new island outputs. A single island is created to -// forward results from each replica island. Control dependencies of individual -// replicas are added to the single island if the single island does not emit -// a result from the respective replica. Devices are remapped from aliased -// devices to explicit devices, for `tf_device.launch` ops. +// forward control dependencies if there is a control dependency output from the +// replicate island. Devices are remapped from aliased devices to explicit +// devices, for `tf_device.launch` ops. 
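The promoted `tf.VarHandleOp`s become the trailing function arguments, and the loop above tags each of them with its shared name via an enumerate-with-offset index. A small illustrative sketch of that indexing (standalone C++; `TagPromotedArguments` and `set_attr` are hypothetical stand-ins for `function.setArgAttr`, not the pass itself); the newly registered `tf-promote-var-handles-to-args` flag would presumably be exercised through tf-opt like the existing `tf-promote-resources-to-args` pass.

#include <string>

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"

// Compute the absolute argument index of each promoted resource name, given
// that promoted var handles occupy the last `shared_names.size()` arguments.
void TagPromotedArguments(
    const llvm::SmallVector<std::string, 4>& shared_names, int num_arguments,
    llvm::function_ref<void(int, const std::string&)> set_attr) {
  const int offset = num_arguments - static_cast<int>(shared_names.size());
  for (auto name_and_index : llvm::enumerate(shared_names))
    set_attr(offset + static_cast<int>(name_and_index.index()),
             name_and_index.value());
}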
// // For example, the following: // @@ -156,12 +156,9 @@ llvm::SmallVector ExpandReplicateIntoReplicas( // }) {device = "/DEVICE:3"} : () -> tensor // tf_executor.yield %a1, %b1 : tensor, tensor // } -// %6:2 = tf_executor.island(%3#2) { -// tf_executor.yield %0#0 : tensor -// } -LogicalResult CreateIslandsFromReplicate(const Dialect* tf_dialect, - tf_executor::IslandOp island_op, - tf_device::ReplicateOp replicate_op) { +void CreateIslandsFromReplicate(const Dialect* tf_dialect, + tf_executor::IslandOp island_op, + tf_device::ReplicateOp replicate_op) { OpBuilder builder(island_op); const int num_replicas = replicate_op.n().getLimitedValue(); @@ -181,44 +178,38 @@ LogicalResult CreateIslandsFromReplicate(const Dialect* tf_dialect, replica_result_and_idx.value(); // Remap replicate results to per replica result. - replicate_op.replaceAllUsesWith(replicas_outputs); + for (auto result : llvm::zip(island_op.outputs(), replicas_outputs)) + std::get<0>(result).replaceAllUsesWith(std::get<1>(result)); - // Collect per replica control dependency and add to island operand if replica - // island has no uses. - llvm::SmallVector island_operands; - for (auto& replica : replicas) - if (replica.use_empty()) island_operands.push_back(replica.control()); + // Add sink island to pin all replicas as a control dependency if there is a + // control dependency leading from the replicate originally. + if (!island_op.control().use_empty()) { + llvm::SmallVector island_operands; + for (auto& replica : replicas) island_operands.push_back(replica.control()); - // Create single island forwarding per replica result. - builder.setInsertionPoint(island_op); - auto island_sink = builder.create( - island_op.getLoc(), llvm::to_vector<8>(island_op.getResultTypes()), - island_operands, llvm::ArrayRef{}); - island_sink.body().push_back(new Block); - - // Move replicate island YieldOp over to new single island. - island_op.GetYield().getOperation()->moveBefore( - &island_sink.GetBody(), island_sink.GetBody().begin()); - - // Remap island results. - island_op.replaceAllUsesWith(island_sink); + builder.setInsertionPoint(island_op); + auto island_sink = builder.create( + island_op.getLoc(), llvm::ArrayRef{}, + tf_executor::ControlType::get(island_op.getContext()), island_operands); + island_sink.body().push_back(new Block); + builder.setInsertionPointToEnd(&island_sink.GetBody()); + builder.create(island_op.getLoc(), + llvm::ArrayRef{}); + island_op.control().replaceAllUsesWith(island_sink.control()); + } island_op.erase(); - return success(); } // Finds islands with a single `tf_device.replicate` and create individual // islands per replica of the replicate. 
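Illustrative only (not part of the patch; names are hypothetical): the per-result remapping above walks two equally sized ranges in lock step with `llvm::zip` so each old island output is replaced by its positional per-replica counterpart, modeled here as building old-to-new pairs.

#include <utility>
#include <vector>

#include "llvm/ADT/STLExtras.h"

// Pair each old value with its positional replacement, as zip does above.
std::vector<std::pair<int, int>> PairOldWithNew(
    const std::vector<int>& old_vals, const std::vector<int>& new_vals) {
  std::vector<std::pair<int, int>> remapping;
  for (auto old_and_new : llvm::zip(old_vals, new_vals))
    remapping.emplace_back(std::get<0>(old_and_new), std::get<1>(old_and_new));
  return remapping;
}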
-LogicalResult LowerSingleIslandReplicateToIslands( - const Dialect* tf_dialect, tf_executor::IslandOp island_op) { - if (!has_single_element(island_op.GetBody().without_terminator())) - return success(); +void LowerSingleIslandReplicateToIslands(const Dialect* tf_dialect, + tf_executor::IslandOp island_op) { + if (!island_op.WrapsSingleOp()) return; if (auto replicate_op = llvm::dyn_cast(&island_op.GetBody().front())) - return CreateIslandsFromReplicate(tf_dialect, island_op, replicate_op); - - return success(); + CreateIslandsFromReplicate(tf_dialect, island_op, replicate_op); } void ReplicateToIslandPass::runOnFunction() { @@ -228,13 +219,9 @@ void ReplicateToIslandPass::runOnFunction() { getFunction().emitError() << "'tf' dialect is not registered"; } - auto result = getFunction().walk([&](tf_executor::IslandOp island_op) { - if (failed(LowerSingleIslandReplicateToIslands(tf_dialect, island_op))) - return WalkResult::interrupt(); - return WalkResult::advance(); + getFunction().walk([&](tf_executor::IslandOp island_op) { + LowerSingleIslandReplicateToIslands(tf_dialect, island_op); }); - - if (result.wasInterrupted()) return signalPassFailure(); } } // anonymous namespace diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc b/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc index eea8ad8caad..611c4d2725a 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc @@ -62,7 +62,7 @@ namespace { // TensorFlow resource variable and returns new value: // // %resource_handle = "tf.VarHandleOp"() -// %1 = "tf_device.launch"() ( { +// %1 = "tf_device.cluster"() ( { // %init_value = "tf.ReadVariableOp"(%resource_handle) // "tf.AssignAddVariableOp"(%resource_handle, %init_value) // %new_value = "tf.ReadVariableOp"(%resource_handle) @@ -73,7 +73,7 @@ namespace { // // %resource_handle = "tf.VarHandleOp"() // %init_value = "tf.ReadVariableOp"(%resource_handle) -// %1:2 = "tf_device.launch"() ( { +// %1:2 = "tf_device.cluster"() ( { // %new_value = "tf.AddV2"(%init_value, %init_value) // tf_device.return %new_value, %new_value // }) @@ -81,7 +81,7 @@ namespace { // // You can see that there are a few main changes applied: // 1) All the resource variable reads and writes are now outside of -// tf_device.launch op. +// tf_device.cluster op. // 2) Instead of taking resource handles as input, this device computation now // takes snapshotted values of that device. // 3) Some resource load operations are eliminated with store-load forwarding. @@ -89,13 +89,13 @@ namespace { // external resource store operations so that resources are still updated // after the computation. // -// If the launch body contains functional control flow, the pass first lifts the -// loads/stores in the body/cond/branch functions to the launch body, then +// If the cluster body contains functional control flow, the pass first lifts +// the loads/stores in the body/cond/branch functions to the cluster body, then // performs the above lifting. E.g., // -// func @launch_with_loop() -> () { +// func @cluster_with_loop() -> () { // %0 = "tf.VarHandleOp"() ... -// "tf_device.launch"() ( { +// "tf_device.cluster"() ( { // %1 = "tf.While"(%0) {body = @while_body, cond = @while_cond} // tf_device.return // }) @@ -113,10 +113,10 @@ namespace { // // will be be transformed to: // -// func @launch_with_loop() { +// func @cluster_with_loop() { // %0 = "tf.VarHandleOp"() ... 
// %1 = "tf.ReadVariableOp"(%0) -// %2 = "tf_device.launch"() ( { +// %2 = "tf_device.cluster"() ( { // %3 = "tf.While"(%1) {body = @while_body, cond = @while_cond} // tf_device.return %3 : tensor // }) : () -> tensor @@ -140,7 +140,7 @@ struct ResourceOpLiftingPass // such nodes to carry information. void RemoveIdentity(Block* block) { for (auto& op : llvm::make_early_inc_range(*block)) { - if (llvm::isa(&op) || llvm::isa(&op)) { + if (isa(&op) || isa(&op)) { op.replaceAllUsesWith(op.getOperands()); op.erase(); } @@ -241,7 +241,7 @@ bool AppendResourceStoreValueToReturn(Block* body) { // TODO(ycao): Prevent same value from being returned multiple times. // TODO(ycao): Do not return resource store value if it is defined outside - // of launch_op. + // of cluster. new_return_operands.push_back(assign_variable_op.value()); has_resource_store = true; } @@ -256,81 +256,78 @@ bool AppendResourceStoreValueToReturn(Block* body) { return true; } -// Moves resource store operations to after launch_op. This assumes load-store -// forwarding has been performed on this launch_op such that there is at most -// one resource store operation carrying its final value. -tf_device::LaunchOp SinkResourceStores(tf_device::LaunchOp launch_op, - OpBuilder* builder) { - // Update ReturnOp inside launch_op's body to output final values of updated +// Moves resource store operations to after cluster. This assumes load-store +// forwarding has been performed on this cluster such that there is at most one +// resource store operation carrying its final value. +tf_device::ClusterOp SinkResourceStores(tf_device::ClusterOp cluster, + OpBuilder* builder) { + // Update ReturnOp inside cluster's body to output final values of updated // external resources. - if (!AppendResourceStoreValueToReturn(&launch_op.GetBody())) return launch_op; + if (!AppendResourceStoreValueToReturn(&cluster.GetBody())) return cluster; - auto new_return_op = launch_op.GetBody().getTerminator(); - llvm::SmallVector new_launch_return_types( - new_return_op->getOperandTypes()); + auto new_return_op = cluster.GetBody().getTerminator(); + llvm::SmallVector new_return_types(new_return_op->getOperandTypes()); - builder->setInsertionPoint(launch_op); - auto new_launch_op = builder->create( - launch_op.getLoc(), new_launch_return_types, - /*operands=*/llvm::SmallVector(), launch_op.getAttrs()); - new_launch_op.body().takeBody(launch_op.body()); + builder->setInsertionPoint(cluster); + auto new_cluster = builder->create( + cluster.getLoc(), new_return_types, + /*operands=*/llvm::SmallVector(), cluster.getAttrs()); + new_cluster.body().takeBody(cluster.body()); - // Replace uses of old launch_op results with those of new_launch_op. - for (auto p : llvm::zip(launch_op.getResults(), new_launch_op.getResults())) { - std::get<0>(p).replaceAllUsesWith(std::get<1>(p)); - } + // Replace uses of old cluster results with those of new_cluster. + for (auto result : llvm::zip(cluster.getResults(), new_cluster.getResults())) + std::get<0>(result).replaceAllUsesWith(std::get<1>(result)); - // Create a mapping from operands of new_return_op operands to new_launch_op + // Create a mapping from operands of new_return_op operands to new_cluster // results. 
BlockAndValueMapping mapper; - for (auto p : - llvm::zip(new_return_op->getOperands(), new_launch_op.getResults())) { - mapper.map(std::get<0>(p), std::get<1>(p)); - } + for (auto operand_result : + llvm::zip(new_return_op->getOperands(), new_cluster.getResults())) + mapper.map(std::get<0>(operand_result), std::get<1>(operand_result)); // Clone all resource store ops and map their operands to values returned from - // new_launch_op. - for (Operation& op : llvm::make_early_inc_range(new_launch_op.GetBody())) { - if (dyn_cast(&op)) { + // new_cluster. + for (Operation& op : llvm::make_early_inc_range(new_cluster.GetBody())) { + if (isa(op)) { builder->clone(op, mapper); op.erase(); } } - launch_op.erase(); - return new_launch_op; + cluster.erase(); + return new_cluster; } -// Hoists resource variable loads and sinks stores from launch_op. -LogicalResult HoistResourceOpsFromLaunchOp(tf_device::LaunchOp launch_op) { - ModuleOp m = launch_op.getParentOfType(); - OpBuilder builder(m); +// Hoists resource variable loads and sinks stores from cluster. +LogicalResult HoistResourceOpsFromCluster(tf_device::ClusterOp cluster, + ModuleOp module) { + OpBuilder builder(module); // Remove identity nodes to avoid aliasing. - RemoveIdentity(&launch_op.GetBody()); + RemoveIdentity(&cluster.GetBody()); // Perform store-load forwarding. So that each resource is only loaded with // its initial value and is only stored with its final value. - ForwardStoreToLoad(&launch_op.GetBody()); + ForwardStoreToLoad(&cluster.GetBody()); - // Move loads of external resources, if any, to before launch_op. - // (Skipping resources created inside of launch_op.) + // Move loads of external resources, if any, to before cluster. + // (Skipping resources created inside of cluster.) HoistResourceLoads( - &launch_op.GetBody(), + &cluster.GetBody(), /*skip_load=*/ [&](TF::ReadVariableOp read) { - return read.resource().getParentRegion() == &launch_op.body(); + return read.resource().getParentRegion() == &cluster.body(); }, /*move_load=*/ [&](TF::ReadVariableOp read) { - read.getOperation()->moveBefore(launch_op); + read.getOperation()->moveBefore(cluster); }); - // Move stores of external resources, if any, to after launch_op. - auto new_launch_op = SinkResourceStores(launch_op, &builder); + // Move stores of external resources, if any, to after cluster. + auto new_cluster = SinkResourceStores(cluster, &builder); llvm::SetVector captured_values; - getUsedValuesDefinedAbove(new_launch_op.body(), new_launch_op.body(), + getUsedValuesDefinedAbove(new_cluster.body(), new_cluster.body(), captured_values); for (Value v : captured_values) { @@ -338,7 +335,7 @@ LogicalResult HoistResourceOpsFromLaunchOp(tf_device::LaunchOp launch_op) { if (!tensor_type) continue; if (!tensor_type.getElementType().isa()) continue; - return new_launch_op.emitOpError() + return new_cluster.emitOpError() << "has remaining resource inputs that can not be lifted"; } @@ -378,8 +375,7 @@ LogicalResult FindResourceArgUseInfo( info.data_type = assign.value().getType(); continue; } - if (llvm::isa(user) || - llvm::isa(user)) { + if (isa(user) || isa(user)) { // Stacks will be handled by a separate pass. 
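A simplified model (not the MLIR implementation; `HoistLoads` is a hypothetical name) of the callback-driven hoisting step used by `HoistResourceLoads` above: loads whose resource is defined inside the cluster are skipped via `skip_load`, and every other load is moved ahead of the cluster via `move_load`.

#include <vector>

#include "llvm/ADT/STLExtras.h"

// Skip loads the caller declares local; move the rest via the callback.
template <typename LoadT>
void HoistLoads(std::vector<LoadT>& loads,
                llvm::function_ref<bool(const LoadT&)> skip_load,
                llvm::function_ref<void(LoadT&)> move_load) {
  for (LoadT& load : loads)
    if (!skip_load(load)) move_load(load);
}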
do_not_touch = true; break; @@ -654,11 +650,8 @@ LogicalResult HanldeWhileLoop(TF::WhileOp while_op, FuncOp body, FuncOp cond) { arg_data_type_and_updated_output_index[entry.getFirst()] = { entry.getSecond(), update_index}; if (!new_output_shapes.empty()) { - tensorflow::TensorShapeProto shape_proto; - tensorflow::ConvertTypeToTensorShape(entry.getSecond()) - .AsProto(&shape_proto); - new_output_shapes[entry.getFirst()] = builder.getStringAttr( - tensorflow::mangling_util::MangleShape(shape_proto)); + new_output_shapes[entry.getFirst()] = + tensorflow::ConvertTypeToTensorShapeAttr(entry.getSecond()); } } AddLoadsStoresOutsideControlFlowOp(new_while, @@ -800,11 +793,8 @@ LogicalResult HandleIfOP(TF::IfOp if_op, FuncOp then_branch, arg_data_type_and_updated_output_index[entry.getFirst() + 1] = { entry.getSecond(), update_index}; if (!if_op.output_shapes().getValue().empty() && update_index >= 0) { - tensorflow::TensorShapeProto shape_proto; - tensorflow::ConvertTypeToTensorShape(entry.getSecond()) - .AsProto(&shape_proto); - new_output_shapes.push_back(builder.getStringAttr( - tensorflow::mangling_util::MangleShape(shape_proto))); + new_output_shapes.push_back( + tensorflow::ConvertTypeToTensorShapeAttr(entry.getSecond())); } } AddLoadsStoresOutsideControlFlowOp(new_if, @@ -1040,7 +1030,7 @@ LogicalResult HoistForFunctionalControlFlow( for (auto local_var : local_vars) { if (llvm::all_of(local_var.resource().getUsers(), [](const Operation* user) { - return llvm::isa(user); + return isa(user); })) { for (auto user : local_var.resource().getUsers()) user->erase(); local_var.erase(); @@ -1049,18 +1039,18 @@ LogicalResult HoistForFunctionalControlFlow( return success(); } -// Lifts resource operation from tf_device.launch_func ops nested in `op` -// outside. Returns failure if there are remaining resource-type values that can -// not be lifted. +// Lifts resource operation from tf_device.cluster ops nested in `op` outside. +// Returns failure if there are remaining resource-type values that can not be +// lifted. void ResourceOpLiftingPass::runOnOperation() { llvm::SmallDenseMap lifted_partitioned_call_callees; - auto result = getOperation().walk([&](FuncOp func_op) { - return func_op.walk([&](tf_device::LaunchOp launch_op) { + ModuleOp module = getOperation(); + auto result = module.walk([&](FuncOp func_op) { + return func_op.walk([&](tf_device::ClusterOp cluster) { if (failed(HoistForFunctionalControlFlow( - &launch_op.GetBody(), getOperation(), - &lifted_partitioned_call_callees)) || - failed(HoistResourceOpsFromLaunchOp(launch_op))) { + &cluster.GetBody(), module, &lifted_partitioned_call_callees)) || + failed(HoistResourceOpsFromCluster(cluster, module))) { return WalkResult::interrupt(); } return WalkResult::advance(); @@ -1112,7 +1102,7 @@ LogicalResult ResourceLiftingForFunctionalControlFlow(FuncOp function) { // This routine should only be called when control flow operations are still // represented with TF IfOp and WhileOp operations. In this case, there should // be only one basic blocks in the MLIR representation. 
- if (!has_single_element(function.getBlocks())) { + if (!hasSingleElement(function.getBlocks())) { return function.emitError() << "expect the function to have 1 block while it has " << function.getBlocks().size(); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc index d3a6adbbce6..5a2cae38062 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc @@ -19,6 +19,8 @@ limitations under the License. #include #include +#include "llvm/ADT/Hashing.h" +#include "llvm/ADT/PointerUnion.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/iterator_range.h" @@ -26,10 +28,12 @@ limitations under the License. #include "llvm/Support/Debug.h" #include "llvm/Support/FormatVariadic.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Block.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/Diagnostics.h" // from @llvm-project #include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/Module.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/OperationSupport.h" // from @llvm-project #include "mlir/IR/StandardTypes.h" // from @llvm-project @@ -55,12 +59,14 @@ limitations under the License. #define DEBUG_TYPE "tf-shape-inference" using ::tensorflow::int64; +using tensorflow::shape_inference::DimensionHandle; +using tensorflow::shape_inference::InferenceContext; +using tensorflow::shape_inference::ShapeHandle; namespace mlir { namespace TF { namespace { -Optional> InferShapeForFunctionReturnType( - FuncOp func) { +Optional> InferShapeForFunctionReturnType(FuncOp func) { // Find any return ops. SmallVector return_ops; for (Block& block : func) { @@ -120,19 +126,19 @@ bool IsSupportedNonTFOp(Operation* op) { // not a TF operation, as we can't guarantee that the new type will be OK. void AddCastBackForUnsupportedNonTFUses(Operation* op, Value result, Dialect* tf_dialect, Type old_type) { - OpBuilder builder(op); - builder.setInsertionPointAfter(op); // A tf.Cast operation is lazily created on the first uses that isn't a TF // operation. 
TF::CastOp cast_op; auto get_cast_op = [&]() { - if (!cast_op) - cast_op = - builder.create(op->getLoc(), old_type, result, - /*truncate=*/builder.getBoolAttr(false)); - return mlir::Value(cast_op); + if (!cast_op) { + OpBuilder b(op); + b.setInsertionPointAfter(op); + cast_op = b.create(op->getLoc(), old_type, result, + /*truncate=*/b.getBoolAttr(false)); + } + return Value(cast_op); }; - for (OpOperand& use : llvm::make_early_inc_range(result.getUses())) { + for (OpOperand& use : make_early_inc_range(result.getUses())) { if (use.getOwner()->getDialect() != tf_dialect && !IsSupportedNonTFOp(use.getOwner())) use.set(get_cast_op()); @@ -155,10 +161,22 @@ Optional GetShapeFromMlirType(Type t) { bool InferShapeForPassThroughOps(OperandRange pass_through_operands, Operation* op, Dialect* tf_dialect) { bool changed = false; - for (auto entry : llvm::zip(pass_through_operands, op->getResults())) { + for (auto entry : zip(pass_through_operands, op->getResults())) { Type operand_type = std::get<0>(entry).getType(); Value result = std::get<1>(entry); if (result.getType() == operand_type) continue; + // Pass through nodes may remove ref types, don't consider that as + // refinement. + // TODO(jpienaar): There could be refinement in addition to this, so + // refine this. + if (operand_type.cast() + .getElementType() + .isa() && + !result.getType() + .cast() + .getElementType() + .isa()) + continue; AddCastBackForUnsupportedNonTFUses(op, result, tf_dialect, result.getType()); result.setType(operand_type); @@ -184,6 +202,11 @@ bool InferShapeForNonTFDialectOperation(Operation* op, Dialect* tf_dialect) { iter_sink.getOperands().drop_front().take_front(), iter_source, tf_dialect); } + // TODO(b/155227679): Use OpInterface instead of hard-coding for TensorCastOp. + if (auto tensor_cast = dyn_cast(op)) { + return InferShapeForPassThroughOps( + tensor_cast.getOperation()->getOperands(), op, tf_dialect); + } return false; } @@ -230,15 +253,36 @@ GetSubtypes(Type type) { // match the i-th operand type). Returns true if anything is changed. bool PassThroughOperandTypes(OperandRange operands, ResultRange results) { bool changed = false; - for (auto entry : llvm::zip(operands, results)) { + for (auto entry : zip(operands, results)) { Type operand_type = std::get<0>(entry).getType(); - if (operand_type == std::get<1>(entry).getType()) continue; + Type result_type = std::get<1>(entry).getType(); + if (operand_type == result_type) continue; + // Pass through nodes may remove ref types, don't consider that as + // refinement. + // TODO(jpienaar): There could be refinement in addition to this, so + // refine this. + if (operand_type.cast() + .getElementType() + .isa() && + !result_type.cast() + .getElementType() + .isa()) + continue; + std::get<1>(entry).setType(operand_type); changed = true; } return changed; } +// Returns whether type can be further refined. +bool CanBeRefined(Type type) { + auto shape_type = type.dyn_cast(); + return shape_type && (!shape_type.hasStaticShape() || + shape_type.getElementType().isa() || + shape_type.getElementType().isa()); +} + // Infers the shape from a (Stateful)PartionedCall operation by looking up the // called function and propagating the return type. 
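Standalone sketch (illustrative only, `MakeLazy` is a hypothetical name) of the create-on-first-use pattern behind `get_cast_op` above: the lambda caches whatever `create` returns, so the expensive object, here just an int standing in for the tf.Cast, is materialized at most once no matter how many uses request it.

#include <functional>

#include "llvm/ADT/Optional.h"

// Returns a callable that lazily creates and then reuses a single value.
std::function<int()> MakeLazy(std::function<int()> create) {
  return [cached = llvm::Optional<int>(), create]() mutable {
    if (!cached) cached = create();
    return *cached;
  };
}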
bool InferShapeForCall(Operation* op) { @@ -246,19 +290,18 @@ bool InferShapeForCall(Operation* op) { CallInterfaceCallable callable = call_op.getCallableForCallee(); SymbolRefAttr sym = callable.dyn_cast(); if (!sym) return false; - FuncOp func = - dyn_cast(SymbolTable::lookupNearestSymbolFrom(op, sym)); + FuncOp func = dyn_cast(SymbolTable::lookupNearestSymbolFrom(op, sym)); if (!func) return false; bool changed = false; // Map each of the results of the call to the returned type of the // function. - for (auto result : llvm::zip(op->getResults(), func.getType().getResults())) { + for (auto result : zip(op->getResults(), func.getType().getResults())) { if (std::get<0>(result).getType() == std::get<1>(result)) continue; // Skip already statically shaped results. - auto shaped_type = std::get<0>(result).getType().dyn_cast(); - if (!shaped_type || shaped_type.hasStaticShape()) continue; + if (!CanBeRefined(std::get<0>(result).getType())) continue; + auto shaped_type = std::get<0>(result).getType().cast(); auto new_type = std::get<1>(result).dyn_cast(); if (!new_type) continue; @@ -273,11 +316,293 @@ bool InferShapeForCall(Operation* op) { return changed; } +bool RefineWithInferTypeOpInterface(InferTypeOpInterface infer_ti, + Dialect* tf_dialect) { + Operation* op = infer_ti.getOperation(); + SmallVector inferred; + LogicalResult res = infer_ti.inferReturnTypes( + op->getContext(), op->getLoc(), op->getOperands(), + op->getAttrDictionary(), op->getRegions(), inferred); + if (failed(res)) { + op->emitOpError("failed to refine type as inference failed"); + return false; + } + + if (inferred == op->getResultTypes()) return false; + + // Map each of the results of the call to the returned type of the + // function. + bool changed = false; + for (auto result : zip(op->getResults(), inferred)) { + if (std::get<0>(result).getType() == std::get<1>(result)) continue; + + // Inserts a cast back to the original type if any user is not in the + // TF dialect. + AddCastBackForUnsupportedNonTFUses(op, std::get<0>(result), + op->getDialect(), std::get<1>(result)); + // Finally we inferred the shape and replace the type for this result. + std::get<0>(result).setType(std::get<1>(result)); + changed = true; + } + return changed; +} + } // namespace -bool InferShapeForSingleOperation(Operation* op, Dialect* tf_dialect, - int64_t graph_version) { - assert(tf_dialect == op->getDialect()); +// Combination of value producer and port of value produced (e.g., +// :, +// so for tf.Const -> tensor<10x20xf32>, [0,2,18] would point to a unique output +// scalar value). +struct ValuePort { + PointerUnion producer; + SmallVector port; + + bool operator==(const ValuePort& other) const { + return producer == other.producer && port == other.port; + } + + // Convert output value to ValuePort. 
+ explicit ValuePort(Value v) { + OpResult opr = v.dyn_cast(); + if (opr) { + producer = opr.getOwner(); + port = {opr.getResultNumber()}; + } else { + producer = v.cast(); + port = {0}; + } + } + ValuePort(PointerUnion producer, + SmallVector port) + : producer(producer), port(port) {} + + raw_ostream& print(raw_ostream& os) const { + if (auto* op = producer.dyn_cast()) + os << "op " << op->getName(); + if (auto ba = producer.dyn_cast()) + os << "block_arg " << ba.getArgNumber(); + os << formatv(" [{0}]", llvm::make_range(port.begin(), port.end())); + return os; + } +}; + +struct ValuePortHasher { + std::size_t operator()(const ValuePort& other) const { + return hash_combine(llvm::hash_value(other.producer.getOpaqueValue()), + hash_value(ArrayRef(other.port))); + } +}; + +using ValuePortResultMap = + std::unordered_map; +using ComputedQueryFn = function_ref; +using ValueQueryFn = function_ref; +using ValuePortInputs = SmallVectorImpl; + +// TODO(jpienaar): ComputeInputsRequiredForOutput and ComputeOutputComponent are +// intended to be switched to op interfaces once more refined. +LogicalResult ComputeInputsRequiredForOutput(ValuePort value_port, + ComputedQueryFn has_been_computed, + ValuePortInputs* inputs) { + auto op = value_port.producer.dyn_cast(); + auto& port = value_port.port; + if (!op) return failure(); + + // No inputs required for constants. + if (matchPattern(op, m_Constant())) return success(); + + // Note: this focusses only on the trivial pack op case and this could be + // generalized. + if (auto pack_op = dyn_cast(op)) { + if (pack_op.getType().cast().getRank() != 1) return failure(); + if (port.size() != 2) return failure(); + assert(port[0] == 0); + ValuePort req(pack_op.getOperand(port[1])); + if (!has_been_computed(req)) inputs->push_back(req); + return success(); + } + + return failure(); +} + +// Computes the output produced by ValuePort using the query function of +// existing computed values. +Attribute ComputeOutputComponent(const ValuePort& value_port, + ValueQueryFn values) { + LLVM_DEBUG(value_port.print(llvm::errs() << "\nComputing output for ")); + + auto op = value_port.producer.dyn_cast(); + if (!op) return nullptr; + auto& port = value_port.port; + + if (port.empty()) { + LLVM_DEBUG(llvm::dbgs() << "skipping, port outside spec of " << op << "\n"); + return nullptr; + } + + ElementsAttr attr; + if (matchPattern(op, m_Constant(&attr))) { + if (port.size() == 1 && port[0] == 0) return attr; + return nullptr; + } + + // Note: this focusses only on the trivial pack op case and this could be + // generalized. + if (auto pack_op = dyn_cast(op)) { + if (pack_op.getType().cast().getRank() != 1) return nullptr; + if (port.size() != 2 || port[0] != 0) return nullptr; + ValuePort op_port(op->getOperand(port[1])); + return values(op_port); + } + return nullptr; +} + +// Context used during ShapeInference. This class contains common information +// that is required by the individual shape inference helper functions (e.g., +// TF Graph version, constant values computed, etc.) 
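A hypothetical reduction of the ValuePort/ValuePortHasher pairing above (standalone C++, not the actual types): a (producer, port) key is hashed by combining the producer pointer with the port path, which is what lets it serve as the key of an `std::unordered_map` like `ValuePortResultMap`.

#include <cstddef>
#include <unordered_map>

#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/SmallVector.h"

// Toy stand-in for ValuePort: an opaque producer plus an index path.
struct Key {
  void* producer;
  llvm::SmallVector<unsigned, 2> port;

  bool operator==(const Key& other) const {
    return producer == other.producer && port == other.port;
  }
};

// Toy stand-in for ValuePortHasher: combine producer and port hashes.
struct KeyHasher {
  std::size_t operator()(const Key& key) const {
    return llvm::hash_combine(
        llvm::hash_value(key.producer),
        llvm::hash_combine_range(key.port.begin(), key.port.end()));
  }
};

using KeyToValue = std::unordered_map<Key, int, KeyHasher>;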
+class ShapeInference { + public: + ShapeInference(int64_t graph_version, MLIRContext* context); + + LogicalResult ComputeInputsRequiredForOutput(ValuePort value_port, + ValuePortInputs* inputs) { + return ::mlir::TF::ComputeInputsRequiredForOutput( + value_port, + [this](const ValuePort& port) { + return results_.find(port) != results_.end(); + }, + inputs); + } + + Attribute ComputeOutputComponent(const ValuePort& value_port) { + return ::mlir::TF::ComputeOutputComponent( + value_port, [this](const ValuePort& port) { return results_[port]; }); + } + + // Returns ShapeHandle if the op result could be computed as shape. + ShapeHandle ComputeOutputAsShape(OpResult result, InferenceContext* ic); + + void RecordValue(const ValuePort& value_port, Attribute value) { + results_[value_port] = value; + } + + // Performs shape inference on the provided op and return true if the type of + // at least one result has been changed. + // A tf.Cast() is inserted for any uses that isn't in the TensorFlow dialect. + // `graph_version` indicates the current GraphDef compatibility versions + // (the versions field in graph.proto). + bool InferShapeForSingleOperation(Operation* op); + + // Infers shape on the provided region, including nested ones, iterate until + // fix point with a limit of max_iteration. Returns success if fix point is + // reached before max_iteration. + LogicalResult InferShapeUntilFixPoint(Region* region, + int64_t max_iteration = 10); + + // Updates input types and refine shapes inside body of functions that are + // attached to ControlFlow ops (If/While). These functions include Then/Else + // branches of IfOp and Cond/Body functions of WhileOp. These functions share + // following common properties: + // 1) They are never reused, ie. having a single use in module. + // 2) Their input types match those of their parent ops (excluding inputs + // like predicate). + // Returns a boolean indicating whether any change has been applied. + LogicalResult RefineShapeForControlFlowFunc(FuncOp func, + ArrayRef input_types, + int64_t max_iteration); + + // Propagate the shapes to the functions named. + LogicalResult PropagateShapeToFunctions( + ModuleOp module, Operation::operand_type_range input_types, + ArrayRef func_names, int64_t max_iteration); + + // Shape propagation for call/control flow ops. + LogicalResult PropagateShapeIntoAttachedFunctions(Operation* op, + int64_t max_iteration); + + private: + // Mapping between ValuePort (which corresponds to an OpResult or smaller, + // e.g., first element of OpResult produded) to an Attribute if the ValuePort + // corresponds to a constant value. + ValuePortResultMap results_; + int64_t graph_version_; + MLIRContext* context_; + Dialect* tf_dialect_; +}; + +ShapeInference::ShapeInference(int64_t graph_version, MLIRContext* context) + : graph_version_(graph_version) { + context_ = context; + tf_dialect_ = context->getRegisteredDialect(); +} + +ShapeHandle ShapeInference::ComputeOutputAsShape(OpResult result, + InferenceContext* ic) { + LLVM_DEBUG(result.print(llvm::dbgs() << "\nEvaluate partially ")); + auto rt = result.getType().dyn_cast(); + if (!rt || !rt.hasStaticShape() || rt.getRank() != 1) return {}; + int dim_size = rt.getDimSize(0); + + // Worklist to direct partial evaluation. + SmallVector worklist; + + // Simple evaluator that attempts to partially evaluate the input value even + // if unable to evaluate the complete output. 
Below follows a simple stack + // based evaluation where it queries what operands/part of operands need to + // be evaluated and attempting to partially evaluate those operands. It does + // so by pushing the operands that need to be required on to the worklist + // before enqueuing the operation requiering those values. + std::vector dims(dim_size, ic->UnknownDim()); + for (unsigned int i = 0, e = dims.size(); i != e; ++i) { + LLVM_DEBUG(llvm::dbgs() << "\nConsidering output dim " << i << "\n"); + + worklist.push_back( + ValuePort{result.getOwner(), {result.getResultNumber(), i}}); + while (!worklist.empty()) { + auto front = worklist.pop_back_val(); + LLVM_DEBUG(front.print(llvm::errs() << "\nWorklist front ")); + + SmallVector inputs; + auto res = ComputeInputsRequiredForOutput(front, &inputs); + if (failed(res)) { + // Abort if unable to find which required inputs need to be computed. + worklist.clear(); + break; + } + + if (!inputs.empty()) { + // Enqueue required computation followed by its required operands in + // stack. + worklist.push_back(std::move(front)); + for (auto& it : inputs) worklist.push_back(std::move(it)); + continue; + } + + auto ret = ComputeOutputComponent(front); + if (!ret) continue; + + RecordValue(front, ret); + LLVM_DEBUG(ret.print(llvm::dbgs() << "\ncomputed result = ")); + + // If worklist is empty, then this is the root query op. + if (worklist.empty()) { + LLVM_DEBUG(llvm::dbgs() << "[root node]\n"); + if (auto dea = ret.dyn_cast()) { + if (dea.getNumElements() != 1) { + LLVM_DEBUG(llvm::errs() << "Unexpected number of elements\n"); + return {}; + } + int64_t val = (*dea.getIntValues().begin()).getSExtValue(); + dims[i] = ic->MakeDim(val); + } + } + } + } + return ic->MakeShape(dims); +} + +bool ShapeInference::InferShapeForSingleOperation(Operation* op) { + assert(tf_dialect_ == op->getDialect()); // The shape function of these ops sometimes does not propagate subtypes // (handle shapes) for resource and variant types. We use a simple passthrough // to make sure they are preserved in the output. @@ -289,15 +614,9 @@ bool InferShapeForSingleOperation(Operation* op, Dialect* tf_dialect, // If no result for this op needs shape inference, we have a fast-path return. // But if the type is a resource/variant, we do not skip it because we might // not have the handle shapes. - if (llvm::all_of(op->getResultTypes(), [](Type type) { - auto shape_type = type.dyn_cast(); - return !shape_type || - (shape_type.hasStaticShape() && - !shape_type.getElementType().isa() && - !shape_type.getElementType().isa()); - })) { + if (none_of(op->getResultTypes(), CanBeRefined)) { LLVM_DEBUG(llvm::dbgs() << "Skipping inference for statically shaped op '" - << op->getName() << "'.\n";); + << op->getName() << "'.\n"); return false; } @@ -310,12 +629,12 @@ bool InferShapeForSingleOperation(Operation* op, Dialect* tf_dialect, // This is necessary to avoid reprocessing the tf.Cast that are inserted at // the end of this function. 
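A dependency-free model of the worklist evaluation above (the real code works on ValuePorts and ElementsAttrs; `EvaluateWithWorklist`, `missing_inputs`, and `evaluate` are hypothetical names): a node is pushed back behind any inputs that still need computing and is only evaluated once those inputs have been recorded; when the worklist drains, the last recorded value answers the root query.

#include <map>
#include <vector>

#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"

using Node = int;

// Stack-based partial evaluation driven by two caller-supplied callbacks.
llvm::Optional<int> EvaluateWithWorklist(
    Node root,
    llvm::function_ref<std::vector<Node>(Node, const std::map<Node, int>&)>
        missing_inputs,
    llvm::function_ref<llvm::Optional<int>(Node, const std::map<Node, int>&)>
        evaluate) {
  std::map<Node, int> computed;
  std::vector<Node> worklist{root};
  while (!worklist.empty()) {
    Node node = worklist.back();
    worklist.pop_back();

    std::vector<Node> inputs = missing_inputs(node, computed);
    if (!inputs.empty()) {
      // Re-enqueue the node behind the inputs it still requires.
      worklist.push_back(node);
      worklist.insert(worklist.end(), inputs.begin(), inputs.end());
      continue;
    }

    llvm::Optional<int> value = evaluate(node, computed);
    if (!value) continue;
    computed[node] = *value;
    if (worklist.empty()) return value;  // Root query has been answered.
  }
  return llvm::None;
}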
if (isa(op) && - llvm::all_of(op->getResult(0).getUsers(), [&](Operation* user) { - return user->getDialect() != tf_dialect; + all_of(op->getResult(0).getUsers(), [&](Operation* user) { + return user->getDialect() != tf_dialect_; })) { LLVM_DEBUG(llvm::dbgs() << "Skipping inference for tf.Cast with no TF " "dialect operation users '" - << *op << "'.\n";); + << *op << "'.\n"); return false; } @@ -330,13 +649,13 @@ bool InferShapeForSingleOperation(Operation* op, Dialect* tf_dialect, tensorflow::OpRegistry::Global()->LookUp(node_name.data()); if (!op_reg_data) { LLVM_DEBUG(llvm::dbgs() << "Skipping inference for unregistered op '" - << op->getName() << "'.\n";); + << op->getName() << "'.\n"); return false; } if (op_reg_data->shape_inference_fn == nullptr) { LLVM_DEBUG(llvm::dbgs() - << "Skipping inference for op without shape function '" - << op->getName() << "'.\n";); + << "Skipping inference for op without shape function '" + << op->getName() << "'.\n"); return false; } @@ -391,9 +710,9 @@ bool InferShapeForSingleOperation(Operation* op, Dialect* tf_dialect, // Perform the shape inference using an InferenceContext with the input // shapes. This object is abstracting the information that the ShapeInference // function operates on. - tensorflow::shape_inference::InferenceContext c( - graph_version, *node_def, op_reg_data->op_def, input_shapes, - input_tensors, /*input_tensors_as_shapes=*/{}, handle_shapes_and_types); + InferenceContext c(graph_version_, *node_def, op_reg_data->op_def, + input_shapes, input_tensors, + /*input_tensors_as_shapes=*/{}, handle_shapes_and_types); auto status = c.Run(op_reg_data->shape_inference_fn); if (!status.ok()) { LLVM_DEBUG(llvm::dbgs() << "Shape inference error for '" << *op @@ -401,6 +720,43 @@ bool InferShapeForSingleOperation(Operation* op, Dialect* tf_dialect, return false; } + // Determine if, during shape computation, the shape functions attempted to + // query an input operand as shape where the input was not known/constant. + bool requires_inputs = + any_of(llvm::seq(0, c.num_inputs()), [&](int input) { + return c.requested_input_tensor_as_partial_shape(input) && + !input_tensors[input]; + }); + if (requires_inputs) { + std::vector input_tensors_as_shapes; + for (int input : llvm::seq(0, c.num_inputs())) { + if (c.requested_input_tensor_as_partial_shape(input) && + !input_tensors[input]) { + auto op_result = op->getOperand(input).dyn_cast(); + if (!op_result) continue; + // Resize on first valid shape computed. + input_tensors_as_shapes.resize(c.num_inputs()); + auto handle = ComputeOutputAsShape(op_result, &c); + LLVM_DEBUG(llvm::dbgs() << "Requested " << input << " as shape " + << (handle.Handle() ? "found" : "not found")); + if (handle.Handle()) input_tensors_as_shapes[input] = handle; + } + } + + // Attempt to compute the unknown operands as shapes. + // Note: in the case where no partial outputs could be computed, this would + // be empty. + if (!input_tensors_as_shapes.empty()) { + c.set_input_tensors_as_shapes(input_tensors_as_shapes); + auto status = c.Run(op_reg_data->shape_inference_fn); + if (!status.ok()) { + LLVM_DEBUG(llvm::dbgs() << "Shape inference error for '" << *op + << "': " << status.error_message() << "\n"); + return false; + } + } + } + assert(c.num_outputs() == op->getNumResults() && "inference context matches the MLIR number of results."); @@ -410,15 +766,14 @@ bool InferShapeForSingleOperation(Operation* op, Dialect* tf_dialect, for (int output : llvm::seq(0, c.num_outputs())) { // Skip already statically shaped results. 
Value result = op->getResult(output); - auto shaped_type = result.getType().dyn_cast(); - if (!shaped_type || shaped_type.hasStaticShape()) continue; + if (!CanBeRefined(result.getType())) continue; + auto shaped_type = result.getType().cast(); - tensorflow::shape_inference::ShapeHandle shape_handle = c.output(output); + ShapeHandle shape_handle = c.output(output); LLVM_DEBUG(llvm::dbgs() << "Inferred output " << output << " : " << c.DebugString(shape_handle) << "\n"); - auto get_tensor_type = - [&c](const tensorflow::shape_inference::ShapeHandle& sh, - Type element_type) -> TensorType { + auto get_tensor_type = [&c](const ShapeHandle& sh, + Type element_type) -> TensorType { if (!c.RankKnown(sh)) return UnrankedTensorType::get(element_type); // Convert the shape from TensorFlow (int64) to MLIR (int64_t). SmallVector shape; @@ -432,7 +787,7 @@ bool InferShapeForSingleOperation(Operation* op, Dialect* tf_dialect, new_element_type.isa()) { auto handle_shapes_types = c.output_handle_shapes_and_types(output); if (handle_shapes_types) { - llvm::SmallVector subtypes; + SmallVector subtypes; OpBuilder b(op); for (const auto& shape_n_type : *handle_shapes_types) { Type element_type; @@ -452,7 +807,7 @@ bool InferShapeForSingleOperation(Operation* op, Dialect* tf_dialect, if (result.getType() == new_type) continue; // Inserts a cast back to the original type if any user is not in the TF // dialect. - AddCastBackForUnsupportedNonTFUses(op, result, tf_dialect, + AddCastBackForUnsupportedNonTFUses(op, result, tf_dialect_, result.getType()); // Finally we inferred the shape and replace the type for this result. result.setType(new_type); @@ -464,31 +819,19 @@ bool InferShapeForSingleOperation(Operation* op, Dialect* tf_dialect, return changed; } -// Updates input types and refine shapes inside body of functions that are -// attached to ControlFlow ops (If/While). These functions include Then/Else -// branches of IfOp and Cond/Body functions of WhileOp. These functions share -// following common properties: -// 1) They are never reused, ie. having a single use in module. -// 2) Their input types match those of their parent ops (excluding inputs like -// predicate). -// Returns a boolean indicating whether any change has been applied. 
-LogicalResult RefineShapeForControlFlowFunc(FuncOp func, - llvm::ArrayRef input_types, - int64_t graph_version, - int64_t max_iteration) { +LogicalResult ShapeInference::RefineShapeForControlFlowFunc( + FuncOp func, ArrayRef input_types, int64_t max_iteration) { ModuleOp module = func.getParentOfType(); auto func_uses = SymbolTable::getSymbolUses(func, &module.getBodyRegion()); int num_uses = std::distance(func_uses->begin(), func_uses->end()); if (num_uses != 1) { - func.emitWarning(llvm::formatv( + func.emitWarning(formatv( "expected control flow function {0} to have exactly 1 use, found {1}.", func.getName(), num_uses)); return failure(); } FunctionType func_type = func.getType(); - if (input_types == func_type.getInputs()) return success(); - func.setType(FunctionType::get(input_types, func_type.getResults(), func.getContext())); @@ -496,8 +839,7 @@ LogicalResult RefineShapeForControlFlowFunc(FuncOp func, arg_and_idx.value().setType(input_types[arg_and_idx.index()]); } - auto res = - InferShapeUntilFixPoint(&func.getBody(), graph_version, max_iteration); + auto res = InferShapeUntilFixPoint(&func.getBody(), max_iteration); if (failed(res)) return res; auto new_return_types = InferShapeForFunctionReturnType(func); @@ -509,41 +851,85 @@ LogicalResult RefineShapeForControlFlowFunc(FuncOp func, return success(); } -LogicalResult PropagateShapeToFunctions( +LogicalResult ShapeInference::PropagateShapeToFunctions( ModuleOp module, Operation::operand_type_range input_types, - llvm::ArrayRef func_names, int64_t graph_version, - int64_t max_iteration) { - bool success = true; + ArrayRef func_names, int64_t max_iteration) { + bool all_succeeded = true; auto types = llvm::to_vector<4>(input_types); for (auto func_name : func_names) { FuncOp func = module.lookupSymbol(func_name); - if (failed(RefineShapeForControlFlowFunc(func, types, graph_version, - max_iteration))) { - success = false; - } + all_succeeded = + succeeded(RefineShapeForControlFlowFunc(func, types, max_iteration)) && + all_succeeded; } - return mlir::success(success); + return success(all_succeeded); } -LogicalResult PropagateShapeIntoAttachedFunctions(Operation* op, - int64_t graph_version, - int64_t max_iteration) { +// If the callee has only one use, propagates any constant operand of call_op to +// the called function body's corresponding argument. +// +// TODO(b/154065712): Move this to a more general inter-procedural constant +// folding pass. +void PropagateConstantToCallee(CallOpInterface call_op, + SymbolRefAttr callee_sym, ModuleOp module) { + auto func = module.lookupSymbol(callee_sym.getRootReference()); + auto func_uses = SymbolTable::getSymbolUses(func, &module.getBodyRegion()); + int num_uses = std::distance(func_uses->begin(), func_uses->end()); + OpBuilder builder(&func.front().front()); + Operation* op = call_op.getOperation(); + if (num_uses == 1) { + // If this is the only caller, and an operand is a constant, propagate + // the constant inside the function. + for (auto arg : func.getArguments()) { + auto operand = op->getOperand(arg.getArgNumber()).getDefiningOp(); + if (isa_and_nonnull(operand)) { + arg.replaceAllUsesWith(builder.clone(*operand)->getResult(0)); + } + } + } +} + +// Propagates any constant return value of the callee function to the call op's +// corresponding result. 
+void PropagateConstantFromCallee(CallOpInterface call_op, + SymbolRefAttr callee_sym, ModuleOp module) { + auto func = module.lookupSymbol(callee_sym.getRootReference()); + // If the return value is a constant, replace the call result with a constant. + Operation* op = call_op.getOperation(); + OpBuilder builder(op); + builder.setInsertionPointAfter(op); + for (auto retval : + llvm::enumerate(func.front().getTerminator()->getOperands())) { + auto retval_op = retval.value().getDefiningOp(); + if (isa_and_nonnull(retval_op)) { + op->getResult(retval.index()) + .replaceAllUsesWith(builder.clone(*retval_op)->getResult(0)); + } + } +} + +LogicalResult ShapeInference::PropagateShapeIntoAttachedFunctions( + Operation* op, int64_t max_iteration) { ModuleOp module = op->getParentOfType(); if (auto if_op = dyn_cast(op)) { return PropagateShapeToFunctions( - module, llvm::drop_begin(if_op.getOperandTypes(), 1), - {if_op.then_branch(), if_op.else_branch()}, graph_version, - max_iteration); + module, drop_begin(if_op.getOperandTypes(), 1), + {if_op.then_branch(), if_op.else_branch()}, max_iteration); } else if (auto while_op = dyn_cast(op)) { return PropagateShapeToFunctions(module, while_op.getOperandTypes(), {while_op.cond(), while_op.body()}, - graph_version, max_iteration); + max_iteration); } else if (auto call_op = dyn_cast(op)) { CallInterfaceCallable callable = call_op.getCallableForCallee(); if (SymbolRefAttr sym = callable.dyn_cast()) { - return PropagateShapeToFunctions( - module, call_op.getArgOperands().getTypes(), {sym.getRootReference()}, - graph_version, max_iteration); + PropagateConstantToCallee(call_op, sym, module); + if (failed(PropagateShapeToFunctions( + module, call_op.getArgOperands().getTypes(), + {sym.getRootReference()}, max_iteration))) { + return failure(); + } + PropagateConstantFromCallee(call_op, sym, module); + return success(); } } @@ -552,13 +938,10 @@ LogicalResult PropagateShapeIntoAttachedFunctions(Operation* op, return success(); } -LogicalResult InferShapeUntilFixPoint(Region* region, int64_t graph_version, - int64_t max_iteration) { - MLIRContext* ctx = region->getContext(); - Dialect* tf_dialect = ctx->getRegisteredDialect(); - - // An operation folder that is used to attempt folding before inference. - OperationFolder folder(ctx); +LogicalResult ShapeInference::InferShapeUntilFixPoint(Region* region, + int64_t max_iteration) { + // An operation folder that is used to attempt folding before inference._ + OperationFolder folder(context_); bool changed = true; // TODO(aminim): we could have a more efficient traversal by guiding the @@ -570,8 +953,15 @@ LogicalResult InferShapeUntilFixPoint(Region* region, int64_t graph_version, LLVM_DEBUG(llvm::dbgs() << "Shape inference, iteration " << iteration << "\n"); region->walk([&](Operation* op) { - if (op->getDialect() != tf_dialect) { - changed |= InferShapeForNonTFDialectOperation(op, tf_dialect); + if (auto infer_ti = dyn_cast(op)) { + changed |= RefineWithInferTypeOpInterface(infer_ti, tf_dialect_); + // TODO(jpienaar): Debug why we can't just return here. We end up with + // additional constant due to the propagation of constant into attached + // function if we return already. + } + + if (op->getDialect() != tf_dialect_) { + changed |= InferShapeForNonTFDialectOperation(op, tf_dialect_); return; } @@ -580,13 +970,12 @@ LogicalResult InferShapeUntilFixPoint(Region* region, int64_t graph_version, // Best-effort shape inference in attached functions. Do not return // failure even if it doesn't get to fixed point. 
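A simplified model (not the MLIR code; `BindConstantArguments` is a hypothetical name) of the single-use guard in `PropagateConstantToCallee` above: only when the callee has exactly one caller is it safe to specialize its body, so each argument whose call operand is a known constant gets bound to that constant.

#include <map>
#include <vector>

#include "llvm/ADT/Optional.h"

// Map argument index -> constant value, but only for a single-use callee.
std::map<int, int> BindConstantArguments(
    const std::vector<llvm::Optional<int>>& constant_operands, int num_uses) {
  std::map<int, int> argument_to_constant;
  if (num_uses != 1) return argument_to_constant;
  for (int i = 0, e = static_cast<int>(constant_operands.size()); i != e; ++i)
    if (constant_operands[i]) argument_to_constant[i] = *constant_operands[i];
  return argument_to_constant;
}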
- if (failed(PropagateShapeIntoAttachedFunctions(op, graph_version, - max_iteration))) { + if (failed(PropagateShapeIntoAttachedFunctions(op, max_iteration))) { op->emitWarning() << "unable to refine shape of attached function " "arguments and bodies"; } - changed |= InferShapeForSingleOperation(op, tf_dialect, graph_version); + changed |= InferShapeForSingleOperation(op); }); } @@ -601,31 +990,43 @@ LogicalResult InferShapeUntilFixPoint(Region* region, int64_t graph_version, LogicalResult InferShapeForFunction(FuncOp func, ArrayRef> arg_shapes, int64_t graph_version) { - mlir::FunctionType func_type = func.getType(); + ShapeInference context(graph_version, func.getContext()); + if (arg_shapes.empty()) { + if (failed(context.InferShapeUntilFixPoint(&func.getBody()))) + return failure(); + // TODO(b/156276510): Verify that it is always fine to refine a function's + // return type, as long as we do not change the argument shapes. + if (auto return_types = InferShapeForFunctionReturnType(func)) { + func.setType(FunctionType::get(func.getType().getInputs(), + return_types.getValue(), + func.getContext())); + } + + return success(); + } + FunctionType func_type = func.getType(); bool needs_refinement = false; - llvm::SmallVector new_arg_types; + SmallVector new_arg_types; new_arg_types.reserve(func_type.getNumInputs()); // Update argument types in-place using the provided arg_shapes. for (size_t i = 0; i < func_type.getNumInputs(); ++i) { ArrayRef shape = arg_shapes[i]; - mlir::Type element_type; - if (auto input_ty = - func_type.getInput(i).dyn_cast()) { + Type element_type; + if (auto input_ty = func_type.getInput(i).dyn_cast()) { if (!input_ty || input_ty.getShape().size() != shape.size()) { return failure(); } element_type = input_ty.getElementType(); } else { - auto unranked_input_ty = - func_type.getInput(i).dyn_cast(); + auto unranked_input_ty = func_type.getInput(i).dyn_cast(); if (!unranked_input_ty) { return failure(); } element_type = unranked_input_ty.getElementType(); } - auto new_arg_type = mlir::RankedTensorType::get(shape, element_type); + auto new_arg_type = RankedTensorType::get(shape, element_type); if (new_arg_type != func_type.getInput(i)) { // If the new type is more detailed, trigger shape inference. func.getArgument(i).setType(new_arg_type); @@ -638,28 +1039,17 @@ LogicalResult InferShapeForFunction(FuncOp func, return success(); } - mlir::LogicalResult result = - mlir::TF::InferShapeUntilFixPoint(&func.getBody(), graph_version); + LogicalResult result = context.InferShapeUntilFixPoint(&func.getBody()); if (failed(result)) { return failure(); } auto return_types = InferShapeForFunctionReturnType(func); - func.setType(mlir::FunctionType::get(new_arg_types, - return_types.hasValue() - ? return_types.getValue() - : func.getType().getResults(), - func.getContext())); - - return success(); -} - -LogicalResult InferShapeForFunctionType(FuncOp func) { - if (auto return_types = InferShapeForFunctionReturnType(func)) { - func.setType(mlir::FunctionType::get(func.getType().getInputs(), - return_types.getValue(), - func.getContext())); - } + func.setType(FunctionType::get(new_arg_types, + return_types.hasValue() + ? 
return_types.getValue() + : func.getType().getResults(), + func.getContext())); return success(); } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.h b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.h index 0524ec678ed..e36d8d56d6d 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.h +++ b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.h @@ -27,30 +27,13 @@ namespace mlir { namespace TF { -// Performs shape inference on the provided op and return true if the type of -// at least one result has been changed. -// A tf.Cast() is inserted for any uses that isn't in the TensorFlow dialect. -// `graph_version` indicates the current GraphDef compatibility versions -// (the versions field in graph.proto). -bool InferShapeForSingleOperation(Operation* op, Dialect* tf_dialect, - int64_t graph_version); - -// Infers shape on the provided region, including nested ones, iterate until fix -// point with a limit of max_iteration. Returns success if fix point is reached -// before max_iteration. -LogicalResult InferShapeUntilFixPoint(Region* region, int64_t graph_version, - int64_t max_iteration = 10); - // Given a list of refined shapes matching the function arguments of func, runs // shape inference over the function to propagate this updated information. +// If arg_shapes are empty, then argument shapes will be left unchanged. LogicalResult InferShapeForFunction(FuncOp func, ArrayRef> arg_shapes, int64_t graph_version); -// Refines the return type of the given function by folding tf.Cast that -// precedes the return instruction. -LogicalResult InferShapeForFunctionType(FuncOp func); - } // namespace TF } // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference_pass.cc index 48e4e77ce0f..acdfc0eb039 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference_pass.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference_pass.cc @@ -58,10 +58,8 @@ struct ShapeInference } int64_t producer = producer_or.ValueOrDie(); for (auto func : module.getOps()) { - InferShapeUntilFixPoint(&func.getBody(), producer); - // TODO(yuanzx): Verify that it is always fine to refine a function's - // return type, as long as we do not change the argument shapes. - InferShapeForFunctionType(func); + if (failed(InferShapeForFunction(func, /*arg_shapes=*/{}, producer))) + return signalPassFailure(); } } }; diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/sink_constant.cc b/tensorflow/compiler/mlir/tensorflow/transforms/sink_constant.cc index 0eafdea0964..e62df78ed11 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/sink_constant.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/sink_constant.cc @@ -41,15 +41,15 @@ using ::mlir::TF::ConstOp; class ExecutorConstantSinking : public mlir::PassWrapper { void runOnFunction() override { - getFunction().walk([](tf_device::LaunchOp launch) { - LLVM_DEBUG(llvm::dbgs() << "Visit " << *launch.getOperation() << "\n"); + getFunction().walk([](tf_device::ClusterOp cluster) { + LLVM_DEBUG(llvm::dbgs() << "Visit " << *cluster.getOperation() << "\n"); // For each launch op, we find the values used that come from a constant // defined above and sink these constants in the region body. // The sunk_constant map keeps a mapping from a ConstOp defined above to // a sunk clone of it. 
This allows for reusing a sunk constant with // multiple uses in the region. llvm::DenseMap sunk_constant; - Region &body = launch.body(); + Region &body = cluster.body(); visitUsedValuesDefinedAbove(body, [&](OpOperand *use) { Value constant = use->get(); auto const_op = dyn_cast_or_null(constant.getDefiningOp()); @@ -84,7 +84,7 @@ class ExecutorConstantSinking static mlir::PassRegistration pass( "tf-device-constant-sinking", - "Sink constants implicitly captured in a tf_device.launch region. This " + "Sink constants implicitly captured in a tf_device.cluster region. This " "reduces the number of arguments when outlining later."); } // anonymous namespace diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/stack_ops_decomposition.cc b/tensorflow/compiler/mlir/tensorflow/transforms/stack_ops_decomposition.cc index 55b22ad8625..c349c2b4c3e 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/stack_ops_decomposition.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/stack_ops_decomposition.cc @@ -154,14 +154,14 @@ struct PartitionedCallStackOpsInfo { LogicalResult DecomposeStackOpsInternal( Block*, ModuleOp, llvm::SmallDenseMap*, - llvm::SmallDenseMap*); + llvm::StringMap*); // Handles stack usage by a tf.While. It will convert the body and conditional // function signatures, and performs stack ops decomposition on them. LogicalResult HandleWhileOp( TF::WhileOp while_op, ModuleOp module, const llvm::SmallDenseMap& data_var_to_size_var, - llvm::SmallDenseMap* + llvm::StringMap* decomposed_partitioned_call_callees) { auto body = module.lookupSymbol(while_op.body()); llvm::SmallDenseMap body_map; @@ -207,9 +207,8 @@ LogicalResult HandleWhileOp( new_while_operands.push_back(it->getSecond()); if (!new_output_shapes.empty()) { // Size is a scalar shape. - tensorflow::TensorShapeProto shape_proto; - new_output_shapes.push_back(builder.getStringAttr( - tensorflow::mangling_util::MangleShape(shape_proto))); + new_output_shapes.push_back( + mlir::TF::ShapeAttr::get(builder.getContext(), ArrayRef())); } } auto new_while = @@ -238,7 +237,7 @@ LogicalResult HandleWhileOp( LogicalResult HandleIfOp( TF::IfOp if_op, ModuleOp module, const llvm::SmallDenseMap& data_var_to_size_var, - llvm::SmallDenseMap* + llvm::StringMap* decomposed_partitioned_call_callees) { auto then_branch = module.lookupSymbol(if_op.then_branch()); auto else_branch = module.lookupSymbol(if_op.else_branch()); @@ -295,11 +294,11 @@ template LogicalResult HandlePartitionedCallOp( CallOp call, FuncOp callee, ModuleOp module, const llvm::SmallDenseMap& data_var_to_size_var, - llvm::SmallDenseMap* + llvm::StringMap* decomposed_partitioned_call_callees) { auto emplace_res = decomposed_partitioned_call_callees->try_emplace( - callee, PartitionedCallStackOpsInfo()); - auto& info = emplace_res.first->getSecond(); + callee.getName(), PartitionedCallStackOpsInfo()); + auto& info = emplace_res.first->second; // Recreate the call op with info. auto recreate_caller = [&] { auto new_operands = llvm::to_vector<8>(call.getOperands()); @@ -343,39 +342,38 @@ LogicalResult HandlePartitionedCallOp( return recreate_caller(); } llvm::SmallDenseMap callee_map; - auto callee_clone = callee.clone(); + FuncOp lowered_callee = callee; + if (callee.getVisibility() != SymbolTable::Visibility::Private) { + // Clone non-private callee in case of signature change. 
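Small sketch of the `llvm::StringMap` pattern the decomposition passes switch to above (standalone C++; `CalleeInfo` and `GetOrCreateInfo` are hypothetical stand-ins for the per-callee info structs): `try_emplace` keys the info by function name and hands back the existing entry when the callee has already been processed.

#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"

struct CalleeInfo {
  bool signature_change = false;
};

// Insert-or-lookup keyed by callee name, mirroring the try_emplace call above.
CalleeInfo& GetOrCreateInfo(llvm::StringMap<CalleeInfo>& callees,
                            llvm::StringRef callee_name) {
  auto emplace_result = callees.try_emplace(callee_name, CalleeInfo());
  // emplace_result.second reports whether a new entry was actually inserted.
  return emplace_result.first->second;
}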
+ lowered_callee = callee.clone(); + lowered_callee.setVisibility(SymbolTable::Visibility::Private); + } auto find_arg_stack_type = [&](int64_t index) -> llvm::Optional { auto it = data_var_to_size_var.find(call.getOperand(index)); if (it == data_var_to_size_var.end()) return llvm::None; return it->getFirst().getType(); }; - ModifyFunctionSignature(callee_clone, &callee_map, find_arg_stack_type); - if (callee_map.empty()) { + ModifyFunctionSignature(lowered_callee, &callee_map, find_arg_stack_type); + info.signature_change = !callee_map.empty(); + if (!info.signature_change) { // Signature is not modified. We do not need the clone. - info.signature_change = false; - callee_clone.erase(); + if (lowered_callee != callee) { + lowered_callee.erase(); + } } else { - info.signature_change = true; - info.decomposed_callee = callee_clone; + info.decomposed_callee = lowered_callee; for (auto& entry : callee_map) { info.stack_var_arg_to_size_arg [entry.getFirst().cast().getArgNumber()] = entry.getSecond().cast().getArgNumber(); } - // Add the clone with a new name. - auto name_base = llvm::join( - std::vector{callee.getName().str(), "stack_decomposed"}, - "_"); - auto name = name_base; - { - int64_t counter = 0; - while (module.lookupSymbol(name)) { - name = llvm::formatv("{0}_{1}", name_base, counter++).str(); - } + if (lowered_callee != callee) { + // Add the clone with a new name. + lowered_callee.setName( + llvm::formatv("{0}_stack_decomposed", callee.getName()).str()); + SymbolTable(module).insert(lowered_callee); + callee = lowered_callee; } - callee_clone.setName(name); - SymbolTable(module).insert(callee_clone); - callee = callee_clone; } if (failed(DecomposeStackOpsInternal(&callee.front(), module, &callee_map, decomposed_partitioned_call_callees))) { @@ -487,7 +485,7 @@ LogicalResult HandleStackPopV2Op( LogicalResult DecomposeStackOpsInternal( Block* block, ModuleOp module, llvm::SmallDenseMap* data_var_to_size_var, - llvm::SmallDenseMap* + llvm::StringMap* decomposed_partitioned_call_callees) { for (auto& op : llvm::make_early_inc_range(block->getOperations())) { if (llvm::isa(&op) || llvm::isa(&op)) { @@ -545,7 +543,7 @@ LogicalResult DecomposeStackOpsInternal( LogicalResult DecomposeStackOps(Block* block, ModuleOp module) { llvm::SmallDenseMap data_var_to_size_var; - llvm::SmallDenseMap + llvm::StringMap decomposed_partitioned_call_callees; return DecomposeStackOpsInternal(block, module, &data_var_to_size_var, &decomposed_partitioned_call_callees); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tensor_array_ops_decomposition.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tensor_array_ops_decomposition.cc index 8e0c34a8c83..cfeb2b1f031 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tensor_array_ops_decomposition.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tensor_array_ops_decomposition.cc @@ -105,17 +105,12 @@ LogicalResult GetSplitElementTypeAndCount(TF::TensorArraySplitV3Op split, // Tries to infer the tensor array element shape. llvm::Optional> GetTensorArrayElementShape( TF::TensorArrayV3Op ta, ModuleOp module) { - tensorflow::TensorShapeProto element_shape; - if (tensorflow::mangling_util::DemangleShape(ta.element_shape().str(), - &element_shape) - .ok()) { - tensorflow::PartialTensorShape shape(element_shape); - if (shape.IsFullyDefined()) { - // Convert int64 to int64_. 
- auto int64_dims = shape.dim_sizes(); - llvm::SmallVector dims(int64_dims.begin(), int64_dims.end()); - return dims; - } + auto element_shape = ta.element_shapeAttr().cast(); + if (element_shape.hasStaticShape()) { + auto shape = element_shape.getShape(); + // Convert int64 to int64_. + llvm::SmallVector dims(shape.begin(), shape.end()); + return dims; } bool has_failure = false; @@ -531,13 +526,12 @@ void ChangeFunctionInputSignature( LogicalResult DecomposeTensorArrayOps( Block*, ModuleOp, llvm::SmallDenseMap*, - llvm::SmallDenseMap*); + llvm::StringMap*); -LogicalResult HandleWhileOp( - TF::WhileOp while_op, ModuleOp module, - llvm::SmallDenseMap* stats, - llvm::SmallDenseMap* - decomposed_partitioned_call_callees) { +LogicalResult HandleWhileOp(TF::WhileOp while_op, ModuleOp module, + llvm::SmallDenseMap* stats, + llvm::StringMap* + decomposed_partitioned_call_callees) { auto body = module.lookupSymbol(while_op.body()); auto cond = module.lookupSymbol(while_op.cond()); auto grads = AccessedGradients({body, cond}, module); @@ -619,11 +613,10 @@ LogicalResult HandleWhileOp( return success(); } -LogicalResult HandleIfOp( - TF::IfOp if_op, ModuleOp module, - llvm::SmallDenseMap* stats, - llvm::SmallDenseMap* - decomposed_partitioned_call_callees) { +LogicalResult HandleIfOp(TF::IfOp if_op, ModuleOp module, + llvm::SmallDenseMap* stats, + llvm::StringMap* + decomposed_partitioned_call_callees) { auto then_branch = module.lookupSymbol(if_op.then_branch()); auto else_branch = module.lookupSymbol(if_op.else_branch()); auto grads = AccessedGradients({then_branch, else_branch}, module); @@ -706,11 +699,11 @@ template LogicalResult HandlePartitionedCallOp( CallOp call, FuncOp callee, ModuleOp module, llvm::SmallDenseMap* stats, - llvm::SmallDenseMap* + llvm::StringMap* decomposed_partitioned_call_callees) { auto emplace_res = decomposed_partitioned_call_callees->try_emplace( - callee, PartitionedCallTensorArrayOpsInfo()); - auto& info = emplace_res.first->getSecond(); + callee.getName(), PartitionedCallTensorArrayOpsInfo()); + auto& info = emplace_res.first->second; // Recreates the call op with info. auto recreate_caller = [&]() -> LogicalResult { auto new_operands = llvm::to_vector<8>(call.getOperands()); @@ -752,7 +745,7 @@ LogicalResult HandlePartitionedCallOp( if (!info.signature_change) return success(); return recreate_caller(); } - // Rewrite the callee on a cloned function. + // Rewrite the callee. info.signature_change = false; auto ta_arg_buffer_type = [&](int64_t index) -> Type { auto it = stats->find(call.getOperand(index)); @@ -765,45 +758,46 @@ LogicalResult HandlePartitionedCallOp( if (it == stats->end()) return false; return it->getSecond().accumulate_on_write; }; - auto callee_clone = callee.clone(); - callee_clone.setVisibility(SymbolTable::Visibility::Private); - auto grads = AccessedGradients({callee_clone}, module); - for (int64_t i = 0; i < callee_clone.getNumArguments(); ++i) { + FuncOp lowered_callee = callee; + if (callee.getVisibility() != SymbolTable::Visibility::Private) { + // Clone non-private callee in case of signature change. 
+ lowered_callee = callee.clone(); + lowered_callee.setVisibility(SymbolTable::Visibility::Private); + } + auto grads = AccessedGradients({lowered_callee}, module); + for (int64_t i = 0; i < lowered_callee.getNumArguments(); ++i) { auto it = grads.find(i); if (it == grads.end()) continue; info.arg_grads.emplace_back(i, it->getSecond()); } llvm::SmallDenseMap callee_stats; - ChangeFunctionInputSignature(callee_clone, grads, ta_arg_buffer_type, + ChangeFunctionInputSignature(lowered_callee, grads, ta_arg_buffer_type, ta_accumulate_on_write, &callee_stats); - if (failed(DecomposeTensorArrayOps(&callee_clone.front(), module, + if (failed(DecomposeTensorArrayOps(&lowered_callee.front(), module, &callee_stats, decomposed_partitioned_call_callees))) { return failure(); } for (int64_t i = 0; i < call.getNumResults(); ++i) { - auto ret = callee_clone.front().getTerminator()->getOperand(i); + auto ret = lowered_callee.front().getTerminator()->getOperand(i); if (!getElementTypeOrSelf(ret.getType()).isa()) continue; auto arg = ret.dyn_cast(); if (!arg) continue; info.ret_forward_input.emplace_back(i, arg.getArgNumber()); } - if (!info.signature_change) { - // Signature is not modified. We do not need to keep two copies. - info.signature_change = false; - auto name = callee.getName(); - callee.erase(); - callee_clone.setName(name); - SymbolTable(module).insert(callee_clone); - } else { - info.decomposed_callee = callee_clone; - // Add the clone with a new name. - auto name = - llvm::formatv("{0}_{1}", callee.getName(), "tensorarray_decomposed") - .str(); - callee_clone.setName(name); - SymbolTable(module).insert(callee_clone); + info.decomposed_callee = lowered_callee; + if (lowered_callee != callee) { + if (!info.signature_change) { + // Signature is not modified. We do not need to keep two copies. + lowered_callee.setName(callee.getName()); + callee.erase(); + } else { + // Add the clone with a new name. + lowered_callee.setName( + llvm::formatv("{0}_tensorarray_decomposed", callee.getName()).str()); + } + SymbolTable(module).insert(lowered_callee); } if (info.signature_change) return recreate_caller(); return success(); @@ -812,7 +806,7 @@ LogicalResult HandlePartitionedCallOp( LogicalResult DecomposeTensorArrayOps( Block* block, ModuleOp module, llvm::SmallDenseMap* stats, - llvm::SmallDenseMap* + llvm::StringMap* decomposed_partitioned_call_callees) { for (auto& op : llvm::make_early_inc_range(block->getOperations())) { if (llvm::isa(&op) || llvm::isa(&op)) { @@ -880,7 +874,7 @@ void TensorArrayOpsDecompositionPass::runOnOperation() { auto main = module.lookupSymbol("main"); if (!main) return; llvm::SmallDenseMap stats; - llvm::SmallDenseMap + llvm::StringMap decomposed_partitioned_call_callees; if (failed(DecomposeTensorArrayOps(&main.front(), module, &stats, &decomposed_partitioned_call_callees))) { diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tensor_list_ops_decomposition.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tensor_list_ops_decomposition.cc index 962e82df8a9..6e27823191b 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tensor_list_ops_decomposition.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tensor_list_ops_decomposition.cc @@ -16,6 +16,7 @@ limitations under the License. 
#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringMap.h" #include "llvm/Support/Casting.h" #include "llvm/Support/FormatVariadic.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project @@ -122,7 +123,7 @@ struct PartitionedCallDecompositionInfo { LogicalResult DecomposeTensorListOpsInternal( Block*, ModuleOp, llvm::SmallDenseMap*, - llvm::SmallDenseMap*); + llvm::StringMap*); // Adds the corresponding sizes of tensor list buffers in func's return values // to the list of return values. Returns the mapping from the buffer indices to @@ -151,7 +152,7 @@ AddTensorListSizesToReturn( LogicalResult HandleWhileOp( TF::WhileOp while_op, ModuleOp module, llvm::SmallDenseMap* buffer_to_size, - llvm::SmallDenseMap* + llvm::StringMap* decomposed_partitioned_call_callees) { // Rewrite body. auto body = module.lookupSymbol(while_op.body()); @@ -197,9 +198,8 @@ LogicalResult HandleWhileOp( new_while_operands.push_back(it->getSecond().size); if (!new_output_shapes.empty()) { // Size is a scalar shape. - tensorflow::TensorShapeProto shape_proto; - new_output_shapes.push_back(builder.getStringAttr( - tensorflow::mangling_util::MangleShape(shape_proto))); + new_output_shapes.push_back( + mlir::TF::ShapeAttr::get(builder.getContext(), ArrayRef())); } } auto new_while = @@ -216,11 +216,10 @@ LogicalResult HandleWhileOp( return success(); } -LogicalResult HandleIfOp( - TF::IfOp if_op, ModuleOp module, - llvm::SmallDenseMap* buffer_to_size, - llvm::SmallDenseMap* - decomposed_partitioned_call_callees) { +LogicalResult HandleIfOp(TF::IfOp if_op, ModuleOp module, + llvm::SmallDenseMap* buffer_to_size, + llvm::StringMap* + decomposed_partitioned_call_callees) { // Rewrite the branches. auto then_branch = module.lookupSymbol(if_op.then_branch()); auto else_branch = module.lookupSymbol(if_op.else_branch()); @@ -285,11 +284,11 @@ template LogicalResult HandlePartitionedCallOp( CallOp call, FuncOp callee, ModuleOp module, llvm::SmallDenseMap* buffer_to_size, - llvm::SmallDenseMap* + llvm::StringMap* decomposed_partitioned_call_callees) { auto emplace_res = decomposed_partitioned_call_callees->try_emplace( - callee, PartitionedCallDecompositionInfo()); - auto& info = emplace_res.first->getSecond(); + callee.getName(), PartitionedCallDecompositionInfo()); + auto& info = emplace_res.first->second; // Recreates the call op with info. auto recreate_caller = [&] { auto new_operands = llvm::to_vector<8>(call.getOperands()); @@ -325,10 +324,14 @@ LogicalResult HandlePartitionedCallOp( if (!info.signature_change) return success(); return recreate_caller(); } - // Rewrite the callee on a cloned function. + // Rewrite the callee. llvm::SmallDenseMap callee_map; - auto callee_clone = callee.clone(); - callee_clone.setVisibility(SymbolTable::Visibility::Private); + FuncOp lowered_callee = callee; + if (callee.getVisibility() != SymbolTable::Visibility::Private) { + // Clone non-private callee in case of signature change. 
+ lowered_callee = callee.clone(); + lowered_callee.setVisibility(SymbolTable::Visibility::Private); + } auto find_arg_buffer_type = [&](int64_t index) -> llvm::Optional { auto it = buffer_to_size->find(call.getOperand(index)); if (it == buffer_to_size->end()) return llvm::None; @@ -337,41 +340,41 @@ LogicalResult HandlePartitionedCallOp( auto arg_buffer_size_is_fixed = [&](int64_t index) { return (*buffer_to_size)[call.getOperand(index)].fixed; }; - ModifyFunctionSignature(callee_clone, cutil::GetSizeType(OpBuilder(call)), + ModifyFunctionSignature(lowered_callee, cutil::GetSizeType(OpBuilder(call)), &callee_map, find_arg_buffer_type, arg_buffer_size_is_fixed); - const bool args_no_changed = callee.empty(); + const bool args_no_changed = callee_map.empty(); if (failed(DecomposeTensorListOpsInternal( - &callee_clone.front(), module, &callee_map, + &lowered_callee.front(), module, &callee_map, decomposed_partitioned_call_callees))) { return failure(); } info.buffer_ret_to_size_ret = - AddTensorListSizesToReturn(callee_clone, callee_map); + AddTensorListSizesToReturn(lowered_callee, callee_map); + info.decomposed_callee = lowered_callee; if (args_no_changed && info.buffer_ret_to_size_ret.empty()) { // Signature is not modified. We do not need to keep two copies. info.signature_change = false; - auto name = callee.getName(); - callee.erase(); - callee_clone.setName(name); - SymbolTable(module).insert(callee_clone); + if (lowered_callee != callee) { + lowered_callee.setName(callee.getName()); + callee.erase(); + SymbolTable(module).insert(lowered_callee); + } } else { info.signature_change = true; - info.decomposed_callee = callee_clone; for (auto& entry : callee_map) { auto buffer_arg = entry.getFirst().dyn_cast(); if (!buffer_arg) continue; info.buffer_arg_to_size_arg[buffer_arg.getArgNumber()] = entry.getSecond().size.cast().getArgNumber(); } - - // Add the clone with a new name. - auto name = llvm::join(std::vector{callee.getName().str(), - "tensorlist_decomposed"}, - "_"); - callee_clone.setName(name); - SymbolTable(module).insert(callee_clone); - callee = callee_clone; + if (lowered_callee != callee) { + // Add the clone with a new name. 
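+      // The "<name>_tensorlist_decomposed" suffix is only a readability hint;
+      // SymbolTable::insert below renames the clone again if the chosen name
+      // already exists in the module, so collisions are handled automatically.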
+ lowered_callee.setName( + llvm::formatv("{0}_tensorlist_decomposed", callee.getName()).str()); + SymbolTable(module).insert(lowered_callee); + callee = lowered_callee; + } } if (info.signature_change) return recreate_caller(); return success(); @@ -541,7 +544,8 @@ LogicalResult HandleTensorListSetItemOp( auto new_buffer = cutil::SetElement(index, buffer, set_item.item(), builder, set_item.getLoc()); set_item.output_handle().replaceAllUsesWith(new_buffer); - (*buffer_to_size)[new_buffer] = it->getSecond(); + auto size = it->getSecond(); + (*buffer_to_size)[new_buffer] = size; set_item.erase(); return success(); } @@ -607,10 +611,37 @@ LogicalResult HandleTensorListGatherOp( return success(); } +LogicalResult HandleTensorListScatterIntoExistingListOp( + TF::TensorListScatterIntoExistingListOp scatter, + llvm::SmallDenseMap* buffer_to_size) { + auto it = buffer_to_size->find(scatter.input_handle()); + if (it == buffer_to_size->end()) { + return scatter.emitOpError("unknown tensor list"); + } + auto buffer = scatter.input_handle(); + OpBuilder builder(scatter); + auto indices_type = scatter.indices().getType().cast(); + if (!indices_type) return scatter.emitOpError("unranked indices shape"); + auto shape_type = RankedTensorType::get({2}, builder.getIntegerType(32)); + auto shape = builder.create( + scatter.getLoc(), + DenseElementsAttr::get( + shape_type, {static_cast(indices_type.getDimSize(0)), 1})); + auto indices = + builder.create(scatter.getLoc(), scatter.indices(), shape); + Value tensor_scatter_update = builder.create( + scatter.getLoc(), buffer, indices, scatter.tensor()); + scatter.output_handle().replaceAllUsesWith(tensor_scatter_update); + scatter.erase(); + auto size = it->getSecond(); + (*buffer_to_size)[tensor_scatter_update] = size; + return success(); +} + LogicalResult DecomposeTensorListOpsInternal( Block* block, ModuleOp module, llvm::SmallDenseMap* buffer_to_size, - llvm::SmallDenseMap* + llvm::StringMap* decomposed_partitioned_call_callees) { for (auto& op : llvm::make_early_inc_range(block->getOperations())) { // TODO(yuanzx): Add a pass to remove identities in device computation. 
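// Sketch of the TensorListScatterIntoExistingList lowering added above
// (illustrative only, shapes assumed): a list buffer %buf : tensor<5x8xf32>
// updated with %items : tensor<2x8xf32> at %indices : tensor<2xi32> becomes
//   %shape   = "tf.Const"() {value = dense<[2, 1]> : tensor<2xi32>}
//   %ind2d   = "tf.Reshape"(%indices, %shape) : ... -> tensor<2x1xi32>
//   %new_buf = "tf.TensorScatterUpdate"(%buf, %ind2d, %items)
// and the size tensor tracked for %buf is carried over to %new_buf.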
@@ -661,16 +692,25 @@ LogicalResult DecomposeTensorListOpsInternal( if (failed(HandleTensorListGatherOp(gather, *buffer_to_size))) { return failure(); } + } else if (auto scatter = + llvm::dyn_cast( + &op)) { + if (failed(HandleTensorListScatterIntoExistingListOp(scatter, + buffer_to_size))) { + return failure(); + } } else if (auto addn = llvm::dyn_cast(&op)) { auto it = buffer_to_size->find(addn.getOperand(0)); if (it != buffer_to_size->end()) { addn.sum().setType(addn.getOperand(0).getType()); - (*buffer_to_size)[addn.sum()] = it->getSecond(); + auto size = it->getSecond(); + (*buffer_to_size)[addn.sum()] = size; } } else if (auto zeros = llvm::dyn_cast(&op)) { if (buffer_to_size->count(zeros.x()) > 0) { zeros.y().setType(zeros.x().getType()); - (*buffer_to_size)[zeros.y()] = (*buffer_to_size)[zeros.x()]; + auto size = (*buffer_to_size)[zeros.x()]; + (*buffer_to_size)[zeros.y()] = size; } } else if (auto while_op = llvm::dyn_cast(&op)) { if (failed(HandleWhileOp(while_op, module, buffer_to_size, @@ -707,7 +747,7 @@ LogicalResult DecomposeTensorListOpsInternal( LogicalResult DecomposeTensorListOps(Block* block, ModuleOp module) { llvm::SmallDenseMap buffer_to_size; - llvm::SmallDenseMap + llvm::StringMap decomposed_partitioned_call_callees; return DecomposeTensorListOpsInternal(block, module, &buffer_to_size, &decomposed_partitioned_call_callees); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tf_data_optimization.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tf_data_optimization.cc new file mode 100644 index 00000000000..786c4b74b34 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tf_data_optimization.cc @@ -0,0 +1,65 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/mlir/tensorflow/transforms/tf_data_optimization.h" + +#include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" + +namespace mlir { +namespace TF { + +namespace { + +struct FuseParallelMapAndBatch : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(BatchDatasetV2Op op, + PatternRewriter &rewriter) const override { + auto batchInputDataset = op.input_dataset(); + + ParallelMapDatasetOp batchInputOp = dyn_cast_or_null( + batchInputDataset.getDefiningOp()); + if (!batchInputOp) return failure(); + + // The type of the `num_parallel_calls` argument in ParallelMapDataset + // and MapAndBatchDataset is different (int32 and int64 respectively) + auto num_parallel_calls_op = rewriter.create( + op.getLoc(), UnrankedTensorType::get(rewriter.getIntegerType(64)), + batchInputOp.num_parallel_calls(), rewriter.getBoolAttr(false)); + + auto fused_op = rewriter.create( + op.getLoc(), op.getType(), batchInputOp.input_dataset(), + batchInputOp.other_arguments(), op.batch_size(), + num_parallel_calls_op.y(), op.drop_remainder(), batchInputOp.f(), + op.output_types(), op.output_shapes(), + batchInputOp.preserve_cardinality()); + rewriter.replaceOp(op, {fused_op.handle()}); + return failure(); + } +}; + +#include "tensorflow/compiler/mlir/tensorflow/transforms/generated_tf_data_optimization.inc" +} // namespace + +void PopulateTFDataOptimizationPatterns(MLIRContext *context, + OwningRewritePatternList *patterns) { + patterns->insert(context); + populateWithGenerated(context, patterns); +} + +} // namespace TF +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tf_data_optimization.h b/tensorflow/compiler/mlir/tensorflow/transforms/tf_data_optimization.h new file mode 100644 index 00000000000..ffbc06a9515 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tf_data_optimization.h @@ -0,0 +1,32 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_TF_DATA_OPTIMIZATION_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_TF_DATA_OPTIMIZATION_H_ + +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project + +namespace mlir { +namespace TF { + +// Populates patterns to perform optimizations specific to tf.data operations. 
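+// For example, the FuseParallelMapAndBatch pattern defined in the .cc file
+// rewrites (roughly, illustrative IR only):
+//   %m = "tf.ParallelMapDataset"(%in, ..., %n)   // %n : tensor<i32>
+//   %b = "tf.BatchDatasetV2"(%m, %batch, %drop)
+// into a single fused op, casting num_parallel_calls to i64 first:
+//   %n64 = "tf.Cast"(%n) : (tensor<i32>) -> tensor<i64>
+//   %mb  = "tf.MapAndBatchDataset"(%in, ..., %batch, %n64, %drop, ...)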
+void PopulateTFDataOptimizationPatterns(MLIRContext *context, + OwningRewritePatternList *patterns); + +} // namespace TF +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_TF_DATA_OPTIMIZATION_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tf_data_optimization.td b/tensorflow/compiler/mlir/tensorflow/transforms/tf_data_optimization.td new file mode 100644 index 00000000000..4b4239679b2 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tf_data_optimization.td @@ -0,0 +1,32 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +include "mlir/IR/OpBase.td" +include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td" + +// TODO(jpienaar): Move this somewhere general. +class GetI64ScalarElementsAttr : + NativeCodeCall<"DenseElementsAttr::get(RankedTensorType::get({}, $_builder.getIntegerType(64)), " # value # ")">; + +def FuseMapAndBatch : Pat< + (TF_BatchDatasetV2Op + (TF_MapDatasetOp $input_dataset, $other_arguments, $f, $output_types, + $output_shapes, $use_inter_op_parallelism, $preserve_cardinality), + $batch_size, $drop_remainder, $parallel_copy, $batch_output_types, + $batch_output_shapes), + (TF_MapAndBatchDatasetOp $input_dataset, $other_arguments, $batch_size, + (TF_ConstOp (GetI64ScalarElementsAttr<1>)), $drop_remainder, $f, + $batch_output_types, $batch_output_shapes, $preserve_cardinality)>; + diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tf_data_optimization_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tf_data_optimization_pass.cc new file mode 100644 index 00000000000..5be69bddb11 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tf_data_optimization_pass.cc @@ -0,0 +1,40 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/transforms/tf_data_optimization.h" + +namespace mlir { +namespace TF { +namespace { + +// Perform tf.data optimizations. 
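+// Registered below as "tf-data-optimization"; any tf-opt style tool that links
+// this registration can exercise it as, e.g.:
+//   tf-opt -tf-data-optimization input.mlir
+// (the binary name here is illustrative, not mandated by this file).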
+struct TFDataOptimization + : public PassWrapper { + void runOnFunction() override { + OwningRewritePatternList patterns; + mlir::TF::PopulateTFDataOptimizationPatterns(&getContext(), &patterns); + + applyPatternsAndFoldGreedily(getFunction(), patterns); + } +}; + +} // namespace +} // namespace TF +} // namespace mlir + +static mlir::PassRegistration pass( + "tf-data-optimization", "Performs tf.data optimizations"); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tf_graph_optimization_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tf_graph_optimization_pass.cc index 500b879e697..1e4caaf5dd6 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tf_graph_optimization_pass.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tf_graph_optimization_pass.cc @@ -23,10 +23,10 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.h" #include "tensorflow/compiler/mlir/tensorflow/translate/import_model.h" #include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h" +#include "tensorflow/core/common_runtime/graph_constructor.h" #include "tensorflow/core/common_runtime/optimization_registry.h" #include "tensorflow/core/framework/function.h" #include "tensorflow/core/graph/graph.h" -#include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/protobuf/graph_debug_info.pb.h" #include "tensorflow/core/public/session_options.h" diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_cluster_formation.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_cluster_formation.cc index 860d537c7ef..6ea6df38568 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_cluster_formation.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_cluster_formation.cc @@ -14,9 +14,9 @@ limitations under the License. ==============================================================================*/ // This transformation pass takes ops with the same `_tpu_replicate` attribute -// in a block and clusters them together under a `tf_device::LaunchOp`. +// in a block and clusters them together under a `tf_device.cluster`. // Associated TPUReplicateMetadata ops are removed and its attributes are copied -// over to the associated `tf_device::LaunchOp`. If a cluster should be +// over to the associated `tf_device.cluster`. If a cluster should be // replicated, the associated `tf_device::LaunchOp` will be wrapped further with // a `tf_device.replicate`. This pass also assumes ops of the same cluster do // not have ops outside of the cluster that are both operands and results of the @@ -65,7 +65,8 @@ constexpr char kBadTPUReplicateAttrMsg[] = "requires '_tpu_replicate' string attribute"; // Mapping for `_tpu_replicate` attribute to TPUReplicateMetadata attributes. -using MetadataMap = llvm::SmallDenseMap; +using MetadataMap = + llvm::SmallDenseMap; // Mapping for `_tpu_replicate` attribute to ops of a cluster. using ClusterMap = llvm::SmallDenseMapwalk([&](TF::TPUReplicateMetadataOp metadata_op) -> WalkResult { - NamedAttributeList attrs = metadata_op.getAttrs(); + MutableDictionaryAttr attrs = metadata_op.getAttrs(); // Missing or bad `_tpu_replicate` attribute. auto tpu_replicate_attr = attrs.get(kTPUReplicateAttr); @@ -178,7 +179,7 @@ llvm::SmallSetVector CollectClusterPrecedingUsers( // Collects results and associated types of the cluster that are used outside of // the cluster. 
These results and types are used to create the clusters -// `tf_device::LaunchOp` and associated terminator. Results that have no uses +// `tf_device.cluster` and associated terminator. Results that have no uses // outside of the cluster (i.e. results of ops in the cluster are only consumed // by other ops in the cluster) are pruned. llvm::SmallVector CollectClusterResults( @@ -200,40 +201,37 @@ llvm::SmallVector CollectClusterResults( return results; } -// Creates a `tf_device::LaunchOp` to wrap cluster ops. -tf_device::LaunchOp CreateLaunchOpForCluster(Operation* last_cluster_op, - llvm::ArrayRef results) { - // `tf_device::LaunchOp` will be placed at where the last op of the cluster - // is. +// Creates a `tf_device.cluster` to wrap cluster ops. +tf_device::ClusterOp CreateOpForCluster(Operation* last_cluster_op, + llvm::ArrayRef results) { + // `tf_device.cluster` will be placed at where the last op of the cluster is. OpBuilder builder(last_cluster_op); llvm::SmallVector result_types; for (Value result : results) result_types.push_back(result.getType()); - // An empty string placeholder is used for the device as that will be later - // populated with the device of the associated TPUReplicateMetadata op. - auto launch_op = builder.create( - last_cluster_op->getLoc(), builder.getStringAttr(""), result_types); + auto cluster = builder.create(last_cluster_op->getLoc(), + result_types); - launch_op.body().push_back(new Block); + cluster.body().push_back(new Block); // Add terminator. - builder.setInsertionPointToEnd(&launch_op.GetBody()); + builder.setInsertionPointToEnd(&cluster.GetBody()); builder.create(last_cluster_op->getLoc(), results); - return launch_op; + return cluster; } -// Moves cluster ops to associated `tf_device.LaunchOp` body. -void MoveClusterOpsToLaunchOp( - tf_device::LaunchOp launch_op, +// Moves cluster ops to associated `tf_device.cluster` body. +void MoveClusterOpsToCluster( + tf_device::ClusterOp cluster, const llvm::SmallSetVector& cluster_ops) { - MLIRContext* context = launch_op.getContext(); - Operation* terminator = &launch_op.GetBody().back(); + MLIRContext* context = cluster.getContext(); + Operation* terminator = cluster.GetBody().getTerminator(); for (Operation* cluster_op : cluster_ops) { // Remove `_tpu_replicate` and `device` attribute from ops in the cluster - // as that information will be present in the `tf_device.LaunchOp`. + // as that information will be present in the `tf_device.cluster`. cluster_op->removeAttr(Identifier::get(kTPUReplicateAttr, context)); cluster_op->removeAttr(Identifier::get(kDeviceAttr, context)); cluster_op->moveBefore(terminator); @@ -241,24 +239,24 @@ void MoveClusterOpsToLaunchOp( } // Replaces uses of cluster ops results outside of cluster with the associated -// `tf_device::LaunchOp` results. -void UpdateLaunchOpResultExternalUses(tf_device::LaunchOp launch_op, - llvm::ArrayRef results) { - Block& launch_op_block = launch_op.GetBody(); - for (auto ret_vals : llvm::zip(results, launch_op.getResults())) { +// `tf_device.cluster` results. 
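+// For example, if an op inside the cluster produced %0 and %0 was consumed by
+// an op after the cluster, that use is rewired to the corresponding
+// tf_device.cluster result; uses inside the cluster body are left alone.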
+void UpdateClusterResultExternalUses(tf_device::ClusterOp cluster, + llvm::ArrayRef results) { + Block& cluster_block = cluster.GetBody(); + for (auto ret_vals : llvm::zip(results, cluster.getResults())) { Value old_ret = std::get<0>(ret_vals); Value new_ret = std::get<1>(ret_vals); for (auto& use : llvm::make_early_inc_range(old_ret.getUses())) - if (!launch_op_block.findAncestorOpInBlock(*use.getOwner())) + if (!cluster_block.findAncestorOpInBlock(*use.getOwner())) use.set(new_ret); } } // Moves users of cluster that are before the cluster to after the cluster. -void MovePrecedingClusterUsers(tf_device::LaunchOp launch_op, +void MovePrecedingClusterUsers(tf_device::ClusterOp cluster, llvm::ArrayRef preceding_users) { - Operation* op_after_launch_op = launch_op.getOperation()->getNextNode(); - for (Operation* user : preceding_users) user->moveBefore(op_after_launch_op); + Operation* op_after_cluster = cluster.getOperation()->getNextNode(); + for (Operation* user : preceding_users) user->moveBefore(op_after_cluster); } // Sorts `tf.TPUReplicatedInput` ops by `index` attribute. Ops with an `index` @@ -296,19 +294,18 @@ LogicalResult SortTPUReplicatedInputsByIndex( // Creates a `tf_device.replicate` to represent replication for the cluster, if // necessary. -LogicalResult ReplicateCluster(tf_device::LaunchOp launch_op, - int num_replicas) { +LogicalResult ReplicateCluster(tf_device::ClusterOp cluster, int num_replicas) { // No need to replicate. if (num_replicas == 1) return success(); if (num_replicas < 1) - return launch_op.emitError() << "requires '" << kNumReplicasAttr - << "' int attribute to be at least 1"; + return cluster.emitError() << "requires '" << kNumReplicasAttr + << "' int attribute to be at least 1"; // Collect all used TPUReplicatedInput ops and sort by `index`. llvm::SmallSetVector unique_replicated_input_ops; mlir::visitUsedValuesDefinedAbove( - launch_op.body(), launch_op.body(), [&](mlir::OpOperand* operand) { + cluster.body(), cluster.body(), [&](mlir::OpOperand* operand) { Operation* def = operand->get().getDefiningOp(); if (def && llvm::isa(def)) unique_replicated_input_ops.insert(def); @@ -338,24 +335,24 @@ LogicalResult ReplicateCluster(tf_device::LaunchOp launch_op, } // Create replicate op. - OpBuilder builder(launch_op); + OpBuilder builder(cluster); auto replicate_op = builder.create( - launch_op.getLoc(), num_replicas, + cluster.getLoc(), num_replicas, llvm::SmallDenseMap>(), - replicated_inputs, launch_op.getResultTypes()); + replicated_inputs, cluster.getResultTypes()); if (!mirrored_variable_indices.empty()) replicate_op.setAttr(kMirroredVariableIndicesAttr, builder.getI64ArrayAttr(mirrored_variable_indices)); // Replace replicated cluster results with replicate op results. 
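+// Each cluster result must feed a tf.TPUReplicatedOutput with one result per
+// replica; those per-replica results are what get replaced by the matching
+// tf_device.replicate results here.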
- for (auto result_and_idx : llvm::enumerate(launch_op.getResults())) { + for (auto result_and_idx : llvm::enumerate(cluster.getResults())) { Value result = result_and_idx.value(); int idx = result_and_idx.index(); for (auto& use : result.getUses()) { Operation* def = use.getOwner(); if (!def || !llvm::isa(def)) - return launch_op.emitError() - << "requires output of " << launch_op.getOperationName() + return cluster.emitError() + << "requires output of " << cluster.getOperationName() << " to lead to a 'tf.TPUReplicatedOutput' op"; if (def->getNumResults() != num_replicas) @@ -374,14 +371,15 @@ LogicalResult ReplicateCluster(tf_device::LaunchOp launch_op, Operation* input = std::get<0>(input_and_block_arg); Value block_arg = std::get<1>(input_and_block_arg); mlir::replaceAllUsesInRegionWith(input->getResult(0), block_arg, - launch_op.body()); + cluster.body()); } - // Create terminator for replicate op and move launch into replicate. + // Create terminator for replicate op and move `tf_device.cluster` into + // replicate. builder.setInsertionPointToEnd(&replicate_op.GetBody()); auto return_op = builder.create(replicate_op.getLoc(), - launch_op.getResults()); - launch_op.getOperation()->moveBefore(return_op); + cluster.getResults()); + cluster.getOperation()->moveBefore(return_op); return success(); } @@ -395,31 +393,33 @@ LogicalResult ReplicateCluster(tf_device::LaunchOp launch_op, // `_tpu_replicate` attribute. // 2. Find users not in cluster that are interleaved between cluster ops. // 3. Find external uses of cluster ops. -// 4. Create `tf_device::LaunchOp` with results consisting of the external -// uses of cluster ops determined at 3. -// 5. Move cluster ops to `tf_device::LaunchOp` body. -// 6. Replace external uses of cluster ops uses with `tf_device::LaunchOp` +// 4. Create `tf_device.cluster` with results consisting of the external uses +// of cluster ops determined at 3. +// 5. Move cluster ops to `tf_device.cluster` body. +// 6. Replace external uses of cluster ops uses with `tf_device.cluster` // results. -// 7. Move users from 2 to after the `tf_device::LaunchOp`. -// 8. Wrap cluster (`tf_device::LaunchOp`) in a `tf_device.replicate` if +// 7. Move users from 2 to after the `tf_device.cluster`. +// 8. Wrap cluster (`tf_device.cluster`) in a `tf_device.replicate` if // attribute `num_replicas` is greater than 1. -// 9. Copy over TPUReplicateMetadata attributes to `tf_device::LaunchOp`. +// 9. Copy over TPUReplicateMetadata attributes to `tf_device.cluster`. LogicalResult FormClustersInBlock(Block* block, const MetadataMap& metadata_map) { ClusterMap clusters; LogicalResult result = CollectAndGroupClusterOps(block, &clusters); if (failed(result)) return result; - for (const auto& cluster : clusters) { - const auto& cluster_ops = cluster.getSecond(); + for (const auto& cluster_metadata_and_ops : clusters) { + const auto& cluster_ops = cluster_metadata_and_ops.getSecond(); - auto cluster_metadata = metadata_map.find(cluster.getFirst()); + auto cluster_metadata = + metadata_map.find(cluster_metadata_and_ops.getFirst()); // No TPUReplicateMetadata for a `_tpu_replicate` attribute. 
if (cluster_metadata == metadata_map.end()) { cluster_ops.front()->emitWarning() << "TPUReplicateMetadata for associated '" << kTPUReplicateAttr - << "' attribute '" << cluster.getFirst() << "' is missing"; + << "' attribute '" << cluster_metadata_and_ops.getFirst() + << "' is missing"; continue; } @@ -429,28 +429,28 @@ LogicalResult FormClustersInBlock(Block* block, llvm::SmallVector results = CollectClusterResults(block, cluster_ops); - tf_device::LaunchOp launch_op = - CreateLaunchOpForCluster(cluster_ops.back(), results); + tf_device::ClusterOp cluster = + CreateOpForCluster(cluster_ops.back(), results); - MoveClusterOpsToLaunchOp(launch_op, cluster_ops); + MoveClusterOpsToCluster(cluster, cluster_ops); - UpdateLaunchOpResultExternalUses(launch_op, results); + UpdateClusterResultExternalUses(cluster, results); - MovePrecedingClusterUsers(launch_op, preceding_users.getArrayRef()); + MovePrecedingClusterUsers(cluster, preceding_users.getArrayRef()); auto num_replicas = cluster_metadata->getSecond().get(kNumReplicasAttr); if (!num_replicas || !num_replicas.isa()) - return launch_op.emitError() + return cluster.emitError() << "requires '" << kNumReplicasAttr << "' int attribute"; if (failed(ReplicateCluster( - launch_op, num_replicas.cast().getInt()))) + cluster, num_replicas.cast().getInt()))) return failure(); - // Copy TPUReplicateMetadata attributes to launch. - launch_op.setAttrs(cluster_metadata->second); + // Copy TPUReplicateMetadata attributes to `tf_device.cluster`. + cluster.setAttrs(cluster_metadata->second); // Exclude `num_replicas` as cluster should be replicated if necessary. - launch_op.removeAttr(kNumReplicasAttr); + cluster.removeAttr(kNumReplicasAttr); } return success(); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_dynamic_layout_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_dynamic_layout_pass.cc index 6fb686995b4..3fbd8369b7e 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_dynamic_layout_pass.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_dynamic_layout_pass.cc @@ -32,7 +32,6 @@ limitations under the License. #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassRegistry.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project -#include "mlir/Support/STLExtras.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_dynamic_padding_mapper.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_dynamic_padding_mapper.cc index ad80eaaf1a6..64af2eabd3d 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_dynamic_padding_mapper.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_dynamic_padding_mapper.cc @@ -43,7 +43,7 @@ namespace TFTPU { constexpr char kPaddingMapAttr[] = "padding_map"; // This pass remaps and assigns padding maps to an encapsulated function's -// arguments from a `tf_device.launch_func` `padding_map` attribute. Remapping +// arguments from a `tf_device.cluster_func` `padding_map` attribute. Remapping // is from replicated input index to encapsulated function's operand index // (user). @@ -54,13 +54,13 @@ struct TPUDynamicPaddingMapper }; // Creates a mapping from replicated input index (in `tf_device.replicate` op) -// to `tf_device.launch_func` operand index. +// to `tf_device.cluster_func` operand index. 
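+// E.g. if operand #2 of the tf_device.cluster_func is block argument #5 of the
+// enclosing tf_device.replicate body, the map records remapped_indices[5] = 2.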
llvm::SmallDenseMap GetRemappedReplicatedInputIndices( - tf_device::LaunchFuncOp launch_func, tf_device::ReplicateOp replicate) { + tf_device::ClusterFuncOp cluster_func, tf_device::ReplicateOp replicate) { Block* replicate_block = &replicate.GetBody(); llvm::SmallDenseMap remapped_indices; - for (auto operand_and_idx : llvm::enumerate(launch_func.getOperands())) + for (auto operand_and_idx : llvm::enumerate(cluster_func.getOperands())) if (auto block_arg = operand_and_idx.value().dyn_cast()) if (block_arg.getOwner() == replicate_block) remapped_indices[block_arg.getArgNumber()] = operand_and_idx.index(); @@ -68,11 +68,12 @@ llvm::SmallDenseMap GetRemappedReplicatedInputIndices( return remapped_indices; } -// Extracts `padding_map` from `tf_device.launch_func` and remaps the associated -// replicated input indices to the encapsulated function operand indices. An -// error will be returned if an index is not found or parsing failed. +// Extracts `padding_map` from `tf_device.cluster_func` and remaps the +// associated replicated input indices to the encapsulated function operand +// indices. An error will be returned if an index is not found or parsing +// failed. LogicalResult GetRemappedPaddings( - tf_device::LaunchFuncOp launch_func, int num_replicated_args, + tf_device::ClusterFuncOp cluster_func, int num_replicated_args, const llvm::SmallDenseMap& remapped_indices, llvm::SmallVectorImpl* remapped_paddings) { auto bad_index_msg = [num_replicated_args](int32_t index, @@ -85,12 +86,12 @@ LogicalResult GetRemappedPaddings( .str(); }; - Attribute padding_map_attr = launch_func.getAttr(kPaddingMapAttr); + Attribute padding_map_attr = cluster_func.getAttr(kPaddingMapAttr); if (!padding_map_attr) return success(); auto padding_map = padding_map_attr.dyn_cast(); if (!padding_map) - return launch_func.emitOpError() + return cluster_func.emitOpError() << "requires '" << kPaddingMapAttr << "' array attribute"; for (auto padding_attr_and_idx : llvm::enumerate(padding_map)) { @@ -98,25 +99,25 @@ LogicalResult GetRemappedPaddings( auto& padding_attr = padding_attr_and_idx.value(); auto padding = padding_attr.dyn_cast(); if (!padding) - return launch_func.emitOpError( + return cluster_func.emitOpError( llvm::formatv("bad '{0}' attribute at index {1}, not a string", kPaddingMapAttr, padding_attr_and_idx.index())); tensorflow::tpu::PaddingMap padding_proto; if (!padding_proto.ParseFromString(padding.getValue().str())) - return launch_func.emitOpError(llvm::formatv( + return cluster_func.emitOpError(llvm::formatv( "bad '{0}' attribute at index {1}, failed to parse '{2}' as " "tensorflow::tpu::PaddingMap", kPaddingMapAttr, idx, padding.getValue())); const int32_t arg_index = padding_proto.arg_index(); if (arg_index >= num_replicated_args || arg_index < 0) - return launch_func.emitOpError() + return cluster_func.emitOpError() << bad_index_msg(idx, "arg_index", arg_index); const int32_t padding_arg_index = padding_proto.padding_arg_index(); if (padding_arg_index >= num_replicated_args || padding_arg_index < 0) - return launch_func.emitOpError() + return cluster_func.emitOpError() << bad_index_msg(idx, "padding_arg_index", padding_arg_index); auto arg_index_it = remapped_indices.find(arg_index); @@ -125,7 +126,7 @@ LogicalResult GetRemappedPaddings( auto padding_arg_index_it = remapped_indices.find(padding_arg_index); if (padding_arg_index_it == remapped_indices.end()) { - launch_func.emitWarning(llvm::formatv( + cluster_func.emitWarning(llvm::formatv( "bad '{0}' attribute at index {1}, unused 
padding_arg_index {2}", kPaddingMapAttr, idx, padding_arg_index)); continue; @@ -169,22 +170,21 @@ void AnnotateFunctionArgumentsWithPaddings( } } -LogicalResult RemapAndAssignPaddingMaps(tf_device::LaunchFuncOp launch_func, +LogicalResult RemapAndAssignPaddingMaps(tf_device::ClusterFuncOp cluster_func, SymbolTable* symbol_table) { - auto replicate = - llvm::dyn_cast_or_null(launch_func.getParentOp()); + auto replicate = cluster_func.getParentOfType(); // LaunchFunc is not replicated, there will be no padding. if (!replicate) return success(); const int num_replicated_args = replicate.GetBody().getNumArguments(); - auto func = symbol_table->lookup(launch_func.func()); + auto func = symbol_table->lookup(cluster_func.func()); if (!func) return success(); llvm::SmallDenseMap remapped_indices = - GetRemappedReplicatedInputIndices(launch_func, replicate); + GetRemappedReplicatedInputIndices(cluster_func, replicate); llvm::SmallVector remapped_paddings; - if (failed(GetRemappedPaddings(launch_func, num_replicated_args, + if (failed(GetRemappedPaddings(cluster_func, num_replicated_args, remapped_indices, &remapped_paddings))) return failure(); @@ -196,8 +196,8 @@ LogicalResult RemapAndAssignPaddingMaps(tf_device::LaunchFuncOp launch_func, void TPUDynamicPaddingMapper::runOnOperation() { ModuleOp module = getOperation(); SymbolTable symbol_table(module); - module.walk([&](tf_device::LaunchFuncOp launch_func) { - RemapAndAssignPaddingMaps(launch_func, &symbol_table); + module.walk([&](tf_device::ClusterFuncOp cluster_func) { + RemapAndAssignPaddingMaps(cluster_func, &symbol_table); }); } } // anonymous namespace diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_head_tail_outside_compilation.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_head_tail_outside_compilation.cc new file mode 100644 index 00000000000..02d0c3e849b --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_head_tail_outside_compilation.cc @@ -0,0 +1,317 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include +#include + +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/FormatVariadic.h" +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Block.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Visitors.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Transforms/RegionUtils.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_structs.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/device_util.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h" + +namespace mlir { +namespace TFTPU { + +// This pass extracts a CPU computation cluster with `_xla_outside_compilation` +// annotation from the head or tail of a TPU cluster. + +namespace { + +constexpr char kXlaOutsideCompilationAttr[] = "_xla_outside_compilation"; + +bool HasOutsideCompilationAttribute(Operation* op) { + return op->getAttrOfType(kXlaOutsideCompilationAttr) != nullptr; +} + +// Returns whether all operands of `op` are from values inside the +// `input_value_set`. +bool OpContainsOperandsFromSet(Operation* op, + const llvm::SetVector& input_value_set) { + for (auto operand : op->getOperands()) + if (input_value_set.count(operand) == 0) return false; + + return true; +} + +void RecordOutsideCompiledOpsAndUsages( + Operation* op, llvm::SmallSetVector* outside_compiled_ops, + llvm::SetVector* outside_compiled_op_usages) { + if (HasOutsideCompilationAttribute(op) && + OpContainsOperandsFromSet(op, *outside_compiled_op_usages)) { + outside_compiled_ops->insert(op); + outside_compiled_op_usages->insert(op->getResults().begin(), + op->getResults().end()); + } +} + +// Traverses the MLIR graph and returns a set of ops that +// are connected to inputs of TPU computation and outside compiled. +void ExtractOutsideCompiledOpsConnectedToHead( + Value input_value, llvm::SetVector* values_used_in_host_cluster, + llvm::SmallSetVector* outside_compiled_ops) { + llvm::SmallSetVector parent_outside_compiled_ops_at_head; + for (auto& usage : input_value.getUses()) { + auto head_operation = usage.getOwner(); + RecordOutsideCompiledOpsAndUsages(head_operation, + &parent_outside_compiled_ops_at_head, + values_used_in_host_cluster); + } + + // Traverse the graph and find all outside compiled ops connected from + // the `input_value`. 
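+  // This is a forward fixed-point walk over def-use edges: an op joins the
+  // head cluster only if it is outside compiled and *all* of its operands are
+  // already produced outside the TPU cluster (tracked in
+  // values_used_in_host_cluster), so partially-fed ops stay on the TPU.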
+ while (!parent_outside_compiled_ops_at_head.empty()) { + llvm::SmallSetVector connected_outside_compiled_ops; + for (auto head_outside_compiled_op : parent_outside_compiled_ops_at_head) { + auto op_results = head_outside_compiled_op->getOpResults(); + for (auto op_result : op_results) { + for (auto& use : op_result.getUses()) { + auto connected_op = use.getOwner(); + RecordOutsideCompiledOpsAndUsages(connected_op, + &connected_outside_compiled_ops, + values_used_in_host_cluster); + } + } + } + + outside_compiled_ops->insert(parent_outside_compiled_ops_at_head.begin(), + parent_outside_compiled_ops_at_head.end()); + std::swap(parent_outside_compiled_ops_at_head, + connected_outside_compiled_ops); + } +} + +// TODO(hongjunchoi): Also handle ops without inputs that are outside +// compiled. +// +// Returns set of ops that are outside compiled and are directly connected +// to inputs to the TPU computation. +llvm::SmallSetVector IdentifyOutsideCompiledOpsAtHead( + tf_device::ClusterOp tpu_cluster) { + llvm::SmallSetVector outside_compiled_at_head_ops; + llvm::SetVector values_used_in_cluster; + auto& cluster_region = tpu_cluster.body(); + getUsedValuesDefinedAbove(cluster_region, cluster_region, + values_used_in_cluster); + + auto input_value_list = llvm::to_vector<8>(values_used_in_cluster); + for (auto input_value : input_value_list) + ExtractOutsideCompiledOpsConnectedToHead( + input_value, &values_used_in_cluster, &outside_compiled_at_head_ops); + return outside_compiled_at_head_ops; +} + +// Returns output values of extracted outside compiled cluster at head that +// are used by the TPU computation. +llvm::SmallVector GetHeadExtractedClusterOutputs( + const llvm::SmallSetVector& head_outside_compiled_ops) { + llvm::SmallVector outputs; + outputs.reserve(head_outside_compiled_ops.size()); + + for (auto op : head_outside_compiled_ops) { + for (Operation* user : op->getUsers()) { + if (!head_outside_compiled_ops.count(user)) { + outputs.append(op->result_begin(), op->result_end()); + break; + } + } + } + + return outputs; +} + +// Creates new tf_device.launch op with outside compiled ops extracted +// from the head of TPU computation. +llvm::Optional IsolateHeadExtractedOpsToLaunchOp( + OpBuilder* builder, tf_device::ClusterOp cluster, + const llvm::SmallSetVector& head_outside_compiled_ops) { + if (head_outside_compiled_ops.empty()) + return llvm::Optional(); + + // Create tf_device.launch op to separate all extracted outside compiled ops + // before the tf_device.cluster. + auto output_values = + GetHeadExtractedClusterOutputs(head_outside_compiled_ops); + + llvm::SmallVector output_return_types; + output_return_types.reserve(output_values.size()); + for (auto output : output_values) + output_return_types.emplace_back(output.getType()); + + builder->setInsertionPoint(cluster); + auto host_launch_op = builder->create( + cluster.getLoc(), builder->getStringAttr(""), output_return_types); + + // Replace all usages of outside compiled ops that are used in TPU + // computation with the results of the above created launch op. + for (auto output_and_index : llvm::enumerate(output_values)) { + auto output_index = output_and_index.index(); + auto output = output_and_index.value(); + for (auto& use : output.getUses()) { + if (!head_outside_compiled_ops.count(use.getOwner())) + use.set(host_launch_op.getResult(output_index)); + } + } + + // Create terminator op for the newly created launch op. 
+ host_launch_op.body().push_back(new Block()); + builder->setInsertionPointToEnd(&host_launch_op.GetBody()); + auto terminator = builder->create( + host_launch_op.getLoc(), output_values); + + // Move all outside compile ops from cluster op to launch op. + for (auto outside_compiled_op : head_outside_compiled_ops) + outside_compiled_op->moveBefore(terminator); + + return host_launch_op; +} + +// Parses TPU compilation and execution device form tpu cluster and assigns +// host device to `host_launch` device attribute. +LogicalResult SetCompilationDeviceToHostLaunch( + OpBuilder* builder, mlir::TF::RuntimeDevices devices, + tf_device::ClusterOp tpu_cluster, tf_device::LaunchOp host_launch) { + auto num_cores_per_replica_attr = tpu_cluster.getAttrOfType( + tensorflow::kNumCoresPerReplicaAttr); + if (!num_cores_per_replica_attr) + return tpu_cluster.emitOpError( + "cluster op missing `num_cores_per_replica` attribute"); + + if (num_cores_per_replica_attr.getInt() != 1) + return tpu_cluster.emitOpError( + "outside compilation is not supported with model parallelism."); + + auto topology_attr = + tpu_cluster.getAttrOfType(tensorflow::kTopologyAttr); + if (!topology_attr) + return tpu_cluster.emitOpError("cluster op missing `topology` attribute"); + + auto device_assignment_attr = tpu_cluster.getAttrOfType( + tensorflow::kDeviceAssignmentAttr); + if (!device_assignment_attr) + return tpu_cluster.emitOpError( + llvm::formatv("requires attribute '{0}'", + tensorflow::kDeviceAssignmentAttr) + .str()); + + auto status_or_device_coodinates = + tensorflow::GetDeviceCoordinates(device_assignment_attr); + + if (!status_or_device_coodinates.ok()) + return tpu_cluster.emitError() + << "error in fetching tpu device coordinates: " + << status_or_device_coodinates.status().error_message(); + + // Determine compilation and execution devices. + auto status_or_tpu_device_assignment = + tensorflow::GetTPUCompilationAndExecutionDevices( + devices.device_names(), /*num_replicas=*/1, + /*num_cores_per_replica=*/1, topology_attr.getValue(), + status_or_device_coodinates.ConsumeValueOrDie()); + if (!status_or_tpu_device_assignment.ok()) + return tpu_cluster.emitError() + << "error in fetching TPU compilation/execution devices: " + << status_or_tpu_device_assignment.status().error_message(); + auto& tpu_device_assignment = status_or_tpu_device_assignment.ValueOrDie(); + host_launch.deviceAttr( + builder->getStringAttr(tpu_device_assignment.tpu_devices[0][0].host)); + + return success(); +} + +// Assigns host device attribute to host launch op or enclosing +// tf_device.replicate op if TPU computation is replicated. +LogicalResult HandleHostLaunchDeviceAssignment( + OpBuilder* builder, mlir::TF::RuntimeDevices devices, + tf_device::ClusterOp tpu_cluster, tf_device::LaunchOp host_launch) { + auto parent_replicate_op = + llvm::dyn_cast_or_null(host_launch.getParentOp()); + // If computation is replicated, then add TPU_REPLICATED_HOST device alias + // to the host launch op. This device alias would later be a reference to + // host device string in the device map of tf_device.replicate op + // during tpu_rewrite pass. 
+ if (parent_replicate_op) { + host_launch.deviceAttr( + builder->getStringAttr(tensorflow::kTPUReplicatedHost)); + } else { + if (failed(SetCompilationDeviceToHostLaunch(builder, devices, tpu_cluster, + host_launch))) + return failure(); + } + + return success(); +} + +struct TPUExtractHeadTailOutsideCompilation + : public PassWrapper> { + void runOnOperation() override; +}; + +void TPUExtractHeadTailOutsideCompilation::runOnOperation() { + // Get runtime devices information from the closest parent module. + auto module = getOperation(); + mlir::TF::RuntimeDevices devices; + if (failed(tensorflow::GetDevicesFromOp(module, &devices))) + return signalPassFailure(); + + OpBuilder builder(&getContext()); + auto result = module.walk([&](tf_device::ClusterOp cluster) { + auto head_outside_compiled_ops = IdentifyOutsideCompiledOpsAtHead(cluster); + auto host_launch_op = IsolateHeadExtractedOpsToLaunchOp( + &builder, cluster, head_outside_compiled_ops); + if (host_launch_op) { + if (failed(HandleHostLaunchDeviceAssignment(&builder, devices, cluster, + *host_launch_op))) { + return WalkResult::interrupt(); + } + } + + // TODO(b/155115766): Implement tail outside compiled op extraction. + return WalkResult::advance(); + }); + + if (result.wasInterrupted()) signalPassFailure(); +} + +} // anonymous namespace + +std::unique_ptr> +CreateTPUExtractHeadTailOutsideCompilationPass() { + return std::make_unique(); +} + +static PassRegistration pass( + "tf-tpu-extract-head-tail-outside-compilation", + "Extracts TPU head or tail outside compilation to separate " + "parallel_execute."); + +} // namespace TFTPU +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_outside_compilation.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_outside_compilation.cc new file mode 100644 index 00000000000..4281b85bd7f --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_outside_compilation.cc @@ -0,0 +1,192 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include +#include +#include + +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallVector.h" +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" +#include "tensorflow/core/platform/logging.h" + +namespace mlir { +namespace TFTPU { + +namespace { + +constexpr char kXlaOutsideCompilationAttr[] = "_xla_outside_compilation"; +constexpr char kDeviceAttr[] = "device"; + +// Mapping for `_xla_outside_compilation` attribute to ops of a cluster. +using OutsideClusterMap = + llvm::SmallDenseMap, 8>; + +// This pass extracts a CPU computation cluster with `_xla_outside_compilation` +// annotation from a TPU cluster. 
Each outside compilation cluster is moved to
+// a parallel_execute region. The TPU cluster is also moved to a
+// parallel_execute region.
+// TODO(b/154363171): Add example transformations.
+
+struct TPUExtractOutsideCompilation
+    : public PassWrapper {
+  void runOnFunction() override;
+};
+
+// Collects and clusters ops in `block` with the same `_xla_outside_compilation`
+// attribute into `clusters`. This returns an error if a
+// `_xla_outside_compilation` attribute of an op is empty.
+LogicalResult CollectAndGroupOutsideClusterOps(Block* block,
+                                               OutsideClusterMap* clusters) {
+  for (Operation& op : *block) {
+    if (auto attr = op.getAttrOfType(kXlaOutsideCompilationAttr)) {
+      if (attr.getValue().empty())
+        return op.emitError()
+               << "attribute '" << kXlaOutsideCompilationAttr << "' is empty";
+
+      auto it = clusters->try_emplace(attr.getValue());
+      it.first->getSecond().push_back(&op);
+    }
+  }
+
+  return success();
+}
+
+// Moves `cluster_ops` to associated `launch_op` body.
+void MoveOutsideClusterOpsToLaunchOp(
+    tf_device::LaunchOp launch_op,
+    const llvm::SmallVector& cluster_ops) {
+  MLIRContext* context = launch_op.getContext();
+  Operation* terminator = launch_op.GetBody().getTerminator();
+
+  for (Operation* cluster_op : cluster_ops) {
+    // Remove `_xla_outside_compilation` and `device` attribute from ops in the
+    // cluster as that information will be present in the `launch_op`.
+    cluster_op->removeAttr(
+        Identifier::get(kXlaOutsideCompilationAttr, context));
+    cluster_op->removeAttr(Identifier::get(kDeviceAttr, context));
+    cluster_op->moveBefore(terminator);
+  }
+}
+
+// Creates a `tf_device::LaunchOp` to wrap cluster ops.
+tf_device::LaunchOp CreateLaunchOpForOutsideCluster(
+    OpBuilder* builder, Operation* last_cluster_op) {
+  // TODO(b/154363171): Set the CPU device.
+  // An empty string placeholder is used for the device as that will be later
+  // populated with the device of the associated TPUReplicateMetadata op.
+  llvm::SmallVector result_types;
+  auto launch_op = builder->create(
+      last_cluster_op->getLoc(), builder->getStringAttr(""), result_types);
+
+  launch_op.body().push_back(new Block);
+
+  // Add terminator.
+  builder->setInsertionPointToEnd(&launch_op.GetBody());
+  builder->create(last_cluster_op->getLoc(),
+                  llvm::ArrayRef{});
+
+  return launch_op;
+}
+
+// Propagates the return from `parallel_execute_op` to parent replicate
+// op if it exists.
+void PropagateParallelExecuteReturnToReplicate(
+    tf_device::ParallelExecuteOp parallel_execute_op) {
+  // Update the return for the parallel_execute op parent.
+  auto replicate = llvm::dyn_cast_or_null(
+      parallel_execute_op.getParentOp());
+  if (replicate)
+    replicate.GetBody().getTerminator()->setOperands(
+        parallel_execute_op.execute_outputs());
+}
+
+// Creates a `parallel_execute` op in place of `tpu_cluster`, with `clusters`
+// and `tpu_cluster` as its regions.
+void CreateParallelExecuteFromOutsideClusters(
+    tf_device::ClusterOp tpu_cluster, const OutsideClusterMap& clusters) {
+  OpBuilder builder(tpu_cluster);
+  // Create parallel_execute regions. The original TPU cluster computation
+  // is the extra region.
+  int num_regions = 1 + clusters.size();
+  auto parallel_execute_op = builder.create(
+      tpu_cluster.getLoc(), num_regions, tpu_cluster.results().getTypes());
+
+  // Move outside compilation clusters to parallel_execute regions.
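+  // As a rough sketch (illustrative only), a single outside compiled cluster
+  // yields something of the form:
+  //   "tf_device.parallel_execute"() ( {
+  //     "tf_device.launch"() ( { ...outside compiled ops... } )
+  //     tf_device.return
+  //   }, {
+  //     "tf_device.cluster"() ( { ...TPU ops... } )
+  //     tf_device.return %cluster_results
+  //   })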
+ for (const auto& cluster : llvm::enumerate(clusters)) { + const auto& cluster_ops = cluster.value().getSecond(); + + Block& outside_block = + parallel_execute_op.GetRegionBlockWithIndex(cluster.index()); + builder.setInsertionPointToEnd(&outside_block); + tf_device::LaunchOp launch_op = + CreateLaunchOpForOutsideCluster(&builder, cluster_ops.back()); + MoveOutsideClusterOpsToLaunchOp(launch_op, cluster_ops); + builder.setInsertionPointToEnd(&outside_block); + // TODO(b/154363171): Handle returns from OutsideCompiled parallel_execute + // regions either through communication with TPU parallel_execute regions + // or modifying parallel_execute returns. + builder.create(tpu_cluster.getLoc(), + ArrayRef{}); + } + + // Move the launch body to last parallel_execute block. + Block& inside_block = + parallel_execute_op.GetRegionBlockWithIndex(num_regions - 1); + builder.setInsertionPointToEnd(&inside_block); + builder.create(tpu_cluster.getLoc(), + tpu_cluster.getResults()); + tpu_cluster.getOperation()->moveBefore(inside_block.getTerminator()); + + PropagateParallelExecuteReturnToReplicate(parallel_execute_op); + // TODO(b/154363171): Handle returns from OutsideCompiled parallel_execute + // regions either through communication with TPU parallel_execute regions + // or modifying parallel_execute returns. +} + +void TPUExtractOutsideCompilation::runOnFunction() { + auto extract_result = + getFunction().walk([&](tf_device::ClusterOp tpu_cluster) { + OutsideClusterMap clusters; + if (failed(CollectAndGroupOutsideClusterOps(&tpu_cluster.GetBody(), + &clusters))) + return WalkResult::interrupt(); + + if (clusters.empty()) return WalkResult::advance(); + + CreateParallelExecuteFromOutsideClusters(tpu_cluster, clusters); + + return WalkResult::advance(); + }); + + if (extract_result.wasInterrupted()) return signalPassFailure(); +} + +} // namespace + +std::unique_ptr> +CreateTPUExtractOutsideCompilationPass() { + return std::make_unique(); +} + +static PassRegistration pass( + "tf-tpu-extract-outside-compilation", + "Extracts TPU outside compilation to separate parallel_execute."); + +} // namespace TFTPU +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc index a635fdb9a1f..a7ad6a964b9 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc @@ -64,35 +64,30 @@ static llvm::cl::opt tpu_compile_metadata_debug( "'tf._TPUCompileMlir' op as a proto debug string")); constexpr char kNumReplicasAttr[] = "num_replicas"; -constexpr char kNumCoresPerReplicaAttr[] = "num_cores_per_replica"; constexpr char kStepMarkerLocationAttr[] = "step_marker_location"; constexpr char kPaddingMapAttr[] = "padding_map"; -constexpr char kTopologyAttr[] = "topology"; -constexpr char kDeviceAssignmentAttr[] = "device_assignment"; constexpr char kDeviceAttr[] = "device"; constexpr char kDevicesAttr[] = "devices"; constexpr char kVersionsAttr[] = "tf.versions"; constexpr char kBadStringArrayElementMsg[] = "bad '{0}' attribute at index {1}, not a string"; -constexpr char kBadIntArrayElementMsg[] = - "bad '{0}' attribute at index {1}, not an int"; constexpr char kBadArrayElementMsg[] = "bad '{0}' attribute at index {1} with value '{2}': failed to parse to {3}"; constexpr char kBadArrayAttrLengthMsg[] = "bad '{0}' attribute, expected array attribute of size {1}, got size {2}"; -// Rewrites `tf_device.launch_func` operations 
assigned to TPU into actual TPU +// Rewrites `tf_device.cluster_func` operations assigned to TPU into actual TPU // jit-compile runtime ops. // // For example: -// %1 = "tf_device.launch_func"(%0) {_tpu_replicate = "cluster", func = +// %1 = "tf_device.cluster_func"(%0) {_tpu_replicate = "cluster", func = // @tpu_func} // %2 = "tf.SomeOp"(%1) // // Would become following ops (unimportant attributes, types are omitted): // %1 = "tf.Shape"(%0) -// %2:2 = "tf.MLIRCompileToTPU"(%1) {module = ""} +// %2:2 = "tf._TPUCompileMlir"(%1) {module = ""} // "tf.TPUCompileSucceededAssert"(%2#0) // %3 = "tf.TPUExecute"(%0, %2#1) // %4 = "tf.SomeOp"(%3) @@ -163,36 +158,10 @@ LogicalResult EncapsulateFuncAndSerialize(FuncOp entry_func, return success(); } -// Extracts device coordinates from a device assignment attribute on an op. -LogicalResult GetDeviceCoordinates( - tf_device::LaunchFuncOp op, - llvm::SmallVectorImpl* device_assignment) { - auto device_assignment_attr = - op.getAttrOfType(kDeviceAssignmentAttr); - if (!device_assignment_attr) - return op.emitOpError(CreateMissingAttributeMsg(kDeviceAssignmentAttr)); - - device_assignment->reserve(device_assignment_attr.size()); - - for (auto device_coordinate_and_idx : - llvm::enumerate(device_assignment_attr)) { - auto device_coordinate = - device_coordinate_and_idx.value().dyn_cast(); - if (!device_coordinate) - return op.emitOpError(llvm::formatv(kBadIntArrayElementMsg, - kDeviceAssignmentAttr, - device_coordinate_and_idx.index())); - - device_assignment->push_back(device_coordinate.getInt()); - } - - return success(); -} - // Populates a TPUCompileMetadataProto with StepMarkerLocation from a -// `tf_device::LaunchFuncOp`. +// `tf_device::ClusterFuncOp`. LogicalResult SetMetadataProtoStepMarkerLocation( - tf_device::LaunchFuncOp op, + tf_device::ClusterFuncOp op, tensorflow::tpu::TPUCompileMetadataProto* metadata) { auto step_marker_location = op.getAttrOfType(kStepMarkerLocationAttr); @@ -216,9 +185,9 @@ LogicalResult SetMetadataProtoStepMarkerLocation( } // Populates a TPUCompileMetadataProto with PaddingMap from a -// `tf_device::LaunchFuncOp`. +// `tf_device::ClusterFuncOp`. LogicalResult SetMetadataProtoPaddingMap( - tf_device::LaunchFuncOp op, + tf_device::ClusterFuncOp op, tensorflow::tpu::TPUCompileMetadataProto* metadata) { auto padding_map = op.getAttrOfType(kPaddingMapAttr); if (!padding_map) @@ -259,9 +228,9 @@ LogicalResult SetOpSharding(Operation* op, Attribute attr, llvm::StringRef name, } // Populates a TPUCompileMetadataProto with argument types and sharding from a -// `tf_device::LaunchFuncOp`. +// `tf_device::ClusterFuncOp`. LogicalResult SetMetadataProtoArgs( - tf_device::LaunchFuncOp op, + tf_device::ClusterFuncOp op, tensorflow::tpu::TPUCompileMetadataProto* metadata) { auto input_shardings = op.getAttrOfType(tensorflow::kInputShardingAttr); @@ -314,9 +283,9 @@ LogicalResult SetMetadataProtoArgs( } // Populates a TPUCompileMetadataProto with result sharding from a -// `tf_device::LaunchFuncOp`. +// `tf_device::ClusterFuncOp`. LogicalResult SetMetadataProtoRetvals( - tf_device::LaunchFuncOp op, + tf_device::ClusterFuncOp op, tensorflow::tpu::TPUCompileMetadataProto* metadata) { auto output_shardings = op.getAttrOfType(tensorflow::kOutputShardingAttr); @@ -341,11 +310,11 @@ LogicalResult SetMetadataProtoRetvals( } // Populates a TPUCompileMetadataProto from attributes of a -// `tf_device::LaunchFuncOp`. If any necessary attributes are missing from the +// `tf_device::ClusterFuncOp`. 
If any necessary attributes are missing from the // op, a failure will be returned. // TODO(lyandy): Support session handle and guaranteed consts. -LogicalResult SetMetadataProtoFromLaunchFuncOp( - tf_device::LaunchFuncOp op, int num_replicas, int num_cores_per_replica, +LogicalResult SetMetadataProtoFromClusterFuncOp( + tf_device::ClusterFuncOp op, int num_replicas, int num_cores_per_replica, llvm::Optional&& xla_device_assignment, tensorflow::tpu::TPUCompileMetadataProto* metadata) { metadata->set_num_replicas(num_replicas); @@ -377,7 +346,7 @@ tf_device::LaunchOp WrapOpInLaunch(OpBuilder* builder, Location loc, builder->setInsertionPointToEnd(&launch.GetBody()); builder->create(loc, op->getResults()); - // Move op inside launch. + // Move op inside cluster. op->moveBefore(launch.GetBody().getTerminator()); builder->restoreInsertionPoint(insert_point); @@ -386,16 +355,16 @@ tf_device::LaunchOp WrapOpInLaunch(OpBuilder* builder, Location loc, } // Create a `tf._TPUCompileMlir` that contains a MLIR module that is -// functionally equivalent to the function referenced by launch_func. +// functionally equivalent to the function referenced by cluster_func. Operation* BuildCompileOp( - tf_device::LaunchFuncOp launch_func, int num_replicas, + tf_device::ClusterFuncOp cluster_func, int num_replicas, int num_cores_per_replica, llvm::StringRef compilation_device, llvm::Optional&& xla_device_assignment, OpBuilder* builder) { // Set metadata from attributes. tensorflow::tpu::TPUCompileMetadataProto metadata; - if (failed(SetMetadataProtoFromLaunchFuncOp( - launch_func, num_replicas, num_cores_per_replica, + if (failed(SetMetadataProtoFromClusterFuncOp( + cluster_func, num_replicas, num_cores_per_replica, std::move(xla_device_assignment), &metadata))) return nullptr; @@ -405,28 +374,28 @@ Operation* BuildCompileOp( else metadata.SerializeToString(&txt_metadata); - // Build a shape op for each input to launch_func. + // Build a shape op for each input to cluster_func. // TODO(b/139377366): When shape inference is ready, we can use compile time // shape inference to get inputs that have static shapes and only use shape // ops for the rest. llvm::SmallVector compile_op_operands; - compile_op_operands.reserve(launch_func.getNumOperands()); + compile_op_operands.reserve(cluster_func.getNumOperands()); - for (auto operand_and_idx : llvm::enumerate(launch_func.getOperands())) { + for (auto operand_and_idx : llvm::enumerate(cluster_func.getOperands())) { // Skip adding shape op for operands that have static shapes. 
tensorflow::PartialTensorShape shape( metadata.args(operand_and_idx.index()).shape()); if (shape.IsFullyDefined()) continue; auto shape_op = builder->create( - launch_func.getLoc(), + cluster_func.getLoc(), RankedTensorType::get({-1}, builder->getIntegerType(64)), operand_and_idx.value()); compile_op_operands.emplace_back(shape_op.getResult()); } - FlatSymbolRefAttr func_attr = launch_func.funcAttr(); - FuncOp func = launch_func.getParentOfType().lookupSymbol( + FlatSymbolRefAttr func_attr = cluster_func.funcAttr(); + FuncOp func = cluster_func.getParentOfType().lookupSymbol( func_attr.getValue()); std::string txt_module; @@ -436,7 +405,7 @@ Operation* BuildCompileOp( RankedTensorType::get({}, builder->getType()); auto compile_op = builder->create( - launch_func.getLoc(), /*compilation_status=*/result_type, /*program=*/ + cluster_func.getLoc(), /*compilation_status=*/result_type, /*program=*/ llvm::SmallVector(num_cores_per_replica, result_type), compile_op_operands, txt_module, txt_metadata); @@ -448,43 +417,56 @@ Operation* BuildCompileOp( // core, and all replica devices per core are grouped together. void AssignDevicesToReplicate( tf_device::ReplicateOp replicate, - llvm::ArrayRef> execution_devices, + llvm::ArrayRef> + tpu_devices, OpBuilder* builder) { if (!replicate) return; - const int num_replicas = execution_devices.size(); - const int num_cores_per_replica = execution_devices.front().size(); + const int num_replicas = tpu_devices.size(); + const int num_cores_per_replica = tpu_devices.front().size(); llvm::SmallVector device_attrs; for (int core = 0; core < num_cores_per_replica; ++core) { llvm::SmallVector devices_by_core; devices_by_core.reserve(num_replicas); for (int replica = 0; replica < num_replicas; ++replica) - devices_by_core.push_back(execution_devices[replica][core]); + devices_by_core.push_back(tpu_devices[replica][core].device); device_attrs.push_back( builder->getNamedAttr(tensorflow::GetDeviceAliasForLogicalCore(core), builder->getStrArrayAttr(devices_by_core))); } + // For data parallelism, also add replicated host devices, as these are + // necessary for outside compilation. + if (num_cores_per_replica == 1) { + llvm::SmallVector hosts; + hosts.reserve(num_replicas); + for (int replica = 0; replica < num_replicas; ++replica) + hosts.push_back(tpu_devices[replica][0].host); + + device_attrs.push_back(builder->getNamedAttr( + tensorflow::kTPUReplicatedHost, builder->getStrArrayAttr(hosts))); + } + replicate.setAttr(kDevicesAttr, builder->getDictionaryAttr(device_attrs)); } // Creates a `tf.TPUExecute` op that executes TPU program. LogicalResult BuildExecuteOp( const int core_id, llvm::ArrayRef output_sharding_config, - llvm::ArrayRef inputs, tf_device::LaunchFuncOp launch_func, + llvm::ArrayRef inputs, tf_device::ClusterFuncOp cluster_func, OpBuilder* builder, TF::TPUExecuteOp* execute_op) { // TODO(b/139377366): Need to snapshot all resource variable inputs in // follow-up CLs. llvm::SmallVector output_types; auto result = tensorflow::GetOutputTypesForLogicalDeviceComputation( - core_id, output_sharding_config, launch_func, &output_types); + core_id, output_sharding_config, cluster_func, &output_types); if (failed(result)) return failure(); - // TPUExecute has same output types as launch_func. + // TPUExecute has same output types as cluster_func. 
*execute_op = builder->create( - launch_func.getLoc(), output_types, inputs, + cluster_func.getLoc(), output_types, inputs, llvm::ArrayRef{}); return success(); } @@ -492,32 +474,33 @@ LogicalResult BuildExecuteOp( // Creates a tf_device.parallel_execute op that wraps TPUExecute op to // represent execution of TPU program in multiple logical cores. LogicalResult BuildParallelExecuteOp( - llvm::ArrayRef> execution_devices, + llvm::ArrayRef> + tpu_devices, llvm::ArrayRef output_sharding_config, - Operation* compile_op, tf_device::LaunchFuncOp launch_func, + Operation* compile_op, tf_device::ClusterFuncOp cluster_func, OpBuilder* builder, tf_device::ParallelExecuteOp* parallel_execute_op) { - const int num_cores_per_replica = execution_devices.front().size(); + const int num_cores_per_replica = tpu_devices.front().size(); // parallel_execute op returns concatenated list of return values of // all its regions. // // TODO(b/149102702): Correctly map inputs to parallel_execute op via - // identifying xla_sharding op in the launch_func function. - const auto& launch_result_types = launch_func.getResultTypes(); + // identifying xla_sharding op in the cluster_func function. + const auto cluster_result_types = cluster_func.getResultTypes(); llvm::SmallVector concatenated_output_types; - concatenated_output_types.reserve(launch_result_types.size() * + concatenated_output_types.reserve(cluster_result_types.size() * num_cores_per_replica); for (int core = 0; core < num_cores_per_replica; ++core) { llvm::SmallVector output_types; auto result = tensorflow::GetOutputTypesForLogicalDeviceComputation( - core, output_sharding_config, launch_func, &output_types); + core, output_sharding_config, cluster_func, &output_types); if (failed(result)) return failure(); for (Type t : output_types) concatenated_output_types.emplace_back(t); } *parallel_execute_op = builder->create( - launch_func.getLoc(), num_cores_per_replica, concatenated_output_types); + cluster_func.getLoc(), num_cores_per_replica, concatenated_output_types); // Extract inputs for each region of the parallel_execute op. The i-th // element in the list represents the input lists to TPU computation for @@ -525,10 +508,10 @@ LogicalResult BuildParallelExecuteOp( llvm::SmallVector, 4> input_list; builder->setInsertionPoint(*parallel_execute_op); auto result = tensorflow::ExtractInputsForLogicalDevices( - num_cores_per_replica, launch_func, builder, &input_list); + num_cores_per_replica, cluster_func, builder, &input_list); if (failed(result)) return failure(); - const bool replicated = execution_devices.size() != 1; + const bool replicated = tpu_devices.size() != 1; // For each logical core, create a region with TPUExecute op. assert(input_list.size() == num_cores_per_replica); for (int core = 0; core < num_cores_per_replica; ++core) { @@ -539,13 +522,13 @@ LogicalResult BuildParallelExecuteOp( // // TODO(b/148913294): Identify inputs/return values specific to each // logical core TPU execution by parsing xla_sharding op in - // launch_func. + // cluster_func. auto execute_inputs = input_list[core]; execute_inputs.emplace_back(compile_op->getResult(core + 1)); TF::TPUExecuteOp execute; result = BuildExecuteOp(core, output_sharding_config, execute_inputs, - launch_func, builder, &execute); + cluster_func, builder, &execute); if (failed(result)) return failure(); // If computation is replicated, use aliased device. Otherwise there is only @@ -553,7 +536,7 @@ LogicalResult BuildParallelExecuteOp( // op. std::string device = replicated ? 
tensorflow::GetDeviceAliasForLogicalCore(core) - : execution_devices.front()[core]; + : tpu_devices.front()[core].device; auto region_launch_op = WrapOpInLaunch(builder, region.getParent()->getLoc(), execute, device); @@ -566,13 +549,14 @@ LogicalResult BuildParallelExecuteOp( } tf_device::LaunchOp AssignDevicesToReplicatedExecute( - llvm::ArrayRef> execution_devices, + llvm::ArrayRef> + tpu_devices, Operation* execute_op, OpBuilder* builder) { - const bool replicated = execution_devices.size() != 1; + const bool replicated = tpu_devices.size() != 1; // If computation is replicated, use aliased device. Otherwise there is only // one execution device and the device is assigned to the execute op. std::string device = replicated ? tensorflow::GetDeviceAliasForLogicalCore(0) - : execution_devices.front().front(); + : tpu_devices.front().front().device; return WrapOpInLaunch(builder, execute_op->getLoc(), execute_op, device); } @@ -587,16 +571,16 @@ void BuildTPUCompileSucceededAssertOp(Operation* compile_op, WrapOpInLaunch(builder, compile_op->getLoc(), assert_op, compilation_device); } -// Rewrites a `tf_device.launch_func` operation into a set of TPU Runtime -// Operations that jit-compiles and executes function in `tf_device.launch_func` -// on TPU. Device assignment is determined from available devices in `devices`. -// If it is not possible to rewrite the operation or device assignment fails, a -// failure will be returned. +// Rewrites a `tf_device.cluster_func` operation into a set of TPU Runtime +// Operations that jit-compiles and executes function in +// `tf_device.cluster_func` on TPU. Device assignment is determined from +// available devices in `devices`. If it is not possible to rewrite the +// operation or device assignment fails, a failure will be returned. // -// For example, a non replicated `tf_device.launch_func`: +// For example, a non replicated `tf_device.cluster_func`: // // func @main(%arg0: tensor) { -// %0 = "tf_device.launch_func"(%arg0) +// %0 = "tf_device.cluster_func"(%arg0) // {_tpu_replicate = "cluster0", device = "", func = @_func} : // (tensor) -> tensor // return @@ -613,12 +597,12 @@ void BuildTPUCompileSucceededAssertOp(Operation* compile_op, // return // } // -// and a replicated `tf_device.launch_func`: +// and a replicated `tf_device.cluster_func`: // // func @main(%arg0: tensor, %arg1: tensor) { // %0:2 = tf_device.replicate([%arg0, %arg1] as %ri: tensor) // {n = 2 : i32} { -// %1 = "tf_device.launch_func"(%ri) +// %1 = "tf_device.cluster_func"(%ri) // {_tpu_replicate = "cluster0", device = "", func = @_func} : // (tensor) -> tensor // tf_device.return %1 : tensor @@ -641,53 +625,78 @@ void BuildTPUCompileSucceededAssertOp(Operation* compile_op, // return // } LogicalResult Rewrite( - tf_device::LaunchFuncOp launch_func, + tf_device::ClusterFuncOp cluster_func, llvm::ArrayRef devices, OpBuilder* builder) { - // Skip non-tpu device launch_func. - auto replicate_attr = launch_func.getAttrOfType("_tpu_replicate"); + // Skip non-tpu device cluster_func. + auto replicate_attr = + cluster_func.getAttrOfType("_tpu_replicate"); if (!replicate_attr) return success(); // Collect `num_replicas` and `num_cores_per_replica` attributes. int num_replicas = 1; tf_device::ReplicateOp replicate = - launch_func.getParentOp() + cluster_func.getParentOp() ? 
llvm::dyn_cast_or_null( - launch_func.getParentOp()) + cluster_func.getParentOp()) : nullptr; if (replicate) num_replicas = replicate.n().getLimitedValue(); - auto num_cores_per_replica_attr = - launch_func.getAttrOfType(kNumCoresPerReplicaAttr); + auto num_cores_per_replica_attr = cluster_func.getAttrOfType( + tensorflow::kNumCoresPerReplicaAttr); if (!num_cores_per_replica_attr) - return launch_func.emitOpError( - CreateMissingAttributeMsg(kNumCoresPerReplicaAttr)); + return cluster_func.emitOpError( + CreateMissingAttributeMsg(tensorflow::kNumCoresPerReplicaAttr)); int num_cores_per_replica = num_cores_per_replica_attr.getInt(); - auto topology_attr = launch_func.getAttrOfType(kTopologyAttr); + auto topology_attr = + cluster_func.getAttrOfType(tensorflow::kTopologyAttr); if (!topology_attr) - return launch_func.emitOpError(CreateMissingAttributeMsg(kTopologyAttr)); + return cluster_func.emitOpError( + CreateMissingAttributeMsg(tensorflow::kTopologyAttr)); - llvm::SmallVector device_assignment; - if (failed(GetDeviceCoordinates(launch_func, &device_assignment))) - return failure(); + auto device_assignment_attr = cluster_func.getAttrOfType( + tensorflow::kDeviceAssignmentAttr); + if (!device_assignment_attr) + return cluster_func.emitOpError( + llvm::formatv("requires attribute '{0}'", + tensorflow::kDeviceAssignmentAttr) + .str()); + + auto status_or_device_coodinates = + tensorflow::GetDeviceCoordinates(device_assignment_attr); + if (!status_or_device_coodinates.ok()) + return cluster_func.emitError() + << "error in fetching tpu device coordinates: " + << status_or_device_coodinates.status().error_message(); // Determine compilation and execution devices. auto status_or_tpu_device_assignment = tensorflow::GetTPUCompilationAndExecutionDevices( devices, num_replicas, num_cores_per_replica, - topology_attr.getValue(), device_assignment); + topology_attr.getValue(), + status_or_device_coodinates.ConsumeValueOrDie()); if (!status_or_tpu_device_assignment.ok()) - return launch_func.emitError() + return cluster_func.emitError() << "error in fetching TPU compilation/execution devices: " << status_or_tpu_device_assignment.status().error_message(); // Create compile op. auto& tpu_device_assignment = status_or_tpu_device_assignment.ValueOrDie(); - builder->setInsertionPoint(launch_func); + builder->setInsertionPoint(cluster_func); + + // Create the TPUCompileMlir and TPUCompileSucceededAssert outside of + // parallel_execute region if it exists. + if (llvm::isa(cluster_func.getParentOp())) { + // Currently, outside compilation and model parallelism are not supported + // together. + assert(num_cores_per_replica == 1); + builder->setInsertionPoint(cluster_func.getParentOp()); + } + Operation* compile_op = BuildCompileOp( - launch_func, num_replicas, num_cores_per_replica, + cluster_func, num_replicas, num_cores_per_replica, tpu_device_assignment.compilation_device, std::move(tpu_device_assignment.xla_device_assignment), builder); if (!compile_op) return failure(); @@ -696,54 +705,55 @@ LogicalResult Rewrite( // the same _tpu_replicate attribute and replace it with the result of the // compile op. This op is used as a placeholder to hook during graph creation // the other ops that are intended to consume the compile result. 
- Block* block = launch_func.getOperation()->getBlock(); + Block* block = cluster_func.getOperation()->getBlock(); for (auto compile_result_op : block->getOps()) compile_result_op.output().replaceAllUsesWith(compile_op->getResult(0)); BuildTPUCompileSucceededAssertOp( compile_op, tpu_device_assignment.compilation_device, builder); - AssignDevicesToReplicate(replicate, tpu_device_assignment.execution_devices, + AssignDevicesToReplicate(replicate, tpu_device_assignment.tpu_devices, builder); llvm::SmallVector output_shardings; auto result = tensorflow::ParseAndValidateOutputSharding( - num_cores_per_replica, launch_func, &output_shardings); + num_cores_per_replica, cluster_func, &output_shardings); if (failed(result)) return failure(); + builder->setInsertionPoint(cluster_func); if (num_cores_per_replica > 1) { // For model parallelism, tf_device.parallel_execute is used to express // concurrent device execution across multiple logical devices. tf_device::ParallelExecuteOp execute_op; - result = BuildParallelExecuteOp(tpu_device_assignment.execution_devices, - output_shardings, compile_op, launch_func, + result = BuildParallelExecuteOp(tpu_device_assignment.tpu_devices, + output_shardings, compile_op, cluster_func, builder, &execute_op); if (failed(result)) return failure(); // As tf_device.parallel_execute wraps # logical cores number of TPUExecute // ops, the number of return values of parallel_execute op exceeds that of - // launch_func op. As so, each return value of parallel_execute op must be - // mapped with corresponding return value usages of launch_func. - tensorflow::RemapOutputsFromLogicalDevices(launch_func.getLoc(), - output_shardings, launch_func, + // cluster_func op. As so, each return value of parallel_execute op must be + // mapped with corresponding return value usages of cluster_func. 
+ tensorflow::RemapOutputsFromLogicalDevices(cluster_func.getLoc(), + output_shardings, cluster_func, execute_op, builder); } else { - llvm::SmallVector execute_inputs(launch_func.getOperands()); + llvm::SmallVector execute_inputs(cluster_func.getOperands()); execute_inputs.emplace_back(compile_op->getResult(1)); TF::TPUExecuteOp execute_op; result = BuildExecuteOp( - /*core_id=*/0, output_shardings, execute_inputs, launch_func, builder, + /*core_id=*/0, output_shardings, execute_inputs, cluster_func, builder, &execute_op); if (failed(result)) return failure(); tf_device::LaunchOp launch_op = AssignDevicesToReplicatedExecute( - tpu_device_assignment.execution_devices, execute_op, builder); - launch_func.replaceAllUsesWith(launch_op); + tpu_device_assignment.tpu_devices, execute_op, builder); + cluster_func.replaceAllUsesWith(launch_op); } - launch_func.erase(); + cluster_func.erase(); return success(); } @@ -754,7 +764,7 @@ void TPURewritePass::runOnOperation() { return signalPassFailure(); OpBuilder builder(&getContext()); - auto result = getOperation().walk([&](tf_device::LaunchFuncOp op) { + auto result = getOperation().walk([&](tf_device::ClusterFuncOp op) { if (failed(Rewrite(op, devices.device_names(), &builder))) return WalkResult::interrupt(); @@ -777,7 +787,7 @@ std::unique_ptr> CreateTPURewritePass() { static PassRegistration pass( "tf-tpu-rewrite", - "Rewriting `tf_device.launch_func` on TPUs into TPU runtime ops"); + "Rewriting `tf_device.cluster_func` on TPUs into TPU runtime ops"); } // namespace TFTPU } // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_sharding_identification_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_sharding_identification_pass.cc index f0455cf010a..f8b6e364f55 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_sharding_identification_pass.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_sharding_identification_pass.cc @@ -24,8 +24,10 @@ limitations under the License. #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Block.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/Function.h" // from @llvm-project #include "mlir/IR/Module.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassRegistry.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" @@ -45,18 +47,19 @@ struct TPUShardingIdentificationPass void runOnOperation() override; }; -// XlaSharding op may be direct user of inputs but it may also be followed by -// an Identity op and, in the case where bfloat16 type is used, Cast op may be -// added right after the input. As so, parse the users of the operation to -// access connected XlaSharding op. +// Sets `sharding_op` if `op` is XlaShardingOp or if XlaSharding op is adjacent +// to `op`. XlaSharding op may be direct user of inputs but it may also be +// followed by an Identity op and, in the case where bfloat16 type is used, Cast +// op may be added right after the input. As so, parse the users of the +// operation to access connected XlaSharding op. // -// TODO(hongjunchoi): Consider explicitly checking op patterns to detect -// sharded inputs. 
-void GetAdjacentToXlaShardingOp( - Operation* op, llvm::Optional* sharding_op) { - // TODO(hongjunchoi): Detect the case when sharding configuration is - // ambiguous for a single input (i.e. multiple different XlaSharding ops - // with different configuration policies are connected). +// TODO(hongjunchoi): Consider explicitly checking op patterns to detect sharded +// inputs. +void GetAdjacentXlaShardingOp(Operation* op, + llvm::Optional* sharding_op) { + // TODO(hongjunchoi): Detect the case when sharding configuration is ambiguous + // for a single input (i.e. multiple different XlaSharding ops with different + // configuration policies are connected). if (sharding_op->hasValue()) return; if (auto sharding = llvm::dyn_cast(op)) { @@ -66,100 +69,190 @@ void GetAdjacentToXlaShardingOp( if (llvm::isa(op) || llvm::isa(op)) { for (auto user : op->getUsers()) - GetAdjacentToXlaShardingOp(user, sharding_op); + GetAdjacentXlaShardingOp(user, sharding_op); } } -// Parse XlaSharding op connected to input args. If Input to -// tf_device.LaunchFunc op is of resource type, then XlaSharding op -// will be connected to following ReadVariable op. +// Parses XlaSharding op connected to input args. If Input to +// tf_device.ClusterFunc op is of resource type, then XlaSharding op will be +// connected to following ReadVariable op. // -// TODO(hongjunchoi): Add logic to parse XlaSharding op inside a -// Call op or if/while op. -llvm::Optional ParseInputSharding(const FuncOp func, - const int arg_index, - const Value& arg) { +// TODO(hongjunchoi): Add logic to parse XlaSharding op inside a Call op or +// If/While op. +llvm::Optional ParseInputSharding(const Value& arg) { llvm::Optional parsed_sharding_op; for (auto user : arg.getUsers()) { if (parsed_sharding_op) continue; - GetAdjacentToXlaShardingOp(user, &parsed_sharding_op); + GetAdjacentXlaShardingOp(user, &parsed_sharding_op); if (parsed_sharding_op) continue; if (llvm::isa(user)) for (auto read_variable_user : user->getUsers()) - GetAdjacentToXlaShardingOp(read_variable_user, &parsed_sharding_op); + GetAdjacentXlaShardingOp(read_variable_user, &parsed_sharding_op); } if (!parsed_sharding_op) return llvm::Optional(); - return tensorflow::ParseShardingAttribute(parsed_sharding_op->getOperation()); + return parsed_sharding_op.getValue()._XlaSharding(); } -// If operand of return value of tf_device.LaunchFunc op is directly from -// XlaSharding op, return the provided sharding configuration. +// Returns the provided sharding configuration if operand of return value of +// tf_device.ClusterFunc op is directly from XlaSharding op, llvm::Optional ParseReturnValueSharding(FuncOp func, const int output_index, const OpOperand& operand) { if (auto sharding_op = llvm::dyn_cast_or_null( - operand.get().getDefiningOp())) { - return tensorflow::ParseShardingAttribute(sharding_op.getOperation()); - } + operand.get().getDefiningOp())) + return sharding_op._XlaSharding(); return llvm::Optional(); } -// If XlaSharding op is connected to input/output of the tf_device.LaunchFuncOp, -// then add attributes to the op specifying the sharding configurations. -void IdentifyXlaShardingForTPUComputation(Builder* builder, - tf_device::LaunchFuncOp launch_func) { - // Look up function definition from module. - FuncOp func = launch_func.getParentOfType().lookupSymbol( - launch_func.func()); - Block& func_entry_block = func.getBody().getBlocks().front(); +// Includes information on Func op and argument index of the input value. 
This
+// is used to trace Value that is fed into function call ops.
+struct FunctionAndArgumentInfo {
+  FuncOp func;
+  int argument_index;
+};
-  // By default inputs have maximal sharding and inputs are assigned to
-  // logical core 0 if no sharding is defined.
-  const std::string logical_core_0_sharding =
-      xla::sharding_builder::AssignDevice(0).SerializeAsString();
-  auto logical_core_0_sharding_attr =
-      builder->getStringAttr(logical_core_0_sharding);
+// Adds tf.PartitionedCall op or tf.StatefulPartitionedCall op to `list`. If
+// `op` is a function call op, then find the func op from the provided `module`
+// and add the func op with `arg_index` to `list`. `list` will later be used to
+// trace mlir::Value that is fed into (potentially nested) function call ops.
+void AddFunctionalOpsToList(
+    const int arg_index, ModuleOp module, Operation* op,
+    llvm::SmallVectorImpl* list) {
+  if (auto pcall_op = llvm::dyn_cast(op)) {
+    if (!pcall_op.f().isa()) return;
+
+    auto pcall_func = llvm::cast(
+        module.lookupSymbol(pcall_op.f().getRootReference()));
+    assert(pcall_func);
+    list->emplace_back(FunctionAndArgumentInfo{pcall_func, arg_index});
+
+  } else if (auto spcall_op =
+                 llvm::dyn_cast(op)) {
+    auto sp_call_func = llvm::cast(module.lookupSymbol(spcall_op.f()));
+    assert(sp_call_func);
+    list->emplace_back(FunctionAndArgumentInfo{sp_call_func, arg_index});
+  }
+}
+
+// Walks the MLIR graph from `arg` and returns a list of all function call ops
+// to which `arg` is directly connected.
+//
+// For example:
+// argument0 -> PartitionedCallOp -> StatefulPartitionedCallOp -> AddOp
+//
+// For the above case, the PartitionedCall op and the StatefulPartitionedCall
+// op will be returned.
+llvm::SmallVector ExtractFunctionsConnectedToArg(
+    BlockArgument arg, ModuleOp module) {
+  llvm::SmallVector functions_connected_to_arg;
+  for (auto& arg_use : arg.getUses())
+    AddFunctionalOpsToList(arg_use.getOperandNumber(), module,
+                           arg_use.getOwner(), &functions_connected_to_arg);
+
+  llvm::SmallVector functions_to_parse{
+      functions_connected_to_arg.begin(), functions_connected_to_arg.end()};
+
+  while (!functions_to_parse.empty()) {
+    llvm::SmallVector newly_discovered_functions;
+    for (auto function_info : functions_to_parse) {
+      Block& func_entry_block =
+          function_info.func.getBody().getBlocks().front();
+      auto argument =
+          func_entry_block.getArgument(function_info.argument_index);
+
+      for (auto& arg_use : argument.getUses())
+        AddFunctionalOpsToList(arg_use.getOperandNumber(), module,
+                               arg_use.getOwner(), &newly_discovered_functions);
+    }
+
+    functions_connected_to_arg.append(newly_discovered_functions.begin(),
+                                      newly_discovered_functions.end());
+    std::swap(functions_to_parse, newly_discovered_functions);
+  }
+
+  return functions_connected_to_arg;
+}
+
+// Walks the graph from the arguments of the `cluster_func_op` and extracts
+// sharding configurations for all inputs by parsing XlaSharding ops connected
+// to the arguments. If an argument to the `cluster_func_op` directly feeds into
+// another function call op, the function definition is recursively walked to
+// find the connected XlaSharding op.
+void IdentifyXlaShardingForComputationInputs(
+    StringRef logical_core_0_sharding, tf_device::ClusterFuncOp cluster_func_op,
+    FuncOp cluster_function, Builder* builder) {
+  // Look up function definition from module.
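+  // When an argument has no adjacent XlaSharding op, the logic below follows
+  // its (possibly nested) PartitionedCall/StatefulPartitionedCall uses and
+  // looks for an XlaSharding op on the corresponding callee argument instead.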
+ Block& cluster_function_block = + cluster_function.getBody().getBlocks().front(); + ModuleOp module = cluster_func_op.getParentOfType(); llvm::SmallVector sharding_for_args( - func_entry_block.getNumArguments(), logical_core_0_sharding); + cluster_function_block.getNumArguments(), logical_core_0_sharding); - // Iterate through input arguments to the entry block of tf_device.LaunchFunc. - // For input ops, look for following XlaSharding ops. XlaSharding ops can: + // Iterate through input arguments to the entry block of + // tf_device.ClusterFunc. For input ops, look for following XlaSharding ops. + // XlaSharding ops can: // 1) Directly follow the input argument if input argument has non-resource // types. // 2) Follow ReadVariableOp if the input type is of resource type. // 3) Follow IdentityOp or CastOp after above cases (1), (2). // - // Sharding configurations are added to the tf_device.LaunchFunc as an + // Sharding configurations are added to the tf_device.ClusterFunc as an // attribute and the function as an argument attribute. - for (auto& arg : func_entry_block.getArguments()) { - const int index = arg.getArgNumber(); - auto arg_sharding = ParseInputSharding(func, index, arg); + for (auto& arg : cluster_function_block.getArguments()) { + auto arg_sharding = ParseInputSharding(arg); + const int arg_index_to_tpu_computation = arg.getArgNumber(); + + if (!arg_sharding.hasValue()) { + auto connected_functions_to_arg = + ExtractFunctionsConnectedToArg(arg, module); + for (auto& function_arg_info : connected_functions_to_arg) { + if (arg_sharding.hasValue()) break; + + const int function_argument_index = function_arg_info.argument_index; + auto& parsed_function = function_arg_info.func; + Block& parsed_function_block = + parsed_function.getBody().getBlocks().front(); + arg_sharding = ParseInputSharding( + parsed_function_block.getArgument(function_argument_index)); + } + } if (arg_sharding) { - sharding_for_args[index] = arg_sharding.getValue(); - func.setArgAttr(index, kShardingAttr, - builder->getStringAttr(arg_sharding.getValue())); + sharding_for_args[arg_index_to_tpu_computation] = arg_sharding.getValue(); + cluster_function.setArgAttr( + arg_index_to_tpu_computation, kShardingAttr, + builder->getStringAttr(arg_sharding.getValue())); } else { - func.setArgAttr(index, kShardingAttr, logical_core_0_sharding_attr); + cluster_function.setArgAttr( + arg_index_to_tpu_computation, kShardingAttr, + builder->getStringAttr(logical_core_0_sharding)); } } - launch_func.setAttr(tensorflow::kInputShardingAttr, - builder->getStrArrayAttr(sharding_for_args)); + cluster_func_op.setAttr(tensorflow::kInputShardingAttr, + builder->getStrArrayAttr(sharding_for_args)); +} + +// Parses XlaSharding op directly connected from the outputs of the +// `cluster_func` and extract sharding configurations for outputs. +void IdentifyXlaShardingForComputationOutputs( + StringRef logical_core_0_sharding, FuncOp func, + tf_device::ClusterFuncOp cluster_func, Builder* builder) { // By default return values from logical core 0 is used if no sharding // configuration is defined. - Operation* terminator = func_entry_block.getTerminator(); + Block& function_block = func.getBody().getBlocks().front(); + Operation* terminator = function_block.getTerminator(); llvm::SmallVector sharding_for_rets( terminator->getNumOperands(), logical_core_0_sharding); // Iterate through operands of the terminator. 
If the preceding op is // XlaShardingOp, then the provided sharding configuration is added to the - // tf_device.LaunchFunc as an attribute and the function as a result + // tf_device.ClusterFunc as an attribute and the function as a result // attribute. for (auto& ret : terminator->getOpOperands()) { const int index = ret.getOperandNumber(); @@ -170,17 +263,39 @@ void IdentifyXlaShardingForTPUComputation(Builder* builder, func.setResultAttr(index, kShardingAttr, builder->getStringAttr(ret_sharding.getValue())); } else { - func.setResultAttr(index, kShardingAttr, logical_core_0_sharding_attr); + func.setResultAttr(index, kShardingAttr, + builder->getStringAttr(logical_core_0_sharding)); } } - launch_func.setAttr(tensorflow::kOutputShardingAttr, - builder->getStrArrayAttr(sharding_for_rets)); + cluster_func.setAttr(tensorflow::kOutputShardingAttr, + builder->getStrArrayAttr(sharding_for_rets)); +} + +// Extracts input/output sharding configuration of `cluster_func` by parsing +// XlaSharding ops inside the `cluster_func`. +void IdentifyXlaShardingForTPUComputation( + Builder* builder, tf_device::ClusterFuncOp cluster_func) { + // Look up function definition from module. + FuncOp func = cluster_func.getParentOfType().lookupSymbol( + cluster_func.func()); + + // By default inputs/outputs have maximal sharding and are assigned to logical + // core 0 if no sharding is defined. + const std::string logical_core_0_sharding = + xla::sharding_builder::AssignDevice(0).SerializeAsString(); + + IdentifyXlaShardingForComputationInputs(logical_core_0_sharding, cluster_func, + func, builder); + + IdentifyXlaShardingForComputationOutputs(logical_core_0_sharding, func, + cluster_func, builder); } void TPUShardingIdentificationPass::runOnOperation() { Builder builder(getOperation().getContext()); - getOperation().walk([&](tf_device::LaunchFuncOp launch_func) { - IdentifyXlaShardingForTPUComputation(&builder, launch_func); + + getOperation().walk([&](tf_device::ClusterFuncOp cluster_func) { + IdentifyXlaShardingForTPUComputation(&builder, cluster_func); }); } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_variable_runtime_reformatting.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_variable_runtime_reformatting.cc index a6ea26b1ebf..9e8745918e3 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_variable_runtime_reformatting.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_variable_runtime_reformatting.cc @@ -38,7 +38,6 @@ limitations under the License. #include "mlir/IR/Value.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassRegistry.h" // from @llvm-project -#include "mlir/Support/STLExtras.h" // from @llvm-project #include "mlir/Transforms/RegionUtils.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" @@ -347,11 +346,9 @@ TF::WhileOp AddStateVarsToWhileOp(TF::WhileOp while_op, FuncOp body, if (new_while_op.output_shapes().size() != 0) { auto new_output_shapes = llvm::to_vector<4>(new_while_op.output_shapes()); // VarHandleOp is a scalar shape resource. 
- tensorflow::TensorShapeProto scalar; - scalar.set_unknown_rank(false); for (int64_t i = 0; i < state_vars.size(); ++i) { - new_output_shapes.push_back(builder.getStringAttr( - tensorflow::mangling_util::MangleShape(scalar))); + new_output_shapes.push_back( + mlir::TF::ShapeAttr::get(builder.getContext(), ArrayRef())); } new_while_op.setAttr("output_shapes", builder.getArrayAttr(new_output_shapes)); @@ -570,7 +567,11 @@ void TPUVariableRuntimeReformattingPass::runOnOperation() { replicate = nullptr; return WalkResult::interrupt(); }); - if (replicate) HandleReplicateOp(while_op, replicate, &getContext()); + // Model parallelism is not supported, and can be detected when a + // `tf_device.parallel_execute` op in the `tf_device.replicate` is present. + if (replicate && + replicate.GetBody().getOps().empty()) + HandleReplicateOp(while_op, replicate, &getContext()); }); } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/unroll_batch_matmul.cc b/tensorflow/compiler/mlir/tensorflow/transforms/unroll_batch_matmul.cc index 4f852af47e5..ceb2d86899b 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/unroll_batch_matmul.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/unroll_batch_matmul.cc @@ -31,7 +31,6 @@ limitations under the License. #include "mlir/IR/PatternMatch.h" // from @llvm-project #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project -#include "mlir/Support/Functional.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" diff --git a/tensorflow/compiler/mlir/tensorflow/translate/breakup-islands.cc b/tensorflow/compiler/mlir/tensorflow/translate/breakup-islands.cc index 510337b54cd..3245e3b9e6a 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/breakup-islands.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/breakup-islands.cc @@ -26,7 +26,6 @@ limitations under the License. #include "mlir/IR/Value.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassRegistry.h" // from @llvm-project -#include "mlir/Support/STLExtras.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" @@ -114,7 +113,7 @@ void BreakUpIslands::runOnFunction() { state.addOperands(operands); Operation* new_op = builder.createOperation(state); item.replaceAllUsesWith(new_op); - new_op->setAttrs(item.getAttrList()); + new_op->setAttrs(item.getMutableAttrDict()); item.erase(); } } @@ -220,7 +219,7 @@ void BreakUpIslands::BreakUpIsland( } // Skip islands that are already only a single op. 
- if (has_single_element(island_body)) return; + if (hasSingleElement(island_body)) return; auto control_type = tf_executor::ControlType::get(&getContext()); auto island_control_inputs = llvm::to_vector<4>(island_op.controlInputs()); diff --git a/tensorflow/compiler/mlir/tensorflow/translate/control_to_executor_dialect.cc b/tensorflow/compiler/mlir/tensorflow/translate/control_to_executor_dialect.cc index b5ebd45936a..9aeaa0ba318 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/control_to_executor_dialect.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/control_to_executor_dialect.cc @@ -167,7 +167,7 @@ void ControlToExecutorDialectConversion::runOnFunction() { op.getResult(0).replaceAllUsesWith(replacement->getResult(0)); for (int i : llvm::seq(1, op.getNumResults())) op.getResult(i).replaceAllUsesWith(replacement->getResult(i + 1)); - replacement->setAttrs(op.getAttrList()); + replacement->setAttrs(op.getMutableAttrDict()); op.erase(); continue; } else if (op.getName().getStringRef() == "_tf.NextIteration.sink") { @@ -177,7 +177,7 @@ void ControlToExecutorDialectConversion::runOnFunction() { frame_name_to_loop[frame.getValue()]; replacement = builder.create( loc, srcOp.token(), operands, ArrayRef{}); - replacement->setAttrs(op.getAttrList()); + replacement->setAttrs(op.getMutableAttrDict()); op.erase(); continue; } else if (op.getName().getStringRef() == "_tf.LoopCond") { @@ -220,7 +220,7 @@ void ControlToExecutorDialectConversion::runOnFunction() { // Create the operation inside the island OpBuilder island_builder = OpBuilder::atBlockEnd(&island.GetBody()); Operation *inner_op = island_builder.createOperation(result); - inner_op->setAttrs(op.getAttrList()); + inner_op->setAttrs(op.getMutableAttrDict()); // Add the terminator for the island SmallVector ret_vals(inner_op->getResults()); @@ -230,7 +230,7 @@ void ControlToExecutorDialectConversion::runOnFunction() { // Copy the attributes from the original operation to the replacement and // remap the results. if (!isa(replacement)) - replacement->setAttrs(op.getAttrList()); + replacement->setAttrs(op.getMutableAttrDict()); for (int i : llvm::seq(0, op.getNumResults())) op.getResult(i).replaceAllUsesWith(replacement->getResult(i)); op.erase(); diff --git a/tensorflow/compiler/mlir/tensorflow/translate/executor_to_control_dialect.cc b/tensorflow/compiler/mlir/tensorflow/translate/executor_to_control_dialect.cc index 7d0b75006a7..481f1fac7b8 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/executor_to_control_dialect.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/executor_to_control_dialect.cc @@ -136,7 +136,7 @@ void ExecutorToControlDialectConversion::runOnFunction() { // Create the replacement operation. auto *replacement = builder.createOperation(state); - replacement->setAttrs(wrapped_op.getAttrList()); + replacement->setAttrs(wrapped_op.getMutableAttrDict()); for (auto ops_and_ret_vals : llvm::zip(wrapped_op.getResults(), replacement->getResults())) @@ -208,7 +208,7 @@ void ExecutorToControlDialectConversion::runOnFunction() { // Create the replacement operation. 
auto *replacement = builder.createOperation(state); - replacement->setAttrs(op.getAttrList()); + replacement->setAttrs(op.getMutableAttrDict()); if (auto next_iteration = dyn_cast(op)) { diff --git a/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc b/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc index 2a349988084..75fcede8fbb 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc @@ -143,7 +143,7 @@ Status HasSingleGraphSingleOpIslandsFunctions(mlir::ModuleOp module) { return mlir::WalkResult::interrupt(); } - if (!has_single_element(block)) { + if (!hasSingleElement(block)) { status = errors::FailedPrecondition( kInvalidExecutorGraphMsg, "function does not only contain a single tf_executor.graph."); @@ -236,7 +236,6 @@ class Exporter { typedef absl::InlinedVector NodeVector; absl::flat_hash_map returns_; const mlir::Dialect* tf_dialect_; - llvm::DenseSet to_delete_; }; StatusOr> Exporter::GetArgumentNode( @@ -252,6 +251,10 @@ StatusOr> Exporter::GetArgumentNode( node_def->set_op(FunctionLibraryDefinition::kArgOp); + TF_RETURN_IF_ERROR(SetShapeAttribute("_output_shapes", + arg.getType().cast(), + node_def->mutable_attr())); + DataType dtype; TF_RETURN_IF_ERROR(ConvertToDataType( arg.getType().cast().getElementType(), &dtype)); @@ -418,59 +421,11 @@ bool IsEntryFunctionArg(BlockArgument arg) { // name will be used instead of generating a unique name. Status Exporter::AddArgumentNode(BlockArgument arg, unsigned index, llvm::StringRef name) { - if (!IsEntryFunctionArg(arg) || !name.empty()) { - TF_ASSIGN_OR_RETURN(auto node_def, GetArgumentNode(arg, index, name)); - Status status; - Node* node = graph_->AddNode(*node_def, &status); - TF_RETURN_IF_ERROR(status); - args_[arg] = node; - return status; - } - - // If it is an argument from the "main" function, it has only one user, which - // is an input node. We recover the original input node and skip adding the - // argument node. The new input node will be handled as normal in the - // following steps. - if (!arg.hasOneUse()) { - return errors::FailedPrecondition( - "Arg in 'main' should only have one user."); - } - auto* input = *arg.user_begin(); - auto* parent = input->getParentOp(); - auto island = llvm::dyn_cast_or_null(parent); - if (!island) - return errors::FailedPrecondition( - "User of arg in 'main' must be in an inner op of a " - "tf_executor.island."); - - if (!island.control().use_empty()) - return errors::FailedPrecondition( - "tf_executor.island of user of arg in 'main' must have no control " - "output users."); - - auto input_name = input->getName().getStringRef(); - input_name.consume_back(".input"); - - mlir::OpBuilder builder(island.getContext()); - builder.setInsertionPointToStart(&island.GetBody()); - auto loc = mlir::NameLoc::get( - builder.getIdentifier(op_to_name_.GetUniqueName(input)), - builder.getContext()); - OperationState state(loc, input_name.str()); - state.attributes.append(input->getAttrs().begin(), input->getAttrs().end()); - for (auto op : input->getOperands()) { - // Skip the argument in the new operation. - if (op.isa()) continue; - state.operands.push_back(op); - } - state.types.append(input->getResultTypes().begin(), - input->getResultTypes().end()); - auto* inst = builder.createOperation(state); - // If it is one of the specified input names, then the new instruction should - // have the same name. 
- op_to_name_.InitOpName(inst, op_to_name_.GetUniqueName(input)); - input->replaceAllUsesWith(inst); - to_delete_.insert(input); + TF_ASSIGN_OR_RETURN(auto node_def, GetArgumentNode(arg, index, name)); + Status status; + Node* node = graph_->AddNode(*node_def, &status); + TF_RETURN_IF_ERROR(status); + args_[arg] = node; return Status::OK(); } @@ -520,9 +475,6 @@ StatusOr> Exporter::Convert( absl::flat_hash_set* control_ret_nodes) { mlir::Block& block = function.front(); - // Determine if _Arg and _Retval nodes should use input and output names. - bool graph_as_function = false; - // Extract input & output names if set. llvm::SmallVector input_names; llvm::SmallVector output_names; @@ -537,7 +489,6 @@ StatusOr> Exporter::Convert( input_names, ',', /*MaxSplit=*/-1, /*KeepEmpty=*/false); dict_attr.get("outputs").cast().getValue().split( output_names, ',', /*MaxSplit=*/-1, /*KeepEmpty=*/false); - graph_as_function = configs.graph_as_function; } auto graph = absl::make_unique(OpRegistry::Global()); @@ -565,38 +516,24 @@ StatusOr> Exporter::Convert( << ") != terminator operands (" << num_data_results << ")"; llvm::DenseMap output_op_to_name; llvm::StringMap name_to_op; - for (auto it : llvm::enumerate(graph_op.GetFetch().getOperands())) { + for (const auto& it : llvm::enumerate(graph_op.GetFetch().getOperands())) { // Skip control rets. if (it.index() >= num_data_results) break; - // If there is a result index specified, ensure only one and that it - // matches the result index of the op. - auto result = it.value().cast(); + // TODO(jpienaar): If there is a result index specified, ensure only one + // and that it matches the result index of the op. std::string orig_name(output_names[it.index()]); auto tensor_id = ParseTensorName(orig_name); auto name = LegalizeNodeName( llvm::StringRef(tensor_id.node().data(), tensor_id.node().size())); - if (graph_as_function) { - // Ensure name does not get reused. - (void)exporter.op_to_name_.GetUniqueName(name); - continue; - } - - Operation* defining_op = GetIslandInnerOpOrSelf(result.getDefiningOp()); - if (output_op_to_name.insert({defining_op, name}).second) { - TF_RET_CHECK(name_to_op.insert({name, defining_op}).second) - << "multiple operations associated with the same name"; - exporter.op_to_name_.InitOpName(defining_op, name); - } else { - TF_RET_CHECK(output_op_to_name[defining_op] == name) - << "associating multiple names with the same op not supported"; - } + // Ensure name does not get reused. + (void)exporter.op_to_name_.GetUniqueName(name); } } if (!input_names.empty()) { TF_RET_CHECK(input_names.size() == block.getNumArguments()); - for (auto it : llvm::enumerate(function.getArguments())) { + for (const auto& it : llvm::enumerate(function.getArguments())) { // TODO(lyandy): Update when changing feed/fetch import. std::string orig_name(input_names[it.index()]); std::string name = LegalizeNodeName(orig_name); @@ -605,14 +542,8 @@ StatusOr> Exporter::Convert( << "input port designation not supported"; // Only assign user of argument the input name if the main graph did not // have its _Arg nodes lifted into the functions arguments. - if (graph_as_function) { - // Ensure name does not get reused. - (void)exporter.op_to_name_.GetUniqueName(name); - } else { - Operation* defining_op = - GetIslandInnerOpOrSelf(*it.value().user_begin()); - exporter.op_to_name_.InitOpName(defining_op, name); - } + // Ensure name does not get reused. 
+ (void)exporter.op_to_name_.GetUniqueName(name); } } @@ -628,8 +559,7 @@ StatusOr> Exporter::Convert( } TF_RETURN_IF_ERROR(exporter.AddArgumentNode( - arg, index, - graph_as_function && !input_names.empty() ? input_names[index] : "")); + arg, index, !input_names.empty() ? input_names[index] : "")); } auto convert_called_function = [&](llvm::StringRef name) { @@ -659,10 +589,7 @@ StatusOr> Exporter::Convert( // tf_executor.NextIteration.Sink will be used instead. continue; } else if (auto fetch = llvm::dyn_cast(inst)) { - TF_RETURN_IF_ERROR(exporter.AddFetchNode( - function, fetch, - graph_as_function ? output_names - : llvm::ArrayRef())); + TF_RETURN_IF_ERROR(exporter.AddFetchNode(function, fetch, output_names)); } else if (auto island = llvm::dyn_cast(inst)) { Operation& inner_op = island.GetBody().front(); @@ -698,12 +625,6 @@ StatusOr> Exporter::Convert( TF_RETURN_IF_ERROR( exporter.GetControlRetNodes(graph_op.GetFetch(), control_ret_nodes)); - // Delete replaced arguments ops. - // Note: This is done afterwards to avoid the ops created above from reusing a - // memory location of an op to which a mapping has already been assigned. - // TODO(jpienaar): Remove this need. - for (auto it : exporter.to_delete_) it->erase(); - return graph; } diff --git a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc index 851eb03edac..bd63a3b224f 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc @@ -19,6 +19,7 @@ limitations under the License. #include #include #include +#include #include #include @@ -39,10 +40,10 @@ limitations under the License. #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" +#include "llvm/ADT/StringSet.h" #include "llvm/ADT/Twine.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/raw_ostream.h" -#include "mlir/Analysis/Verifier.h" // from @llvm-project #include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project @@ -56,13 +57,17 @@ limitations under the License. #include "mlir/IR/OperationSupport.h" // from @llvm-project #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/IR/Verifier.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project #include "tensorflow/compiler/jit/shape_inference_helpers.h" #include "tensorflow/compiler/mlir/op_or_arg_name_mapper.h" #include "tensorflow/compiler/mlir/tensorflow/ir/control_flow_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" #include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h" #include "tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h" #include "tensorflow/compiler/mlir/tensorflow/utils/convert_type.h" @@ -72,6 +77,7 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/functionalize_control_flow.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/core/common_runtime/function.h" +#include "tensorflow/core/common_runtime/graph_constructor.h" #include "tensorflow/core/common_runtime/shape_refiner.h" #include "tensorflow/core/framework/attr_value.pb.h" #include "tensorflow/core/framework/function.pb.h" @@ -87,7 +93,6 @@ limitations under the License. #include "tensorflow/core/framework/versions.pb.h" #include "tensorflow/core/graph/algorithm.h" #include "tensorflow/core/graph/graph.h" -#include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/graph/node_builder.h" #include "tensorflow/core/graph/tensor_id.h" #include "tensorflow/core/grappler/utils/transitive_fanin.h" @@ -107,6 +112,10 @@ static inline absl::string_view StringRefToView(llvm::StringRef ref) { } namespace tensorflow { +using mlir::NamedAttrList; +using mlir::TensorType; +using mlir::TF::VarHandleOp; +using mlir::tf_saved_model::GlobalTensorOp; using stream_executor::port::StatusOr; namespace { @@ -224,32 +233,42 @@ class ImporterBase { // Returns the inferred input type at index `idx` of the `node` in the // context. - StatusOr InferInputType(const Node& node, int idx, - mlir::Builder builder); + StatusOr InferInputType(const Node& node, int idx, + mlir::Builder builder); // Returns the inferred output type at index `idx` of the `node` in the // context. - StatusOr InferOutputType(const Node& node, int idx, - mlir::Builder builder); + StatusOr InferOutputType(const Node& node, int idx, + mlir::Builder builder); private: // Most types with subtypes have only one subtype. - using ElementSubtypes = llvm::SmallVector; + using ElementSubtypes = llvm::SmallVector; // Adds all the ordered_nodes to the shape refiner shape_refiner_. Then all // data type and shape information is maintained by the shape_refiner_. - Status AddNodesToShapeRefiner(); + // TODO(jpienaar): Remove once shape inference on import is removed. + Status AddNodesToShapeRefiner( + std::unordered_map* node_name_map); + + // Prune nodes that do not feed into fetch nodes. + Status PruneUnreachableNodes( + std::unordered_map* node_name_map); + + // Converts feeds to Placeholder nodes. + Status ConvertFeedsToPlaceholders( + std::unordered_map* node_name_map); // Converts the inferred shape referred to by 'handle' in 'context', with // given element type, and returns an MLIR tensor type. - StatusOr ConvertDataTypeAndShape( + StatusOr ConvertDataTypeAndShape( DataType dtype, const shape_inference::ShapeHandle& handle, const std::vector* handle_subtypes, shape_inference::InferenceContext* context, mlir::Builder builder); // Converts the inferred shape referred to by 'handle' in 'context', with // given element type, and returns an MLIR tensor type. - StatusOr ConvertElementTypeAndShape( + StatusOr ConvertElementTypeAndShape( mlir::Type element_type, const shape_inference::ShapeHandle& handle, shape_inference::InferenceContext* context, mlir::Builder builder); @@ -264,6 +283,21 @@ class ImporterBase { return ::tensorflow::ConvertTensorProto(value, &builder_); } + // Converts the tensor shape proto into an MLIR shape attribute. 
+ StatusOr ConvertTensorShapeProto( + const TensorShapeProto& shape) { + if (shape.unknown_rank()) + return mlir::TF::ShapeAttr::get(builder_.getContext(), llvm::None); + + llvm::SmallVector dims; + dims.reserve(shape.dim().size()); + for (const auto& dim : shape.dim()) { + dims.push_back(dim.size()); + } + return mlir::TF::ShapeAttr::get(builder_.getContext(), + llvm::makeArrayRef(dims)); + } + // Converts func name in graphdef to mlir::SymbolRefAttribute. StatusOr ConvertFunctionCallName( const std::string& func_name); @@ -276,15 +310,15 @@ class ImporterBase { // AttrValue {name : foo, attrs : {k1 : bar, k2 : rfc}}, it will convert it to // a list of MLIR Attributes: [{base_name : foo}, {base_name.k1 : bar}, // {base_name.k2 : rfc}}. - Status ConvertFunctionCallAttribute( - const std::string& base_name, const AttrValue& value, - llvm::SmallVector* attributes); + Status ConvertFunctionCallAttribute(const std::string& base_name, + const AttrValue& value, + NamedAttrList* attributes); // Helper to create either a tf_executor operation or a TF operation wrapped // in an island. When convert_to_legacy_call is true, converts the operation // representing a call to a library function with a name represented in // node_type_name to LegacyCallOp. - mlir::Operation* createOperation( + mlir::Operation* CreateOperation( const Node& node, llvm::StringRef node_type_name, const mlir::OperationState& result, const llvm::SmallVectorImpl& control_operands, @@ -381,7 +415,10 @@ class ImporterBase { const GraphDebugInfo& debug_info_; llvm::StringRef function_name_for_debug_info_; NodeValueMap node_values_; - std::unique_ptr shape_refiner_; + // TODO(jpienaar): Remove once shape inference on import is removed. + // The shape_refinner_ will be nullptr if shape inference on import is + // not enabled. + std::unique_ptr shape_refiner_ = nullptr; NameUniquifier* function_name_uniquifier_; mlir::StatusScopedDiagnosticHandler error_handler_; @@ -639,8 +676,9 @@ Status ImporterBase::GetInputOutputNodes( return Status::OK(); } -// TODO(fengliuai): Replace the iterative algorithm by an one pass propagation -Status ImporterBase::AddNodesToShapeRefiner() { +// TODO(jpienaar): Remove this post shape inference on import flag is removed. +Status ImporterBase::AddNodesToShapeRefiner( + std::unordered_map* node_name_map) { shape_refiner_ = absl::make_unique(graph_->versions(), graph_->op_registry()); // Some operations (for example "TPUExecute") don't have shape inference @@ -650,7 +688,6 @@ Status ImporterBase::AddNodesToShapeRefiner() { shape_refiner_->set_function_library_for_shape_inference(&graph_flib_); TF_ASSIGN_OR_RETURN(auto feeds_by_node, GetFeedsByNode(specs_.inputs)); - auto node_name_map = graph_->BuildNodeNameIndex(); // First add all nodes to the refiner. for (Node* node : ordered_nodes_) { @@ -684,7 +721,7 @@ Status ImporterBase::AddNodesToShapeRefiner() { TF_ASSIGN_OR_RETURN( auto placeholder_node_and_removed, CreatePlaceholderNodeForFeed(array_info.shape, dtype, node, index, - node_name_map)); + *node_name_map)); Node* placeholder_node = placeholder_node_and_removed.first; if (placeholder_node_and_removed.second) { @@ -693,7 +730,7 @@ Status ImporterBase::AddNodesToShapeRefiner() { node_added_to_shape_refiner = true; } remapped_feeds_[{it->first, index}] = placeholder_node->name(); - node_name_map[placeholder_node->name()] = placeholder_node; + (*node_name_map)[placeholder_node->name()] = placeholder_node; // Add the new placeholder node to the shape refiner. 
Status status = shape_refiner_->AddNode(placeholder_node); if (!status.ok()) { @@ -787,7 +824,7 @@ Status ImporterBase::AddNodesToShapeRefiner() { // Prune nodes in the graph that are not reachable from the output. if (specs_.prune_unused_nodes) { std::unordered_set prune_start; - TF_RETURN_IF_ERROR(GetInputOutputNodes(node_name_map, &prune_start)); + TF_RETURN_IF_ERROR(GetInputOutputNodes(*node_name_map, &prune_start)); if (!prune_start.empty()) { if (PruneForReverseReachability(graph_.get(), prune_start)) { VLOG(1) << "Pruned unused nodes in graphdef"; @@ -872,30 +909,125 @@ Status ImporterBase::AddNodesToShapeRefiner() { return Status::OK(); } -StatusOr ImporterBase::InferInputType(const Node& node, - int idx, - mlir::Builder builder) { - ExtendedInferenceContext* shape_context = - shape_refiner_->GetExtendedContext(&node); - DataType dtype = shape_context->input_type(idx); - auto* context = shape_context->get_context(); - return ConvertDataTypeAndShape(dtype, context->input(idx), - context->input_handle_shapes_and_types(idx), - context, builder); +StatusOr ImporterBase::InferInputType(const Node& node, int idx, + mlir::Builder builder) { + if (specs_.enable_shape_inference) { + // TODO(jpienaar): Remove this if shape inference on import flag is removed. + ExtendedInferenceContext* shape_context = + shape_refiner_->GetExtendedContext(&node); + DataType dtype = shape_context->input_type(idx); + auto* context = shape_context->get_context(); + return ConvertDataTypeAndShape(dtype, context->input(idx), + context->input_handle_shapes_and_types(idx), + context, builder); + } + DataType dtype = node.properties()->input_types[idx]; + mlir::Type element_type; + TF_RETURN_IF_ERROR(ConvertDataType(dtype, builder, &element_type)); + return mlir::UnrankedTensorType::get(element_type); } -StatusOr ImporterBase::InferOutputType( - const Node& node, int idx, mlir::Builder builder) { - ExtendedInferenceContext* shape_context = - shape_refiner_->GetExtendedContext(&node); - DataType dtype = shape_context->output_type(idx); - auto* context = shape_context->get_context(); - return ConvertDataTypeAndShape(dtype, context->output(idx), - context->output_handle_shapes_and_types(idx), - context, builder); +StatusOr ImporterBase::InferOutputType(const Node& node, int idx, + mlir::Builder builder) { + DataType dtype = node.properties()->output_types[idx]; + + // Returns output type given inference context. + auto shape_ic = [&](shape_inference::InferenceContext* c) { + return ConvertDataTypeAndShape(dtype, c->output(idx), + c->output_handle_shapes_and_types(idx), c, + builder); + }; + + if (specs_.enable_shape_inference) { + // TODO(jpienaar): Remove this if shape inference on import flag is removed. + ExtendedInferenceContext* shape_context = + shape_refiner_->GetExtendedContext(&node); + return shape_ic(shape_context->get_context()); + } + + // Treat TensorList init ops specially here as the op requires knowing its + // element dtype. + // TODO(jpienaar): Reconsider post refactoring shape functions. + if (node.type_string() == "TensorListReserve" || + node.type_string() == "EmptyTensorList") { + mlir::Type etype; + if (auto element_dtype = node.attrs().Find("element_dtype")) { + TF_RETURN_IF_ERROR( + ConvertDataType(element_dtype->type(), builder, &etype)); + } + return mlir::RankedTensorType::get( + {}, mlir::TF::VariantType::get({mlir::UnrankedTensorType::get(etype)}, + etype.getContext())); + } + + // Returns a simple, more conservative unranked tensor type. 
+ auto default_type = [&]() -> StatusOr { + mlir::Type element_type; + TF_RETURN_IF_ERROR(ConvertDataType(dtype, builder, &element_type)); + return mlir::UnrankedTensorType::get(element_type); + }; + + // Below we only try and do some shape inference for "source" ops which have + // no inputs. + if (node.num_inputs() > 0) return default_type(); + + // Do some simply inference here to get the function arguments correct for + // this common case. + // TODO(jpienaar): Reconsider post refactoring shape functions. + if (node.IsArg()) { + if (dtype == DT_RESOURCE) { + const AttrValue* dtype_attr = node.attrs().Find("_handle_dtypes"); + const AttrValue* shape_attr = node.attrs().Find("_handle_shapes"); + if (dtype_attr && shape_attr) { + if (dtype_attr->list().type().empty()) { + return errors::InvalidArgument( + "Invalid \"_handle_dtypes\" attribute value for _Arg node: ", + shape_attr->DebugString()); + } + if (shape_attr->list().shape().empty()) { + return errors::InvalidArgument( + "Invalid \"_handle_shapes\" attribute value for _Arg node: ", + shape_attr->DebugString()); + } + DataType dtype = dtype_attr->list().type(0); + const TensorShapeProto& shape_proto = shape_attr->list().shape(0); + TF_ASSIGN_OR_RETURN( + auto etype, ConvertToMlirTensorType(shape_proto, dtype, &builder)); + return mlir::UnrankedTensorType::get(mlir::TF::ResourceType::get( + {etype.cast()}, builder.getContext())); + } else { + return mlir::UnrankedTensorType::get( + mlir::TF::ResourceType::get(builder.getContext())); + } + } else if (auto shape = node.attrs().Find("_output_shapes")) { + if (shape->has_list() && shape->list().shape_size() == 1) { + return ConvertToMlirTensorType(shape->list().shape().at(0), dtype, + &builder); + } + } + } + + const tensorflow::OpRegistrationData* op_reg_data; + TF_RETURN_IF_ERROR( + graph_->op_registry()->LookUp(node.type_string(), &op_reg_data)); + if (!op_reg_data) { + DVLOG(1) << "Skipping inference for unregistered op " << node.type_string(); + return default_type(); + } + if (op_reg_data->shape_inference_fn == nullptr) { + DVLOG(1) << "Skipping inference for op without shape function " + << node.type_string(); + return default_type(); + } + shape_inference::InferenceContext c(graph_->versions().producer(), + node.attrs(), op_reg_data->op_def, + std::vector{}, {}, + /*input_tensors_as_shapes=*/{}, {}); + TF_RETURN_IF_ERROR(c.Run(op_reg_data->shape_inference_fn)); + return shape_ic(&c); } -StatusOr ImporterBase::ConvertDataTypeAndShape( +StatusOr ImporterBase::ConvertDataTypeAndShape( DataType dtype, const shape_inference::ShapeHandle& handle, const std::vector* handle_subtypes, shape_inference::InferenceContext* context, mlir::Builder builder) { @@ -914,7 +1046,7 @@ StatusOr ImporterBase::ConvertDataTypeAndShape( return ConvertElementTypeAndShape(element_type, handle, context, builder); } -StatusOr ImporterBase::ConvertElementTypeAndShape( +StatusOr ImporterBase::ConvertElementTypeAndShape( mlir::Type element_type, const shape_inference::ShapeHandle& handle, shape_inference::InferenceContext* context, mlir::Builder builder) { if (!context->RankKnown(handle)) { @@ -952,7 +1084,7 @@ StatusOr ImporterBase::ConvertSubtypes( mlir::Type element_type; TF_RETURN_IF_ERROR( ::tensorflow::ConvertDataType(subtype.dtype, builder, &element_type)); - TF_ASSIGN_OR_RETURN(mlir::TensorType type, + TF_ASSIGN_OR_RETURN(TensorType type, ConvertElementTypeAndShape(element_type, subtype.shape, context, builder)); subtypes.push_back(type); @@ -960,9 +1092,9 @@ StatusOr ImporterBase::ConvertSubtypes( 
return subtypes; } -Status ImporterBase::ConvertFunctionCallAttribute( - const std::string& base_name, const AttrValue& value, - llvm::SmallVector* attributes) { +Status ImporterBase::ConvertFunctionCallAttribute(const std::string& base_name, + const AttrValue& value, + NamedAttrList* attributes) { TF_ASSIGN_OR_RETURN(auto func_attr, ConvertFunctionCallName(value.func().name())); attributes->push_back(builder_.getNamedAttr(base_name, func_attr)); @@ -1000,7 +1132,7 @@ StatusOr ImporterBase::ConvertAttributeValue( return mlir::TypeAttr::get(type); } case AttrValue::kShape: - return builder_.getStringAttr(mangling_util::MangleShape(value.shape())); + return ConvertTensorShapeProto(value.shape()); case AttrValue::kTensor: return ConvertTensorProto(value.tensor()); case AttrValue::kList: { @@ -1014,12 +1146,13 @@ StatusOr ImporterBase::ConvertAttributeValue( for (const auto& item : value.list().b()) attrs.push_back(builder_.getBoolAttr(item)); for (const auto& item : value.list().type()) { - attrs.push_back(builder_.getStringAttr( - mangling_util::MangleDataType(static_cast(item)))); + mlir::Type type; + TF_RETURN_IF_ERROR(ConvertDataType(DataType(item), builder_, &type)); + attrs.push_back(mlir::TypeAttr::get(type)); } for (const auto& item : value.list().shape()) { - attrs.push_back( - builder_.getStringAttr(mangling_util::MangleShape(item))); + TF_ASSIGN_OR_RETURN(auto attr, ConvertTensorShapeProto(item)); + attrs.push_back(attr); } for (const auto& item : value.list().tensor()) { TF_ASSIGN_OR_RETURN(auto attr, ConvertTensorProto(item)); @@ -1035,8 +1168,18 @@ StatusOr ImporterBase::ConvertAttributeValue( return builder_.getArrayAttr( llvm::makeArrayRef(attrs.begin(), attrs.end())); } - case AttrValue::kFunc: - return errors::Unknown("kFunc type should be handled separately!"); + case AttrValue::kFunc: { + // TODO(b/156546237): Unify kFunc/NameAttrList attribute representation. + // Currently kFunc/NameAttrList attributes in a kList/repeated AttrValue + // will not use this representation. + NamedAttrList attrs; + for (const auto& func_attr : value.func().attr()) { + TF_ASSIGN_OR_RETURN(auto attr, ConvertAttributeValue(func_attr.second)); + attrs.push_back(builder_.getNamedAttr(func_attr.first, attr)); + } + auto func_attrs = builder_.getDictionaryAttr(attrs); + return mlir::TF::FuncAttr::get(context_, value.func().name(), func_attrs); + } case AttrValue::VALUE_NOT_SET: return builder_.getUnitAttr(); // kPlaceholder is not implemented. @@ -1090,7 +1233,7 @@ Status ImporterBase::ConvertLibFunction(llvm::StringRef func_name) { TF_RETURN_IF_ERROR( FunctionDefToBodyHelper(*func_def, AttrSlice(), &func_lib, &fbody)); - // Converts the argument and return types to mlir types. + // Converts the argument and return types to MLIR types. absl::InlinedVector attributes; attributes.reserve(func_def->attr_size()); for (const auto& name_and_value : func_def->attr()) { @@ -1126,6 +1269,7 @@ Status ImporterBase::ConvertLibFunction(llvm::StringRef func_name) { // We populate the NodeSpec so that all the _Arg ops get their shape // added correctly. 
GraphImportConfig specs; + specs.enable_shape_inference = specs_.enable_shape_inference; for (const auto& name_and_value : func_def->attr()) { if (name_and_value.first == "_input_shapes") { auto& list = name_and_value.second.list(); @@ -1167,9 +1311,96 @@ Status ImporterBase::ConvertLibFunction(llvm::StringRef func_name) { return Status::OK(); } +Status ImporterBase::PruneUnreachableNodes( + std::unordered_map* node_name_map) { + std::unordered_set prune_start; + TF_RETURN_IF_ERROR(GetInputOutputNodes(*node_name_map, &prune_start)); + + if (!prune_start.empty()) { + if (PruneForReverseReachability(graph_.get(), prune_start)) { + VLOG(1) << "Pruned unused nodes in graphdef"; + } else { + VLOG(1) << "No unused nodes in graphdef to prune"; + } + } else { + VLOG(1) << "No output nodes specified, skipping pruning"; + } + return Status::OK(); +} + +Status ImporterBase::ConvertFeedsToPlaceholders( + std::unordered_map* node_name_map) { + // Feeds (edges) are converted into single-output placeholder nodes to + // simplify the conversion process. + TF_ASSIGN_OR_RETURN(auto feeds_by_node, GetFeedsByNode(specs_.inputs)); + for (const auto& it : feeds_by_node) { + TensorId tensor = ParseTensorName(it.first); + auto jt = node_name_map->find(std::string(tensor.node())); + if (jt == node_name_map->end()) { + return errors::FailedPrecondition( + absl::StrCat("Graph does not contain node: ", tensor.node())); + } + + Node* node = jt->second; + auto op_name = node->op_def().name(); + if (op_name != "Placeholder" && op_name != "LegacyFedInput" && + op_name != FunctionLibraryDefinition::kArgOp) { + for (const auto& output_tensor : it.second) { + const int index = output_tensor.first; + const ArrayInfo& array_info = output_tensor.second->second; + + DataType dtype = array_info.imported_dtype; + // Uses the existing output type if it isn't specified by the user. + if (dtype == DT_INVALID) { + dtype = node->output_type(index); + } + + TF_ASSIGN_OR_RETURN( + auto placeholder_node_and_removed, + CreatePlaceholderNodeForFeed(array_info.shape, dtype, node, index, + *node_name_map)); + + Node* placeholder_node = placeholder_node_and_removed.first; + if (placeholder_node->in_edges().empty()) { + graph_->AddControlEdge(graph_->source_node(), placeholder_node, + true /* skip test for duplicates */); + } + if (placeholder_node->out_edges().empty()) { + graph_->AddControlEdge(placeholder_node, graph_->sink_node(), + true /* skip test for duplicates */); + } + remapped_feeds_[{it.first, index}] = placeholder_node->name(); + (*node_name_map)[placeholder_node->name()] = placeholder_node; + } + } + } + return Status::OK(); +} + Status ImporterBase::PrepareConvert(const Graph& graph) { TF_RETURN_IF_ERROR(RemoveBackedges(graph)); - TF_RETURN_IF_ERROR(AddNodesToShapeRefiner()); + + auto node_name_map = graph_->BuildNodeNameIndex(); + + if (specs_.enable_shape_inference) { + // TODO(jpienaar): Remove once infer shapes on import flag is removed. + TF_RETURN_IF_ERROR(AddNodesToShapeRefiner(&node_name_map)); + } else { + TF_RETURN_IF_ERROR(ConvertFeedsToPlaceholders(&node_name_map)); + } + + // Prune nodes in the graph that are not reachable from the output. + if (specs_.prune_unused_nodes) { + TF_RETURN_IF_ERROR(PruneUnreachableNodes(&node_name_map)); + } + + if (!specs_.enable_shape_inference) { + // Re-initialize ordered_nodes_ since we might have modified the graph. 
+ GetReversePostOrder( + *graph_, &ordered_nodes_, + [](const Node* n1, const Node* n2) { return n1->name() < n2->name(); }); + } + return Status::OK(); } @@ -1210,6 +1441,26 @@ Status ImporterBase::Convert( function.setArgAttr(entry.first, "tf.resource_arg_unique_id", builder_.getI64IntegerAttr(entry.second)); } + + // TODO(jpienaar): Update post removing shape_refinier_. + if (!specs_.enable_shape_inference) { + // Refine graph's type given more precise fetch. + auto fetch = graph.GetFetch(); + bool all_equal = true; + for (auto it : + llvm::zip_first(graph.getResults(), fetch.getOperandTypes())) { + auto rt = std::get<1>(it); + if (rt == std::get<0>(it).getType()) continue; + std::get<0>(it).setType(rt); + all_equal = false; + } + if (!all_equal) { + function.setType(mlir::FunctionType::get(func_type.getInputs(), + graph.getResultTypes(), + function.getContext())); + } + } + return Status::OK(); } @@ -1315,7 +1566,7 @@ mlir::Location ImporterBase::GetLocation(const NodeDef& node_def) { std::string name_for_name_loc = function_name.empty() ? name.str() : (name + "@" + function_name).str(); auto name_loc_id = mlir::Identifier::get(name_for_name_loc, context_); - const auto& location_it = debug_info.find(debug_info_key); + const auto location_it = debug_info.find(debug_info_key); if (location_it == debug_info.end()) { return mlir::NameLoc::get(name_loc_id, context_); } @@ -1389,7 +1640,7 @@ Status ImporterBase::EmitErrorWithLocationStr(const Node& node, return error_handler_.Combine(error_status); } -mlir::Operation* ImporterBase::createOperation( +mlir::Operation* ImporterBase::CreateOperation( const Node& node, llvm::StringRef node_type_name, const mlir::OperationState& result, const llvm::SmallVectorImpl& control_operands, @@ -1579,6 +1830,8 @@ Status ImporterBase::ConvertNode(const Node& node) { absl::c_stable_sort(in_edges, [](const Edge* e1, const Edge* e2) { if (e1->IsControlEdge() && !e2->IsControlEdge()) return false; if (!e1->IsControlEdge() && e2->IsControlEdge()) return true; + if (e1->IsControlEdge() && e2->IsControlEdge()) + return e1->src()->id() < e2->src()->id(); return e1->dst_input() < e2->dst_input(); }); @@ -1685,7 +1938,7 @@ Status ImporterBase::ConvertNode(const Node& node) { } // Register the mapping between the TF node and the newly created operation. - node_values_[node.id()] = createOperation( + node_values_[node.id()] = CreateOperation( node, node_type_name, result, control_operands, convert_to_legacy_call); return Status::OK(); } @@ -1765,13 +2018,43 @@ StatusOr ImporterBase::InferLibFunctionType( // MLIR function type signature. llvm::SmallVector arg_types; - arg_types.reserve(fbody.arg_types.size()); - for (auto arg : fbody.arg_nodes) { - // Find node in the graph using the node id instead of using `arg` directly - // because the graph has been cloned. - auto* node = graph_->FindNodeId(arg->id()); - TF_ASSIGN_OR_RETURN(auto type, InferOutputType(*node, /*idx=*/0, builder)); - arg_types.push_back(type); + if (specs_.inputs.empty()) { + arg_types.reserve(fbody.arg_types.size()); + for (auto arg : fbody.arg_nodes) { + // Find node in the graph using the node id instead of using `arg` + // directly because the graph has been cloned. 
+ auto* node = graph_->FindNodeId(arg->id()); + TF_ASSIGN_OR_RETURN(auto type, + InferOutputType(*node, /*idx=*/0, builder)); + arg_types.push_back(type); + } + } else { + arg_types.reserve(fbody.arg_types.size()); + for (const auto& it : llvm::enumerate(specs_.inputs)) { + mlir::Type element_type; + const auto& node_info = it.value().second; + DataType dtype = node_info.imported_dtype; + // Uses the existing output type of the arg node if the data type of the + // the node isn't specified through the import configuration. + if (dtype == DT_INVALID) { + auto arg = fbody.arg_nodes[it.index()]; + auto* node = graph_->FindNodeId(arg->id()); + dtype = node->output_type(0); + if (dtype == DT_INVALID) { + return errors::InvalidArgument("Input ", it.index(), + "has invalid data type"); + } + } + TF_RETURN_IF_ERROR( + ::tensorflow::ConvertDataType(dtype, builder, &element_type)); + if (node_info.shape.unknown_rank()) { + arg_types.push_back(mlir::UnrankedTensorType::get(element_type)); + } else { + llvm::SmallVector shape; + TF_RETURN_IF_ERROR(ConvertToMlirShape(node_info.shape, &shape)); + arg_types.push_back(mlir::RankedTensorType::get(shape, element_type)); + } + } } llvm::SmallVector ret_types; @@ -1885,13 +2168,13 @@ StatusOr GraphDefImporter::Convert( auto node_name = [&](const OutputTensor& tensor) { ss << tensor.node->name(); }; - mlir::interleave(arg_nodes, ss, node_name, ","); + llvm::interleave(arg_nodes, ss, node_name, ","); auto inputs = b.getNamedAttr("inputs", b.getStringAttr(ss.str())); s.clear(); - mlir::interleave(ret_nodes, ss, node_name, ","); + llvm::interleave(ret_nodes, ss, node_name, ","); auto outputs = b.getNamedAttr("outputs", b.getStringAttr(ss.str())); s.clear(); - mlir::interleave(specs.control_outputs, ss, ","); + llvm::interleave(specs.control_outputs, ss, ","); auto control_outputs = b.getNamedAttr("control_outputs", b.getStringAttr(ss.str())); @@ -1916,16 +2199,16 @@ StatusOr GraphDefImporter::Convert( mlir::Builder b(context); std::string s; llvm::raw_string_ostream ss(s); - mlir::interleave( + llvm::interleave( specs.inputs, ss, [&](const std::pair& v) { ss << v.first; }, ","); auto inputs = b.getNamedAttr("inputs", b.getStringAttr(ss.str())); s.clear(); - mlir::interleave(specs.outputs, ss, ","); + llvm::interleave(specs.outputs, ss, ","); auto outputs = b.getNamedAttr("outputs", b.getStringAttr(ss.str())); s.clear(); - mlir::interleave(specs.control_outputs, ss, ","); + llvm::interleave(specs.control_outputs, ss, ","); auto control_outputs = b.getNamedAttr("control_outputs", b.getStringAttr(ss.str())); @@ -2023,9 +2306,13 @@ StatusOr GraphDefImporter::InferMainFunctionType( } TF_RETURN_IF_ERROR( ::tensorflow::ConvertDataType(imported_dtype, builder, &element_type)); - llvm::SmallVector shape; - TF_RETURN_IF_ERROR(ConvertToMlirShape(node_info.shape, &shape)); - arg_types.push_back(mlir::RankedTensorType::get(shape, element_type)); + if (node_info.shape.unknown_rank()) { + arg_types.push_back(mlir::UnrankedTensorType::get(element_type)); + } else { + llvm::SmallVector shape; + TF_RETURN_IF_ERROR(ConvertToMlirShape(node_info.shape, &shape)); + arg_types.push_back(mlir::RankedTensorType::get(shape, element_type)); + } i++; } @@ -2154,8 +2441,8 @@ class SavedModelObjectGraphImporter : public ImporterBase { // Main entry point: converts all functions in the given meta graph to an MLIR // Module. 
static StatusOr Convert( - SavedModelV2Bundle* saved_model, mlir::MLIRContext* context, - absl::Span exported_names, bool add_default_attributes); + SavedModelV2Bundle* saved_model, absl::Span exported_names, + mlir::MLIRContext* context, bool add_default_attributes); private: explicit SavedModelObjectGraphImporter( @@ -2623,11 +2910,10 @@ void AdjustBoundInputArgTypes(mlir::ModuleOp module) { void SortSavedModelModule(mlir::ModuleOp module) { struct NamedGlobalTensor { llvm::StringRef name; - mlir::tf_saved_model::GlobalTensorOp global_tensor; + GlobalTensorOp global_tensor; }; llvm::SmallVector named_global_tensors; - for (auto global_tensor : - module.getOps()) { + for (auto global_tensor : module.getOps()) { auto exported_names = mlir::tf_saved_model::GetExportedNames(global_tensor); // We use stable_sort, so duplicate empty names are fine here. named_global_tensors.push_back( @@ -2818,7 +3104,7 @@ Status CreateSavedModelIR( TF_ASSIGN_OR_RETURN( auto type, ConvertToMlirTensorType(variable.shape(), variable.dtype(), &builder)); - auto op = builder.create( + auto op = builder.create( builder.getUnknownLoc(), builder.getStringAttr(object_names.GetSymbolTableName(node_id)), value_attr, @@ -2838,7 +3124,7 @@ Status CreateSavedModelIR( } TF_ASSIGN_OR_RETURN(auto value_attr, ConvertTensorProto(*value, &builder)); - auto op = builder.create( + auto op = builder.create( builder.getUnknownLoc(), builder.getStringAttr(object_names.GetSymbolTableName(node_id)), value_attr, @@ -2856,8 +3142,8 @@ Status CreateSavedModelIR( } StatusOr SavedModelObjectGraphImporter::Convert( - SavedModelV2Bundle* saved_model, mlir::MLIRContext* context, - absl::Span exported_names, bool add_default_attributes) { + SavedModelV2Bundle* saved_model, absl::Span exported_names, + mlir::MLIRContext* context, bool add_default_attributes) { GraphDebugInfo dummy_debug_info; const GraphDebugInfo& debug_info = saved_model->debug_info() ? *saved_model->debug_info() : dummy_debug_info; @@ -2934,17 +3220,20 @@ class SavedModelSignatureDefImporter { public: // Main entry point: converts all functions (specified by SignatureDefs) in // the given meta graph to an MLIR Module. - static StatusOr Convert(const SavedModelBundle& bundle, - mlir::MLIRContext* context) { - SavedModelSignatureDefImporter importer(bundle, context); + static StatusOr Convert( + const SavedModelBundle& bundle, absl::Span exported_names, + mlir::MLIRContext* context) { + SavedModelSignatureDefImporter importer(bundle, exported_names, context); return importer.ConvertSignatures(); } private: SavedModelSignatureDefImporter(const SavedModelBundle& bundle, + absl::Span exported_names, mlir::MLIRContext* context) : bundle_(bundle), + exported_names_(exported_names), module_(mlir::ModuleOp::create(mlir::UnknownLoc::get(context))) {} // Converts the SavedModel to the SavedModel dialect. Creates an MLIR function @@ -2959,19 +3248,25 @@ class SavedModelSignatureDefImporter { // Creates GlobalTensorOp for each variable and moves each VarHandle op to // the enclosing function's arguments. Status LiftVariables(); - // Moves the result of the VarHandleOp to the enclosing function's argument - // list and erases this VarHandleOp. - void LiftVariable(mlir::TF::VarHandleOp op); + + // Moves the result of the VarHandleOp with corresponding global tensor to the + // enclosing function's argument list and erases this VarHandleOp. The global + // tensor's shape is used to provide the most accurate nested shape. 
+ void LiftVariable(VarHandleOp op, GlobalTensorOp global_tensor); + + using VarGlobalMap = llvm::MapVector< + llvm::StringRef, + std::pair>>; // Reads all variables from the SavedModel through session and creates // GlobalTensorOp for these variables. - Status ReadVariablesFromSession( - const llvm::SmallVectorImpl& ops); + Status ReadVariablesFromSession(VarGlobalMap* var_globals); GraphImportConfig::InputArrays ParseInputArrays( const std::vector>& inputs); const SavedModelBundle& bundle_; + absl::Span exported_names_; mlir::OwningModuleRef module_; }; @@ -2987,6 +3282,9 @@ SavedModelSignatureDefImporter::ConvertSignatures() { GraphDebugInfo debug_info; if (bundle_.debug_info != nullptr) debug_info = *bundle_.debug_info; + llvm::StringSet<> exported_name_set; + exported_name_set.insert(exported_names_.begin(), exported_names_.end()); + for (const auto& key_and_signature_def : signatures) { const std::string& sig_def_key = key_and_signature_def.first; const SignatureDef& signature_def = key_and_signature_def.second; @@ -2996,6 +3294,10 @@ SavedModelSignatureDefImporter::ConvertSignatures() { if (sig_def_key == "__saved_model_init_op") { continue; } + if (!exported_name_set.empty() && + exported_name_set.count(sig_def_key) == 0) { + continue; + } TF_RETURN_IF_ERROR(ConvertSignature(graphdef, sig_def_key, signature_def, debug_info, flib_def)); @@ -3088,31 +3390,34 @@ Status SavedModelSignatureDefImporter::ConvertSignature( } Status SavedModelSignatureDefImporter::LiftVariables() { - llvm::SmallVector ops; + VarGlobalMap var_globals; - bool contains_ref_variable = false; - - module_->walk([&ops, &contains_ref_variable](mlir::Operation* op) { - if (auto var_handle_op = llvm::dyn_cast(op)) - ops.push_back(var_handle_op); + auto walker = [&var_globals](mlir::Operation* op) { + if (auto var_handle_op = llvm::dyn_cast(op)) + var_globals[var_handle_op.shared_name()].second.push_back(var_handle_op); else if (op->getName().getStringRef() == "tf.VariableV2") - contains_ref_variable = true; - }); + return mlir::WalkResult::interrupt(); + return mlir::WalkResult::advance(); + }; + bool contains_ref_variable = module_->walk(walker).wasInterrupted(); if (contains_ref_variable) return errors::InvalidArgument( "Ref variable created by VariableV2 is not supported."); - if (ops.empty()) return Status::OK(); + if (var_globals.empty()) return Status::OK(); - TF_RETURN_IF_ERROR(ReadVariablesFromSession(ops)); + TF_RETURN_IF_ERROR(ReadVariablesFromSession(&var_globals)); - for (auto op : ops) LiftVariable(op); + for (const auto& it : var_globals) + for (VarHandleOp var_handle : it.second.second) + LiftVariable(var_handle, it.second.first); return Status::OK(); } -void SavedModelSignatureDefImporter::LiftVariable(mlir::TF::VarHandleOp op) { +void SavedModelSignatureDefImporter::LiftVariable( + VarHandleOp op, GlobalTensorOp global_tensor) { mlir::OpBuilder builder(&module_->getBodyRegion()); auto func_op = op.getParentOfType(); @@ -3123,7 +3428,13 @@ void SavedModelSignatureDefImporter::LiftVariable(mlir::TF::VarHandleOp op) { // Create the new function type by adding variable type to the arguments. llvm::SmallVector new_input_types( func_type.getInputs().begin(), func_type.getInputs().end()); - new_input_types.push_back(op.resource().getType()); + mlir::Type resource_type = op.resource().getType(); + // Use the corresponding global tensor's type. 
+ auto type = global_tensor.type().cast(); + resource_type = mlir::RankedTensorType::get( + {}, mlir::TF::ResourceType::get({type}, type.getContext())); + + new_input_types.push_back(resource_type); auto new_func_type = builder.getFunctionType(new_input_types, func_type.getResults()); @@ -3135,29 +3446,26 @@ void SavedModelSignatureDefImporter::LiftVariable(mlir::TF::VarHandleOp op) { builder.getSymbolRefAttr(op.shared_name())); // Add the newly added function param to entry block's arguments. - auto new_value = func_op.front().addArgument(op.resource().getType()); + auto new_value = func_op.front().addArgument(resource_type); - // Remove the VarHandleOp. + // Remove the VarHandleOp also updating the containing island's return type. + DCHECK(llvm::isa(op.getParentOp())); + DCHECK(llvm::cast(op.getParentOp()) + .WrapsSingleOp()); op.getOperation()->replaceAllUsesWith(llvm::ArrayRef(new_value)); + op.getParentOp()->getResult(0).setType(resource_type); op.getOperation()->erase(); } Status SavedModelSignatureDefImporter::ReadVariablesFromSession( - const llvm::SmallVectorImpl& ops) { + VarGlobalMap* var_globals) { mlir::OpBuilder builder(&module_->getBodyRegion()); - // Find all variables and their corresponding read ops. - llvm::MapVector - variable_names_and_ops; - for (auto op : ops) { - variable_names_and_ops[op.shared_name()] = op; - } - // Read all resource variables from the session. std::vector variable_names; - variable_names.reserve(variable_names_and_ops.size()); - for (const auto& name_and_location : variable_names_and_ops) - variable_names.push_back(std::string(name_and_location.first)); + variable_names.reserve(var_globals->size()); + for (const auto& name_and_location : *var_globals) + variable_names.push_back(name_and_location.first.str()); std::vector resource_tensors; TF_RETURN_IF_ERROR(bundle_.GetSession()->Run( @@ -3189,17 +3497,22 @@ Status SavedModelSignatureDefImporter::ReadVariablesFromSession( tensors.push_back(*var->tensor()); } - for (const auto& iter : llvm::zip(variable_names_and_ops, tensors)) { + for (const auto iter : llvm::zip(*var_globals, tensors)) { + // Create global tensor op corresponding to the variable. Use the location + // of the first use encountered. + VarHandleOp op = std::get<0>(iter).second.second.front(); const auto& name = std::get<0>(iter).first; - auto location = std::get<0>(iter).second.getLoc(); const auto& tensor = std::get<1>(iter); // Create tensor attribute for this variable. TF_ASSIGN_OR_RETURN(auto tensor_attr, ConvertTensor(tensor, &builder)); - builder.create( - location, builder.getStringAttr(name), tensor_attr, - mlir::TypeAttr::get(tensor_attr.getType()), builder.getUnitAttr()); + // Create the global tensor op with the tensor attribute. 
+ auto type = tensor_attr.getType().cast(); + auto global_tensor = builder.create( + op.getLoc(), builder.getStringAttr(name), tensor_attr, + mlir::TypeAttr::get(type), builder.getUnitAttr()); + std::get<0>(iter).second.first = global_tensor; } return Status::OK(); @@ -3267,12 +3580,14 @@ StatusOr ConvertSavedModelToMlir( SavedModelV2Bundle* saved_model, mlir::MLIRContext* context, absl::Span exported_names, bool add_default_attributes) { return SavedModelObjectGraphImporter::Convert( - saved_model, context, exported_names, add_default_attributes); + saved_model, exported_names, context, add_default_attributes); } StatusOr ConvertSavedModelV1ToMlir( - const SavedModelBundle& saved_model, mlir::MLIRContext* context) { - return SavedModelSignatureDefImporter::Convert(saved_model, context); + const SavedModelBundle& saved_model, absl::Span exported_names, + mlir::MLIRContext* context) { + return SavedModelSignatureDefImporter::Convert(saved_model, exported_names, + context); } std::string MlirModuleToString(mlir::ModuleOp module, bool show_debug_info) { diff --git a/tensorflow/compiler/mlir/tensorflow/translate/import_model.h b/tensorflow/compiler/mlir/tensorflow/translate/import_model.h index 8603eadb487..bdb72345201 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/import_model.h +++ b/tensorflow/compiler/mlir/tensorflow/translate/import_model.h @@ -55,6 +55,7 @@ stream_executor::port::StatusOr ConvertSavedModelToMlir( // expressed with tf_executor dialect. stream_executor::port::StatusOr ConvertSavedModelV1ToMlir(const SavedModelBundle& saved_model, + absl::Span exported_names, mlir::MLIRContext* context); // Serialize a MLIR module to a string. diff --git a/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h b/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h index c45739f003a..e74fe9341c3 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h +++ b/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h @@ -57,6 +57,9 @@ struct GraphImportConfig { // If true, upgrade legacy features of the graph (for instance, functionalize // control-flow). bool upgrade_legacy = false; + // If true, enables shape inference on input. + // TODO(jpienaar): This will be removed shortly. + bool enable_shape_inference = true; }; struct GraphExportConfig { @@ -66,8 +69,6 @@ struct GraphExportConfig { bool export_library = true; // Whether to export debug original node name in the GraphDef. bool export_debug_info = true; - // If true, the main graph will be treated as a function. - bool graph_as_function = false; }; // Parses the command line flag strings to the specification of nodes in diff --git a/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_pass.cc b/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_pass.cc index 2a4d059f21e..cb3a3be22d8 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_pass.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_pass.cc @@ -15,16 +15,16 @@ limitations under the License. 
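Note: the mlir_roundtrip_flags.h hunk above adds a temporary `enable_shape_inference` knob to `GraphImportConfig` (later in this patch it is also surfaced as the `-tf-enable-shape-inference-on-import` translate flag). A minimal caller-side sketch, not part of the patch itself, of how an importer client might opt out of on-import shape inference and rely on the TF shape inference pass instead:

```c++
// Sketch only: the field names come from GraphImportConfig in
// mlir_roundtrip_flags.h; the surrounding setup is assumed.
tensorflow::GraphImportConfig specs;
specs.prune_unused_nodes = true;       // still prune nodes not feeding fetches
specs.upgrade_legacy = true;           // functionalize legacy control flow
specs.enable_shape_inference = false;  // new: import with unranked tensor types
                                       // and defer shapes to the shape inference pass
```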
#include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_pass.h" -#include "mlir/Analysis/Verifier.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/IR/Module.h" // from @llvm-project +#include "mlir/IR/Verifier.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.h" #include "tensorflow/compiler/mlir/tensorflow/translate/import_model.h" #include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h" #include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" #include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/core/common_runtime/graph_constructor.h" #include "tensorflow/core/common_runtime/metrics.h" -#include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/protobuf/graph_debug_info.pb.h" namespace tensorflow { diff --git a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.cc b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.cc index 12e38da987e..6ada0fec4e2 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.cc @@ -49,7 +49,7 @@ static StatusOr GraphdefToMlirImport( absl::string_view input_shapes, absl::string_view output_arrays, absl::string_view control_output_arrays, bool prune_unused_nodes, bool convert_legacy_fed_inputs, bool graph_as_function, bool upgrade_legacy, - mlir::MLIRContext* context) { + bool enable_shape_inference, mlir::MLIRContext* context) { GraphDef graphdef; TF_RETURN_IF_ERROR( tensorflow::LoadProtoFromBuffer({input.data(), input.size()}, &graphdef)); @@ -64,6 +64,7 @@ static StatusOr GraphdefToMlirImport( specs.convert_legacy_fed_inputs = convert_legacy_fed_inputs; specs.graph_as_function = graph_as_function; specs.upgrade_legacy = upgrade_legacy; + specs.enable_shape_inference = enable_shape_inference; TF_RETURN_IF_ERROR(ParseInputArrayInfo(input_arrays, input_dtypes, input_shapes, &specs.inputs)); TF_RETURN_IF_ERROR(ParseOutputArrayInfo(output_arrays, &specs.outputs)); @@ -103,11 +104,12 @@ mlir::OwningModuleRef GraphdefToMlirTranslateFunction( absl::string_view input_shapes, absl::string_view output_arrays, absl::string_view control_output_arrays, bool prune_unused_nodes, bool convert_legacy_fed_inputs, bool graph_as_function, bool upgrade_legacy, - mlir::MLIRContext* context) { + bool enable_shape_inference, mlir::MLIRContext* context) { auto module_or = GraphdefToMlirImport( input, debug_info_file, input_arrays, input_dtypes, input_shapes, output_arrays, control_output_arrays, prune_unused_nodes, - convert_legacy_fed_inputs, graph_as_function, upgrade_legacy, context); + convert_legacy_fed_inputs, graph_as_function, upgrade_legacy, + enable_shape_inference, context); if (!module_or.status().ok()) { LOG(ERROR) << "Graph import failed: " << module_or.status(); return nullptr; @@ -139,7 +141,8 @@ mlir::OwningModuleRef SavedModelObjectGraphToMlirImport( mlir::OwningModuleRef SavedModelSignatureDefsToMlirImport( absl::string_view saved_model_dir, - const std::unordered_set& tags, mlir::MLIRContext* context) { + const std::unordered_set& tags, + absl::Span exported_names, mlir::MLIRContext* context) { tensorflow::SavedModelBundle bundle; tensorflow::SessionOptions session_options; // Force saved model states to be restored to CPU. 
@@ -153,7 +156,7 @@ mlir::OwningModuleRef SavedModelSignatureDefsToMlirImport( return nullptr; } - auto module_or = ConvertSavedModelV1ToMlir(bundle, context); + auto module_or = ConvertSavedModelV1ToMlir(bundle, exported_names, context); if (!module_or.status().ok()) { LOG(ERROR) << "SavedModel V1 import failed: " << module_or.status(); return nullptr; @@ -167,11 +170,12 @@ mlir::OwningModuleRef GraphdefToSplattedMlirTranslateFunction( absl::string_view input_shapes, absl::string_view output_arrays, absl::string_view control_output_arrays, bool prune_unused_nodes, bool convert_legacy_fed_inputs, bool graph_as_function, bool upgrade_legacy, - mlir::MLIRContext* context) { + bool enable_shape_inference, mlir::MLIRContext* context) { auto module_or = GraphdefToMlirImport( input, debug_info_file, input_arrays, input_dtypes, input_shapes, output_arrays, control_output_arrays, prune_unused_nodes, - convert_legacy_fed_inputs, graph_as_function, upgrade_legacy, context); + convert_legacy_fed_inputs, graph_as_function, upgrade_legacy, + enable_shape_inference, context); if (!module_or.status().ok()) { LOG(ERROR) << "Graph import failed: " << module_or.status(); return nullptr; diff --git a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.h b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.h index ef72000b4d2..490b7c7d8f0 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.h +++ b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.h @@ -37,7 +37,8 @@ mlir::OwningModuleRef GraphdefToMlirTranslateFunction( absl::string_view input_shapes, absl::string_view output_arrays, absl::string_view control_output_arrays, bool prune_unused_nodes, bool convert_legacy_fed_inputs, bool graph_as_function, bool upgrade_legacy, - mlir::MLIRContext* context); + // TODO(jpienaar): Remove this. + bool enable_shape_inference, mlir::MLIRContext* context); // Similar as the above function, but replaces all constant tensors // with randomly generated splat values. @@ -47,7 +48,8 @@ mlir::OwningModuleRef GraphdefToSplattedMlirTranslateFunction( absl::string_view input_shapes, absl::string_view output_arrays, absl::string_view control_output_arrays, bool prune_unused_nodes, bool convert_legacy_fed_inputs, bool graph_as_function, bool upgrade_legacy, - mlir::MLIRContext* context); + // TODO(jpienaar): Remove this. + bool enable_shape_inference, mlir::MLIRContext* context); // Converts a TensorFlow SavedModel stored in the directory with the given // `saved_model_dir` into a MLIR module. Creates MLIR entities into the @@ -62,7 +64,8 @@ mlir::OwningModuleRef SavedModelObjectGraphToMlirImport( // given MLIR `context`. 
mlir::OwningModuleRef SavedModelSignatureDefsToMlirImport( absl::string_view saved_model_dir, - const std::unordered_set& tags, mlir::MLIRContext* context); + const std::unordered_set& tags, + absl::Span exported_names, mlir::MLIRContext* context); } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_cl.cc b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_cl.cc index 9347f00a43e..249ed2767c0 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_cl.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_cl.cc @@ -109,3 +109,9 @@ opt graph_as_function("tf-graph-as-function", opt upgrade_legacy("tf-upgrade-legacy", llvm::cl::desc("Upgrade legacy TF graph behavior"), llvm::cl::init(false)); + +// NOLINTNEXTLINE +opt enable_shape_inference( + "tf-enable-shape-inference-on-import", + llvm::cl::desc("Enable shape inference on import (temporary)"), + llvm::cl::init(false)); diff --git a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_cl.h b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_cl.h index bfcaed43ba2..accff43f697 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_cl.h +++ b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_cl.h @@ -39,5 +39,7 @@ extern llvm::cl::opt prune_unused_nodes; extern llvm::cl::opt convert_legacy_fed_inputs; extern llvm::cl::opt graph_as_function; extern llvm::cl::opt upgrade_legacy; +// TODO(jpienaar): Temporary flag, flip default and and remove. +extern llvm::cl::opt enable_shape_inference; #endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSLATE_TF_MLIR_TRANSLATE_CL_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_registration.cc b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_registration.cc index b4c279c367d..8f7c1e77c01 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_registration.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_registration.cc @@ -45,7 +45,8 @@ static OwningModuleRef GraphdefToMlirTranslateFunction(llvm::StringRef input, return tensorflow::GraphdefToMlirTranslateFunction( input, debug_info_file, input_arrays, input_dtypes, input_shapes, output_arrays, control_output_arrays, prune_unused_nodes, - convert_legacy_fed_inputs, graph_as_function, upgrade_legacy, context); + convert_legacy_fed_inputs, graph_as_function, upgrade_legacy, + enable_shape_inference, context); } static TranslateToMLIRRegistration GraphdefToMlirTranslate( @@ -56,7 +57,8 @@ static OwningModuleRef GraphdefToSplattedMlirTranslateFunction( return tensorflow::GraphdefToSplattedMlirTranslateFunction( input, debug_info_file, input_arrays, input_dtypes, input_shapes, output_arrays, control_output_arrays, prune_unused_nodes, - convert_legacy_fed_inputs, graph_as_function, upgrade_legacy, context); + convert_legacy_fed_inputs, graph_as_function, upgrade_legacy, + enable_shape_inference, context); } static TranslateToMLIRRegistration GraphdefToSplattedMlirTranslate( @@ -68,7 +70,6 @@ static LogicalResult MlirToGraphdefTranslateFunction( // TODO(fengliuai): Add exporter flags. 
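Note: the header changes above thread an `exported_names` filter through `ConvertSavedModelV1ToMlir` and `SavedModelSignatureDefsToMlirImport`, so only the listed SignatureDefs are converted (an empty span keeps everything). A hypothetical call site, with the saved-model path and signature name as placeholders:

```c++
// Assumed usage; only the function signature is taken from this patch.
mlir::MLIRContext context;
std::unordered_set<std::string> tags = {"serve"};
std::vector<std::string> exported_names = {"serving_default"};
mlir::OwningModuleRef module = tensorflow::SavedModelSignatureDefsToMlirImport(
    "/path/to/saved_model", tags, absl::MakeSpan(exported_names), &context);
if (!module) {
  // Import failed; the importer has already logged the status.
}
```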
tensorflow::GraphExportConfig confs; - confs.graph_as_function = graph_as_function; StatusOr> graphdef_or( tensorflow::ConvertMlirToGraphdef(module, confs)); if (!graphdef_or.status().ok()) { diff --git a/tensorflow/compiler/mlir/tensorflow/utils/bridge_logger.cc b/tensorflow/compiler/mlir/tensorflow/utils/bridge_logger.cc index 8212c0b50a4..d7b511094d3 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/bridge_logger.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/bridge_logger.cc @@ -38,6 +38,7 @@ inline static void Log(BridgeLoggerConfig::PrintCallbackFn print_callback, std::unique_ptr os; std::string filepath; if (CreateFileForDumping(name, &os, &filepath).ok()) print_callback(*os); + VLOG(1) << "Dumped MLIR module to " << filepath; } void BridgeLoggerConfig::printBeforeIfEnabled(mlir::Pass* pass, @@ -52,4 +53,11 @@ void BridgeLoggerConfig::printAfterIfEnabled(mlir::Pass* pass, Log(print_callback, pass, operation, "after"); } +void BridgeTimingConfig::printTiming(PrintCallbackFn printCallback) { + std::string name = "mlir_bridge_pass_timing.txt"; + std::unique_ptr os; + std::string filepath; + if (CreateFileForDumping(name, &os, &filepath).ok()) printCallback(*os); +} + } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/utils/bridge_logger.h b/tensorflow/compiler/mlir/tensorflow/utils/bridge_logger.h index b5b2ad33b31..eaf3a7c2598 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/bridge_logger.h +++ b/tensorflow/compiler/mlir/tensorflow/utils/bridge_logger.h @@ -44,6 +44,13 @@ class BridgeLoggerConfig : public mlir::PassManager::IRPrinterConfig { PrintCallbackFn print_callback) override; }; +// Logger for logging/dumping pass pipeline timings after completion. +class BridgeTimingConfig : public mlir::PassManager::PassTimingConfig { + public: + // Hook that control how/where is the output produced + void printTiming(PrintCallbackFn printCallback) override; +}; + } // namespace tensorflow #endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_BRIDGE_LOGGER_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.cc index 8405167c7cd..03283da0112 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.cc @@ -17,10 +17,13 @@ limitations under the License. #include "absl/types/optional.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Support/raw_ostream.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project #include "mlir/IR/Dialect.h" // from @llvm-project #include "mlir/IR/Function.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/IR/OpDefinition.h" // from @llvm-project #include "mlir/IR/StandardTypes.h" // from @llvm-project @@ -35,6 +38,7 @@ limitations under the License. 
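Note: bridge_logger.h above gains a `BridgeTimingConfig` that dumps pass-pipeline timings to `mlir_bridge_pass_timing.txt` via `CreateFileForDumping`. A rough sketch of how it could be attached to a pass manager, assuming the `PassManager::enableTiming(std::unique_ptr<PassTimingConfig>)` overload available in MLIR at this time:

```c++
// Assumption: enableTiming() accepting a PassTimingConfig exists on
// mlir::PassManager; BridgeTimingConfig comes from bridge_logger.h.
mlir::PassManager pm(&context);
pm.enableTiming(std::make_unique<tensorflow::BridgeTimingConfig>());
// ... add bridge passes ...
if (mlir::failed(pm.run(module_op)))
  return tensorflow::errors::Internal("MLIR bridge pipeline failed");
// Timings are written to mlir_bridge_pass_timing.txt on completion.
```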
#include "tensorflow/compiler/mlir/tensorflow/translate/import_model.h" #include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h" #include "tensorflow/compiler/mlir/tensorflow/utils/bridge_logger.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h" #include "tensorflow/compiler/mlir/tensorflow/utils/convert_type.h" #include "tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h" #include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" @@ -258,14 +262,12 @@ Status ConvertMLIRToXlaComputation( mlir::ModuleOp module_op, llvm::StringRef device_type, xla::XlaComputation* xla_computation, bool use_tuple_args, bool return_tuple, - const XlaCompiler::ShapeRepresentationFn shape_representation_fn) { - // Mark main function as public. - mlir::FuncOp main_func = module_op.lookupSymbol("main"); - if (main_func) { - main_func.setVisibility(mlir::FuncOp::Visibility::Public); - } - + const XlaCompiler::ShapeRepresentationFn shape_representation_fn, + std::vector> custom_legalization_passes) { mlir::PassManager tf2xla(module_op.getContext()); + // Mark main function as public, and other functions as private. + tf2xla.addPass( + mlir::TF::CreateMarkOnlyMainFunctionWithPublicVisibilityPass()); tf2xla.addNestedPass(mlir::createCanonicalizerPass()); tf2xla.addPass(mlir::TF::CreateTensorListOpsDecompositionPass()); tf2xla.addPass(mlir::TF::CreateStackOpsDecompositionPass()); @@ -273,30 +275,45 @@ Status ConvertMLIRToXlaComputation( tf2xla.addPass(mlir::TFDevice::CreateDecomposeResourceOpsPass()); tf2xla.addPass(mlir::TF::CreatePromoteResourcesToArgsPass()); tf2xla.addPass(mlir::createSymbolDCEPass()); + tf2xla.addPass(mlir::TF::CreateTFShapeInferencePass()); // LegalizeTFControlFlow encapsulates arguments for control flow operations // with a tuple argument which break the assumption of resource lifting // inside PromoteResourcesToArgs. tf2xla.addPass(mlir::xla_hlo::createLegalizeTFControlFlowPass()); tf2xla.addNestedPass(mlir::xla_hlo::createLegalizeTFPass(true)); + for (auto& target_pass : custom_legalization_passes) { + tf2xla.addNestedPass(std::move(target_pass)); + } tf2xla.addNestedPass(mlir::createCanonicalizerPass()); + tf2xla.addPass(mlir::TF::CreateTFShapeInferencePass()); // Leverage tf2xla kernels for ops that didn't get lowered in the previous // legalization pass. tf2xla.addPass(mlir::xla_hlo::createLegalizeTfWithTf2XlaPass(device_type)); tf2xla.addNestedPass(mlir::createCanonicalizerPass()); + // Run shape inference pass to propagate shapes through tensor_cast operations + // from static to dynamic shapes. This could be generated if the shape + // inference was originally missing in a TF op but the corresponding HLO op + // had static shape after lowering. + tf2xla.addPass(mlir::TF::CreateTFShapeInferencePass()); + // Run LegalizeTFPass again because the previous legalization passes can // expose more graph pruning and canonicalization opportunities that are // necessary for the second LegalizeTFPass(allow_partial_conversion=false) // invocation. tf2xla.addNestedPass( mlir::xla_hlo::createLegalizeTFPass(false)); + // In order to export to XLA, we must sink constants to control flow regions, + // since XLA uses functional control flow. + tf2xla.addNestedPass( + mlir::xla_hlo::createSinkConstantsToControlFlowPass()); if (VLOG_IS_ON(1)) { // Print the whole module after each pass which requires disabling // multi-threading as well. 
- tf2xla.disableMultithreading(); + module_op.getContext()->disableMultithreading(); tf2xla.enableIRPrinting(std::make_unique( /*print_module_scope=*/true)); } @@ -326,7 +343,8 @@ static Status CompileMlirToXlaHlo( mlir::ModuleOp module_op, llvm::ArrayRef arg_shapes, llvm::StringRef device_type, bool use_tuple_args, XlaCompiler::ShapeRepresentationFn shape_representation_fn, - XlaCompiler::CompilationResult* compilation_result) { + XlaCompiler::CompilationResult* compilation_result, + std::vector> custom_legalization_passes) { if (VLOG_IS_ON(1)) tensorflow::DumpMlirOpToFile("mlir_compile_before", module_op); @@ -344,7 +362,8 @@ static Status CompileMlirToXlaHlo( TF_RETURN_IF_ERROR(ConvertMLIRToXlaComputation( module_op, device_type, compilation_result->computation.get(), use_tuple_args, - /*return_tuple=*/true, shape_representation_fn)); + /*return_tuple=*/true, shape_representation_fn, + std::move(custom_legalization_passes))); // Construct mapping from XlaComputation's arg to input edges of execute // node. @@ -374,7 +393,8 @@ Status CompileSerializedMlirToXlaHlo( llvm::StringRef mlir_module_string, llvm::ArrayRef arg_shapes, llvm::StringRef device_type, bool use_tuple_args, const XlaCompiler::ShapeRepresentationFn shape_representation_fn, - XlaCompiler::CompilationResult* compilation_result) { + XlaCompiler::CompilationResult* compilation_result, + std::vector> custom_legalization_passes) { RegisterDialects(); mlir::MLIRContext mlir_context; mlir::OwningModuleRef mlir_module; @@ -383,16 +403,51 @@ Status CompileSerializedMlirToXlaHlo( ParseMlirModule(mlir_module_string, &mlir_context, &mlir_module)); return CompileMlirToXlaHlo(mlir_module.get(), arg_shapes, device_type, use_tuple_args, shape_representation_fn, - compilation_result); + compilation_result, + std::move(custom_legalization_passes)); +} + +// Rewrites the given module with specified args. For each of the constant args, +// it gets inlined in the "main' function and the corresponding argument is +// removed from the signature. +// Returns the original indices for the other arguments on success. +static StatusOr> RewriteWithArgs( + mlir::ModuleOp module, llvm::ArrayRef args) { + mlir::FuncOp main_fn = module.lookupSymbol("main"); + std::vector params; + + auto builder = mlir::OpBuilder(main_fn.getBody()); + std::vector args_to_erase; + for (int idx = 0; idx < args.size(); idx++) { + const XlaCompiler::Argument& xla_arg = args[idx]; + mlir::BlockArgument mlir_arg = main_fn.getArgument(idx); + if (xla_arg.kind != XlaCompiler::Argument::kConstant) { + params.push_back(idx); + continue; + } + + TF_ASSIGN_OR_RETURN(auto value_attr, + ConvertTensor(xla_arg.constant_value, &builder)); + // TODO(hinsu): Use the actual location of the constant. 
+ auto constant = builder.create( + mlir::UnknownLoc::get(module.getContext()), value_attr); + mlir_arg.replaceAllUsesWith(constant); + args_to_erase.push_back(idx); + } + + for (int idx : llvm::reverse(args_to_erase)) main_fn.eraseArgument(idx); + return params; } Status CompileGraphToXlaHlo( - const Graph& graph, llvm::ArrayRef arg_shapes, + const Graph& graph, llvm::ArrayRef args, llvm::StringRef device_type, bool use_tuple_args, const FunctionLibraryDefinition& flib_def, const GraphDebugInfo& debug_info, const XlaCompiler::ShapeRepresentationFn shape_representation_fn, - XlaCompiler::CompilationResult* compilation_result) { + XlaCompiler::CompilationResult* compilation_result, + std::vector> custom_legalization_passes) { RegisterDialects(); + mlir::MLIRContext context; GraphImportConfig config; config.graph_as_function = true; @@ -400,9 +455,19 @@ Status CompileGraphToXlaHlo( ConvertGraphToMlir(graph, debug_info, flib_def, config, &context); if (!module_or.ok()) return module_or.status(); - return CompileMlirToXlaHlo(module_or.ValueOrDie().get(), arg_shapes, - device_type, use_tuple_args, - shape_representation_fn, compilation_result); + mlir::ModuleOp module = module_or.ValueOrDie().get(); + TF_ASSIGN_OR_RETURN(std::vector remaining_params, + RewriteWithArgs(module, {args.data(), args.size()})); + llvm::SmallVector arg_shapes; + arg_shapes.reserve(args.size()); + for (unsigned idx : remaining_params) + arg_shapes.push_back(absl::get(args[idx].shape)); + + auto status = CompileMlirToXlaHlo( + module, arg_shapes, device_type, use_tuple_args, shape_representation_fn, + compilation_result, std::move(custom_legalization_passes)); + compilation_result->input_mapping = remaining_params; + return status; } } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.h b/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.h index 74c602a7afb..24b60dcb346 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.h +++ b/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.h @@ -19,6 +19,7 @@ limitations under the License. #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/StringRef.h" #include "mlir/IR/Module.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project #include "tensorflow/compiler/tf2xla/xla_compiler.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/protobuf/graph_debug_info.pb.h" @@ -50,11 +51,14 @@ namespace tensorflow { // shape_representation_fn: when this is set, this shape representation function // will be used to determine argument and result shapes. Otherwise the // original shape will be used as is. +// custom_legalization_passes: passes to run before the default TF legalization +// passes for backend-specific ops. Status ConvertMLIRToXlaComputation( mlir::ModuleOp module_op, llvm::StringRef device_type, xla::XlaComputation* xla_computation, bool use_tuple_args, bool return_tuple, - const XlaCompiler::ShapeRepresentationFn shape_representation_fn = nullptr); + const XlaCompiler::ShapeRepresentationFn shape_representation_fn = nullptr, + std::vector> custom_legalization_passes = {}); // Compiles a serialized MLIR module into XLA HLO, generates all accompanying // metadata and stores them in CompilationResult. 
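A short sketch of the new CompileGraphToXlaHlo argument list, assuming the stripped ArrayRef element type is XlaCompiler::Argument (as in the test below): constant arguments are inlined into "main" by RewriteWithArgs and dropped from the signature, so only parameter arguments remain in input_mapping. The shapes and values here are illustrative only.

    #include <vector>

    #include "tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.h"
    #include "tensorflow/compiler/tf2xla/xla_compiler.h"
    #include "tensorflow/core/framework/tensor.h"
    #include "tensorflow/core/graph/graph.h"

    // Sketch: one runtime parameter plus one compile-time constant.
    tensorflow::Status CompileWithConstantArg(
        const tensorflow::Graph& graph,
        const tensorflow::FunctionLibraryDefinition& flib_def,
        tensorflow::XlaCompiler::CompilationResult* result) {
      std::vector<tensorflow::XlaCompiler::Argument> args(2);
      args[0].kind = tensorflow::XlaCompiler::Argument::kParameter;
      args[0].type = tensorflow::DT_FLOAT;
      args[0].shape = tensorflow::TensorShape({2, 3});

      args[1].kind = tensorflow::XlaCompiler::Argument::kConstant;
      args[1].type = tensorflow::DT_INT32;
      tensorflow::Tensor constant(tensorflow::DT_INT32,
                                  tensorflow::TensorShape({}));
      constant.scalar<tensorflow::int32>()() = 42;
      args[1].constant_value = constant;

      // The constant is inlined, so result->input_mapping should end up as {0}.
      return tensorflow::CompileGraphToXlaHlo(
          graph, args, "XLA_CPU_JIT", /*use_tuple_args=*/false, flib_def,
          tensorflow::GraphDebugInfo(), /*shape_representation_fn=*/nullptr,
          result);
    }
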
@@ -62,15 +66,17 @@ Status CompileSerializedMlirToXlaHlo( llvm::StringRef mlir_module_string, llvm::ArrayRef arg_shapes, llvm::StringRef device_type, bool use_tuple_args, const XlaCompiler::ShapeRepresentationFn shape_representation_fn, - XlaCompiler::CompilationResult* compilation_result); + XlaCompiler::CompilationResult* compilation_result, + std::vector> custom_legalization_passes = {}); // Same as the above but takes input as TensorFlow Graph. Status CompileGraphToXlaHlo( - const Graph& graph, llvm::ArrayRef arg_shapes, + const Graph& graph, llvm::ArrayRef args, llvm::StringRef device_type, bool use_tuple_args, const FunctionLibraryDefinition& flib_def, const GraphDebugInfo& debug_info, const XlaCompiler::ShapeRepresentationFn shape_representation_fn, - XlaCompiler::CompilationResult* compilation_result); + XlaCompiler::CompilationResult* compilation_result, + std::vector> custom_legalization_passes = {}); } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util_test.cc index 26c50a24f58..91640aff437 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util_test.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util_test.cc @@ -252,6 +252,37 @@ TEST(CompileSerializedMlirToXlaHloTest, ShapeInference) { ::testing::HasSubstr(expected_signature)); } +TEST(CompileSerializedMlirToXlaHloTest, ShapeInferenceAfterLegalization) { + constexpr char mlir_module[] = R"( + module attributes {tf.versions = {producer = 179 : i32}} { + func @main(%arg0: tensor<8x16x16x64xbf16>, %arg1: tensor<64xf32>) -> (tensor<8x16x16x64xbf16>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<*xf32>) { + %0:6 = "tf.FusedBatchNormV3"(%arg0, %arg1, %arg1, %arg1, %arg1) {data_format = "NHWC", device = "", epsilon = 9.99999974E-5 : f32, exponential_avg_factor = 1.000000e+00 : f32, is_training = false} : (tensor<8x16x16x64xbf16>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>) -> (tensor<8x16x16x64xbf16>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<*xf32>) + return %0#0, %0#1, %0#2, %0#3, %0#4, %0#5 : tensor<8x16x16x64xbf16>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<*xf32> + } + } + )"; + + std::vector arg_shapes{TensorShape({8, 16, 16, 64}), + TensorShape({64})}; + XlaCompiler::CompilationResult compilation_result; + + Status s = CompileSerializedMlirToXlaHlo( + mlir_module, arg_shapes, "XLA_CPU_JIT", + /*use_tuple_args=*/true, TestShapeRepresentation, &compilation_result); + TF_ASSERT_OK(s); + + const xla::HloModuleConfig module_config( + compilation_result.computation->GetProgramShape().ValueOrDie()); + auto status_or_hlo_module = xla::HloModule::CreateFromProto( + compilation_result.computation->proto(), module_config); + TF_ASSERT_OK(status_or_hlo_module.status()); + + constexpr char expected_signature[] = + R"(-> (bf16[8,16,16,64], f32[64], f32[64], f32[64], f32[64], f32[0]))"; + EXPECT_THAT(status_or_hlo_module.ValueOrDie()->ToString(), + ::testing::HasSubstr(expected_signature)); +} + TEST(CompileSerializedMlirToXlaHloTest, ConstantFoldHook) { constexpr char mlir_module[] = R"( module attributes {tf.versions = {producer = 179 : i32}} { @@ -424,8 +455,12 @@ TEST(CompileGraphToXlaHlo, Basic) { test::graph::Retval(&graph, 0, arg); XlaCompiler::CompilationResult result; + XlaCompiler::Argument compiler_arg; + compiler_arg.kind = XlaCompiler::Argument::kParameter; + compiler_arg.shape = 
TensorShape(); + TF_ASSERT_OK( - CompileGraphToXlaHlo(graph, /*arg_shapes=*/{TensorShape()}, "XLA_CPU_JIT", + CompileGraphToXlaHlo(graph, /*args=*/{compiler_arg}, "XLA_CPU_JIT", /*use_tuple_args=*/false, flib_def, GraphDebugInfo(), /*shape_representation_fn=*/nullptr, &result)); diff --git a/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc b/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc index 29de158ff3c..b28f26b6c3c 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc @@ -31,13 +31,16 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" #include "tensorflow/compiler/mlir/tensorflow/utils/convert_type.h" #include "tensorflow/compiler/mlir/tensorflow/utils/mangling_util.h" +#include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor.pb.h" #include "tensorflow/core/framework/tensor_shape.pb.h" #include "tensorflow/core/framework/types.pb.h" #include "tensorflow/core/lib/bfloat16/bfloat16.h" #include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/protobuf.h" +#include "tensorflow/core/platform/tstring.h" #include "tensorflow/stream_executor/lib/statusor.h" namespace tensorflow { @@ -47,6 +50,7 @@ using llvm::SmallVector; using mlir::Builder; using mlir::DenseFPElementsAttr; using mlir::DenseIntElementsAttr; +using mlir::DenseStringElementsAttr; using mlir::ElementsAttr; using mlir::OpaqueElementsAttr; using mlir::RankedTensorType; @@ -83,16 +87,36 @@ StatusOr ConvertFlatTensor(const Tensor& input_tensor, type, llvm::makeArrayRef(arr.data(), arr.size())); } -StatusOr ConvertBF16Tensor(const Tensor& input_tensor, - ShapedType type) { +ElementsAttr ConvertBf16Tensor(const Tensor& input_tensor, + RankedTensorType type) { auto flat = input_tensor.flat(); + llvm::SmallVector floats; + floats.reserve(flat.size()); + for (bfloat16 v : llvm::makeArrayRef(flat.data(), flat.size())) + floats.push_back(llvm::APFloat(static_cast(v))); + return mlir::DenseElementsAttr::get(type, llvm::makeArrayRef(floats)); +} - llvm::SmallVector flat_double; - flat_double.reserve(flat.size()); - for (bfloat16 v : llvm::makeArrayRef(flat.data(), flat.size())) { - flat_double.push_back(static_cast(v)); +ElementsAttr ConvertHalfTensor(const Tensor& tensor, RankedTensorType type) { + auto buffer = llvm::makeArrayRef(static_cast(tensor.data()), + tensor.TotalBytes()); + return mlir::DenseElementsAttr::getFromRawBuffer( + type, buffer, + /*isSplatBuffer=*/type.getNumElements() == 1); +} + +StatusOr ConvertStringTensor(const Tensor& input_tensor, + ShapedType type) { + // Extract to a vector of StringRefs for converting. + auto arr = input_tensor.flat(); + std::vector string_refs; + string_refs.reserve(arr.size()); + for (int i = 0; i < arr.size(); i++) { + const auto& val = arr(i); + string_refs.push_back({val.data(), val.size()}); } - return mlir::DenseElementsAttr::get(type, llvm::makeArrayRef(flat_double)); + + return DenseStringElementsAttr::get(type, string_refs); } StatusOr ConvertTensor(const Tensor& input_tensor, @@ -109,18 +133,31 @@ StatusOr ConvertTensor(const Tensor& input_tensor, case DTYPE: \ return ConvertFlatTensor(input_tensor, type); - // TODO(fengliuai): customize the conversions for more types. + // TODO(fengliuai): customize the conversions for quantized and string types. 
switch (input_dtype) { CONVERT_FLAT(DT_BOOL, bool) CONVERT_FLAT(DT_FLOAT, float) CONVERT_FLAT(DT_DOUBLE, double) + CONVERT_FLAT(DT_INT8, int8) + CONVERT_FLAT(DT_INT16, int16) CONVERT_FLAT(DT_INT32, int32) CONVERT_FLAT(DT_INT64, int64) + CONVERT_FLAT(DT_UINT8, uint8) + CONVERT_FLAT(DT_UINT16, uint16) + CONVERT_FLAT(DT_UINT32, uint32) + CONVERT_FLAT(DT_UINT64, uint64) + CONVERT_FLAT(DT_COMPLEX64, std::complex) + CONVERT_FLAT(DT_COMPLEX128, std::complex) // BFLOAT16 is a special case that it needs to be cast to double type to // match its storage type. case DT_BFLOAT16: - return ConvertBF16Tensor(input_tensor, type); + return ConvertBf16Tensor(input_tensor, type); + case DT_HALF: + return ConvertHalfTensor(input_tensor, type); + + case DT_STRING: + return ConvertStringTensor(input_tensor, type); default: // TODO(shpeisman): restructure code to reuse dialect pointer across @@ -164,6 +201,38 @@ PartialTensorShape ConvertTypeToTensorShape(const mlir::Type& type) { return TensorShape(); } +mlir::TF::ShapeAttr ConvertTypeToTensorShapeAttr(const mlir::Type& type) { + if (type.isa()) { + return mlir::TF::ShapeAttr::get(type.getContext(), llvm::None); + } + + if (auto tensor_type = type.dyn_cast()) { + return mlir::TF::ShapeAttr::get(type.getContext(), tensor_type.getShape()); + } + + // If type is not a RankedTensor or UnrankedTensor, it must be a scalar. + // Empty TensorShape indicates a scalar. + return mlir::TF::ShapeAttr::get(type.getContext(), ArrayRef()); +} + +// Converts an MLIR dense string elements attribute to a TensorFlow tensor +// proto. +void ConvertStringElementsAttr( + const DenseStringElementsAttr attr, + protobuf::RepeatedPtrField* output) { + for (const auto& val : attr.getRawStringData()) + output->Add({val.data(), val.size()}); +} + +template +void ConvertComplexElementsAttr(const mlir::DenseElementsAttr attr, + protobuf::RepeatedField* output) { + for (const auto& val : attr.getValues>()) { + output->Add(val.real()); + output->Add(val.imag()); + } +} + // Converts an MLIR opaque elements attribute to a TensorFlow tensor proto. Status ConvertOpaqueElementsAttr(const ElementsAttr attr, TensorProto* output_tensor) { @@ -175,139 +244,80 @@ Status ConvertOpaqueElementsAttr(const ElementsAttr attr, return InvalidArgument("Unexpected elements attribute type from MLIR."); } -// Converts an MLIR elements attribute to a TensorFlow tensor proto -// with the double_val field updated. -Status ConvertDoubleElementsAttr(const ElementsAttr attr, - TensorProto* output_tensor) { - if (auto elts = attr.dyn_cast()) { - if (elts.isSplat()) { - output_tensor->add_double_val(elts.getSplatValue()); - } else { - for (auto value : elts.getValues()) - output_tensor->add_double_val(value); - } - return Status::OK(); - } - return ConvertOpaqueElementsAttr(attr, output_tensor); -} - -// Converts an MLIR elements attribute to a TensorFlow tensor proto -// with the float_val field updated. -Status ConvertFloatElementsAttr(const ElementsAttr attr, - TensorProto* output_tensor) { - if (auto elts = attr.dyn_cast()) { - if (elts.isSplat()) { - output_tensor->add_float_val(elts.getSplatValue()); - } else { - for (auto value : elts.getValues()) - output_tensor->add_float_val(value); - } - return Status::OK(); - } - return ConvertOpaqueElementsAttr(attr, output_tensor); -} - -// Converts an MLIR elements attribute to a TensorFlow tensor proto -// with the half_val field updated. 
-Status ConvertHalfElementsAttr(const ElementsAttr attr, - TensorProto* output_tensor) { - if (auto elts = attr.dyn_cast()) { - if (elts.isSplat()) { - output_tensor->add_half_val( - (*elts.begin()).bitcastToAPInt().getSExtValue()); - } else { - for (const auto& value : elts.getFloatValues()) - output_tensor->add_half_val(value.bitcastToAPInt().getSExtValue()); - } - return Status::OK(); - } - return ConvertOpaqueElementsAttr(attr, output_tensor); -} - -// Converts an MLIR elements attribute to a TensorFlow tensor proto -// with the int_val field updated. -Status ConvertIntElementsAttr(const mlir::ElementsAttr attr, - TensorProto* output_tensor) { - if (auto elts = attr.dyn_cast()) { - if (elts.isSplat()) { - output_tensor->add_int_val((*elts.begin()).getSExtValue()); - } else { - for (const auto& val : elts) - output_tensor->add_int_val(val.getSExtValue()); - } - return Status::OK(); - } - return ConvertOpaqueElementsAttr(attr, output_tensor); -} - -Status ConvertBfloat16ElementsAttr(const mlir::ElementsAttr attr, - TensorProto* output_tensor) { - auto elts = attr.dyn_cast(); - if (!elts) { - return ConvertOpaqueElementsAttr(attr, output_tensor); - } - - // Bfloat16 is internally represented as `double` in MLIR. - if (elts.isSplat()) { - double v = elts.getSplatValue(); - bfloat16 bf16_val = static_cast(v); - output_tensor->add_half_val(absl::bit_cast(bf16_val)); +// Converts an MLIR elements attribute and adds it to specified repeated field. +template +void ConvertElementsAttr(const mlir::DenseElementsAttr attr, + protobuf::RepeatedField* output) { + if (attr.isSplat()) { + output->Add(attr.getSplatValue()); } else { - for (auto v : elts.getValues()) { + for (auto value : attr.getValues()) output->Add(value); + } +} + +// Converts an MLIR elements attribute containing half values and adds it to +// specified repeated field. +void ConvertHalfElementsAttr(const DenseFPElementsAttr attr, + protobuf::RepeatedField* output_tensor) { + if (attr.isSplat()) { + output_tensor->Add((*attr.begin()).bitcastToAPInt().getSExtValue()); + } else { + for (const llvm::APFloat value : attr.getFloatValues()) + output_tensor->Add(value.bitcastToAPInt().getSExtValue()); + } +} + +// Converts an MLIR elements attribute containing int values and adds it to +// specified repeated field. +void ConvertIntElementsAttr(const mlir::DenseIntElementsAttr attr, + protobuf::RepeatedField* output) { + if (attr.isSplat()) { + output->Add((*attr.begin()).getSExtValue()); + } else { + for (const llvm::APInt val : attr) output->Add(val.getSExtValue()); + } +} + +void ConvertBfloat16ElementsAttr(const mlir::DenseFPElementsAttr attr, + protobuf::RepeatedField* output) { + // Bfloat16 is internally represented as `double` in MLIR. + if (attr.isSplat()) { + double v = attr.getSplatValue(); + bfloat16 bf16_val = static_cast(v); + output->Add(absl::bit_cast(bf16_val)); + } else { + for (auto v : attr.getValues()) { bfloat16 bf16_val = static_cast(v); - output_tensor->add_half_val(absl::bit_cast(bf16_val)); + output->Add(absl::bit_cast(bf16_val)); } } - - return Status::OK(); } -// Converts an MLIR elements attribute to a TensorFlow tensor proto -// with the int64_val field updated. 
-Status ConvertInt64ElementsAttr(const mlir::ElementsAttr attr, - TensorProto* output_tensor) { - if (auto elts = attr.dyn_cast()) { - if (elts.isSplat()) { - output_tensor->add_int64_val((*elts.begin()).getSExtValue()); - } else { - for (const auto& val : elts) - output_tensor->add_int64_val(val.getSExtValue()); - } - return Status::OK(); - } - return ConvertOpaqueElementsAttr(attr, output_tensor); -} - -// Converts an MLIR elements attribute to a TensorFlow tensor proto -// with bool_val field updated. -Status ConvertBoolElementsAttr(const mlir::ElementsAttr attr, - TensorProto* output_tensor) { - if (auto elts = attr.dyn_cast()) { - for (const auto& val : elts) { - output_tensor->add_bool_val(val.getBoolValue()); - } - return Status::OK(); - } - return ConvertOpaqueElementsAttr(attr, output_tensor); -} - -Status ConvertToTensorProto(const ElementsAttr attr, - TensorProto* output_tensor) { +Status ConvertToTensorProto(const ElementsAttr attr, TensorProto* output) { auto type = attr.getType(); auto shape = type.getShape(); DataType output_dtype; TF_RETURN_IF_ERROR(ConvertToDataType(type, &output_dtype)); - output_tensor->set_dtype(output_dtype); - ConvertToTensorShapeProto(shape, output_tensor->mutable_tensor_shape()); + output->set_dtype(output_dtype); + ConvertToTensorShapeProto(shape, output->mutable_tensor_shape()); + + if (attr.isa()) + return ConvertOpaqueElementsAttr(attr.cast(), output); + + auto dense_attr = attr.dyn_cast(); + if (!dense_attr) return errors::InvalidArgument("Unsupported elements attr"); switch (output_dtype) { case DT_FLOAT: - return ConvertFloatElementsAttr(attr, output_tensor); + ConvertElementsAttr(dense_attr, output->mutable_float_val()); + break; case DT_HALF: - // Handles both DenseFPElementsAttr and OpaqueElementsAttr. 
- return ConvertHalfElementsAttr(attr, output_tensor); + ConvertHalfElementsAttr(dense_attr.cast(), + output->mutable_half_val()); + break; case DT_DOUBLE: - return ConvertDoubleElementsAttr(attr, output_tensor); + ConvertElementsAttr(dense_attr, output->mutable_double_val()); + break; case DT_QUINT8: case DT_UINT8: case DT_INT8: @@ -315,17 +325,40 @@ Status ConvertToTensorProto(const ElementsAttr attr, case DT_UINT16: case DT_INT16: case DT_INT32: - return ConvertIntElementsAttr(attr, output_tensor); + ConvertIntElementsAttr(dense_attr.cast(), + output->mutable_int_val()); + break; + case DT_UINT32: + ConvertElementsAttr(dense_attr, output->mutable_uint32_val()); + break; + case DT_UINT64: + ConvertElementsAttr(dense_attr, output->mutable_uint64_val()); + break; case DT_INT64: - return ConvertInt64ElementsAttr(attr, output_tensor); + ConvertElementsAttr(dense_attr, output->mutable_int64_val()); + break; case DT_BOOL: - return ConvertBoolElementsAttr(attr, output_tensor); + ConvertElementsAttr(dense_attr, output->mutable_bool_val()); + break; case DT_BFLOAT16: - return ConvertBfloat16ElementsAttr(attr, output_tensor); + ConvertBfloat16ElementsAttr(dense_attr.cast(), + output->mutable_half_val()); + break; + case DT_STRING: + ConvertStringElementsAttr(dense_attr.cast(), + output->mutable_string_val()); + break; + case DT_COMPLEX64: + ConvertComplexElementsAttr(dense_attr, output->mutable_scomplex_val()); + break; + case DT_COMPLEX128: + ConvertComplexElementsAttr(dense_attr, output->mutable_dcomplex_val()); + break; default: - return ConvertOpaqueElementsAttr(attr.cast(), - output_tensor); + return errors::Unimplemented(absl::StrCat("Unimplemented data type ", + DataTypeString(output_dtype))); } + return Status::OK(); } Status ConvertToTensor(const mlir::ElementsAttr attr, Tensor* output_tensor) { diff --git a/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h b/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h index fdaf7ef0d45..e7cde4db936 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h +++ b/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h @@ -20,6 +20,7 @@ limitations under the License. #include "llvm/ADT/SmallVector.h" #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor.pb.h" #include "tensorflow/core/framework/tensor_shape.pb.h" @@ -44,6 +45,9 @@ void ConvertToTensorShapeProto(llvm::ArrayRef shape, // Converts an MLIR type to a TensorFlow tensor shape. PartialTensorShape ConvertTypeToTensorShape(const mlir::Type& type); +// Converts an MLIR shaped type to a TensorFlow shape attribute. +mlir::TF::ShapeAttr ConvertTypeToTensorShapeAttr(const mlir::Type& type); + // Converts an MLIR elements attribute to a TensorFlow tensor proto. Status ConvertToTensorProto(mlir::ElementsAttr attr, TensorProto* output_tensor); diff --git a/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor_test.cc index 5d039176bb0..bf96e3d1df4 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor_test.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor_test.cc @@ -15,10 +15,17 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h" +#include +#include + +#include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/xla/test.h" +#include "tensorflow/core/framework/tensor_testutil.h" +#include "tensorflow/core/framework/types.pb.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/stream_executor/lib/statusor.h" @@ -26,6 +33,14 @@ limitations under the License. namespace tensorflow { namespace { +static void RegisterDialects() { + static bool init_once = []() { + mlir::registerDialect(); + return true; + }(); + (void)init_once; +} + TEST(ConvertTypeToTensorTypeTest, UnrankedTensorType) { mlir::MLIRContext context; mlir::Builder b(&context); @@ -61,5 +76,99 @@ TEST(ConvertTypeToTensorTypeTest, ScalarTensorType) { EXPECT_TRUE(output_shape.IsIdenticalTo(TensorShape())); } +TEST(ConvertTypeToTensorTypeTest, ConvertStringTensor) { + RegisterDialects(); + mlir::MLIRContext context; + mlir::Builder b(&context); + + // Create the sample tensor to convert. + Tensor tensor(DT_STRING, TensorShape({1, 2, 2, 1})); + EXPECT_EQ(4, tensor.NumElements()); + auto Tt = tensor.flat(); + Tt.setValues({"one", "two", "three", "four"}); + auto value_or_status = ConvertTensor(tensor, &b); + ASSERT_TRUE(value_or_status.ok()); + auto attr = value_or_status.ValueOrDie(); + + EXPECT_TRUE(attr.isa()); + auto string_attr = attr.cast(); + auto string_values = string_attr.getRawStringData(); + ASSERT_EQ(string_values.size(), 4); + EXPECT_EQ(string_values[0], mlir::StringRef("one")); + EXPECT_EQ(string_values[1], mlir::StringRef("two")); + EXPECT_EQ(string_values[2], mlir::StringRef("three")); + EXPECT_EQ(string_values[3], mlir::StringRef("four")); +} + +class ConvertTensorTest : public ::testing::Test { + protected: + template + void VerifyConversion(std::initializer_list values, DataType dtype, + mlir::Type expected_ty) { + mlir::Builder b(expected_ty.getContext()); + Tensor tensor(dtype, TensorShape({static_cast(values.size())})); + tensor.flat().setValues(values); + + auto value_or = ConvertTensor(tensor, &b); + TF_ASSERT_OK(value_or.status()); + auto attr = value_or.ValueOrDie(); + + EXPECT_EQ(attr.getType().getElementType(), expected_ty); + + Tensor out; + TF_ASSERT_OK(ConvertToTensor(attr, &out)); + + test::ExpectTensorEqual(tensor, out); + } +}; + +TEST_F(ConvertTensorTest, Simple) { + RegisterDialects(); + + mlir::MLIRContext context; + ASSERT_NO_FATAL_FAILURE(VerifyConversion( + {Eigen::half(1.0)}, DT_HALF, mlir::FloatType::getF16(&context))); + ASSERT_NO_FATAL_FAILURE( + VerifyConversion({bfloat16(1.0), bfloat16(-1.0)}, DT_BFLOAT16, + mlir::FloatType::getBF16(&context))); + ASSERT_NO_FATAL_FAILURE(VerifyConversion( + {1.0, -1.0}, DT_FLOAT, mlir::FloatType::getF32(&context))); + ASSERT_NO_FATAL_FAILURE(VerifyConversion( + {1.0, -1.0}, DT_DOUBLE, mlir::FloatType::getF64(&context))); + + ASSERT_NO_FATAL_FAILURE(VerifyConversion( + {1, -1}, DT_INT8, mlir::IntegerType::get(8, &context))); + ASSERT_NO_FATAL_FAILURE(VerifyConversion( + {1, -1}, DT_INT16, mlir::IntegerType::get(16, &context))); + ASSERT_NO_FATAL_FAILURE(VerifyConversion( + {1, -1}, DT_INT32, mlir::IntegerType::get(32, &context))); + ASSERT_NO_FATAL_FAILURE(VerifyConversion( + {1, -1}, DT_INT64, 
mlir::IntegerType::get(64, &context))); + + ASSERT_NO_FATAL_FAILURE(VerifyConversion( + {1, 2}, DT_UINT8, + mlir::IntegerType::get( + 8, mlir::IntegerType::SignednessSemantics::Unsigned, &context))); + ASSERT_NO_FATAL_FAILURE(VerifyConversion( + {1, 2}, DT_UINT16, + mlir::IntegerType::get( + 16, mlir::IntegerType::SignednessSemantics::Unsigned, &context))); + ASSERT_NO_FATAL_FAILURE(VerifyConversion( + {1, 2}, DT_UINT32, + mlir::IntegerType::get( + 32, mlir::IntegerType::SignednessSemantics::Unsigned, &context))); + ASSERT_NO_FATAL_FAILURE(VerifyConversion( + {1, 2}, DT_UINT64, + mlir::IntegerType::get( + 64, mlir::IntegerType::SignednessSemantics::Unsigned, &context))); + + ASSERT_NO_FATAL_FAILURE(VerifyConversion>( + {{0.0, 1.0}, {1.0, 0.0}}, DT_COMPLEX64, + mlir::ComplexType::get(mlir::FloatType::getF32(&context)))); + ASSERT_NO_FATAL_FAILURE(VerifyConversion>( + {{0.0, 1.0}, {1.0, 0.0}}, DT_COMPLEX128, + mlir::ComplexType::get(mlir::FloatType::getF64(&context)))); +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/utils/dump_graph.cc b/tensorflow/compiler/mlir/tensorflow/utils/dump_graph.cc index ffcd1f71a50..c77107c8de7 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/dump_graph.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/dump_graph.cc @@ -24,8 +24,8 @@ limitations under the License. #include "llvm/ADT/Twine.h" #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/raw_ostream.h" -#include "mlir/Analysis/Verifier.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Verifier.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/translate/import_model.h" #include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" #include "tensorflow/core/platform/env.h" diff --git a/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc index 538c7968592..797687ea658 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc @@ -144,7 +144,7 @@ std::string DumpMlirOpToFile(llvm::StringRef name, mlir::Operation* op, Status result = CreateFileForDumping(name, &os, &filepath, dirname); if (!result.ok()) return result.error_message(); - op->print(*os, mlir::OpPrintingFlags().useLocalScope()); + op->print(*os, mlir::OpPrintingFlags().useLocalScope().printGenericOpForm()); LOG(INFO) << "Dumped MLIR operation '" << op->getName().getStringRef().str() << "' to '" << filepath << "'"; return filepath; diff --git a/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util_test.cc index e6908a15609..c0d109f7569 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util_test.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util_test.cc @@ -54,7 +54,8 @@ TEST(DumpMlirModuleTest, Valid) { std::string expected_txt_module; { llvm::raw_string_ostream os(expected_txt_module); - module_ref->getOperation()->print(os); + module_ref->getOperation()->print( + os, mlir::OpPrintingFlags().printGenericOpForm()); os.flush(); } diff --git a/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc b/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc index 075014319df..4877cbc4a44 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc @@ -34,6 +34,7 @@ limitations under the License. 
#include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/IR/TypeUtilities.h" // from @llvm-project #include "mlir/Support/DebugStringHelper.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" @@ -41,6 +42,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/utils/convert_type.h" #include "tensorflow/compiler/mlir/tensorflow/utils/mangling_util.h" #include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/core/common_runtime/graph_constructor.h" #include "tensorflow/core/framework/attr_value.pb.h" #include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/framework/graph_to_functiondef.h" @@ -52,12 +54,23 @@ limitations under the License. #include "tensorflow/core/framework/types.pb.h" #include "tensorflow/core/graph/algorithm.h" #include "tensorflow/core/graph/graph.h" -#include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/platform/protobuf.h" namespace tensorflow { namespace { +// static TensorFlow op prefix set. +std::set* GlobalOpPrefixes() { + static std::set* global_op_prefixes = [] { + std::set* result = new std::set; + result->insert("tf."); + result->insert("_tf."); + result->insert("tf_executor."); + return result; + }(); + return global_op_prefixes; +} + // Converts a location to the debug information for the node def. Status ConvertLocation(mlir::Location inst_loc, NodeDef::ExperimentalDebugInfo* debug_info) { @@ -96,6 +109,19 @@ Status ConvertAttribute(const mlir::ElementsAttr& attr, AttrValue* value) { return ConvertToTensorProto(attr, value->mutable_tensor()); } +Status ConvertAttribute(const mlir::TF::ShapeAttr& attr, AttrValue* value) { + auto* shape = value->mutable_shape(); + if (attr.hasRank()) { + for (auto dim_size : attr.getShape()) { + auto* dim = shape->add_dim(); + dim->set_size(dim_size); + } + } else { + shape->set_unknown_rank(true); + } + return Status::OK(); +} + Status ConvertAttribute(const mlir::StringAttr& attr, AttrValue* value) { absl::string_view attr_value(attr.getValue().data(), attr.getValue().size()); switch (mangling_util::GetMangledKind(attr_value)) { @@ -182,6 +208,10 @@ Status ConvertAttribute(const mlir::ArrayAttr& attr, AttrValue* value) { } TF_RETURN_IF_ERROR(ConvertAttribute(elt_type, &attr_val)); list->add_type(attr_val.type()); + } else if (auto attr = a.dyn_cast()) { + AttrValue attr_val; + TF_RETURN_IF_ERROR(ConvertAttribute(attr, &attr_val)); + *list->add_shape() = attr_val.shape(); } else { return errors::Unimplemented("Unhandled attribute!"); } @@ -250,8 +280,10 @@ StatusOr GetTensorFlowOpName(llvm::StringRef op_name) { // - ".sink" or ".Sink": only the NextIteration operation has this suffix. We // don't need to consider ".source"/".Source" because the nodes with this // suffix are skipped by the caller and will not be added to the graph. 
- if (!op_name.consume_front("_tf.") && !op_name.consume_front("tf.") && - !op_name.consume_front("tf_executor.")) { + auto prefixes = GlobalOpPrefixes(); + if (std::none_of(prefixes->begin(), prefixes->end(), [&](std::string prefix) { + return op_name.consume_front(prefix); + })) { return errors::FailedPrecondition("op node '", op_name.str(), "' was not a TF op!"); } @@ -367,7 +399,8 @@ Status ConvertAttributes( TF_RETURN_IF_ERROR( ConvertAttribute(attr.cast(), &value)); break; - case mlir::StandardAttributes::DenseElements: + case mlir::StandardAttributes::DenseIntOrFPElements: + case mlir::StandardAttributes::DenseStringElements: case mlir::StandardAttributes::OpaqueElements: TF_RETURN_IF_ERROR( ConvertAttribute(attr.cast(), &value)); @@ -380,6 +413,10 @@ Status ConvertAttributes( TF_RETURN_IF_ERROR( ConvertAttribute(attr.cast(), &value)); break; + case static_cast(mlir::TF::AttrKind::SHAPE): + TF_RETURN_IF_ERROR( + ConvertAttribute(attr.cast(), &value)); + break; // AffineMap kind is not implemented. case mlir::StandardAttributes::AffineMap: return errors::Unimplemented("AffineMap attribute (needed for '", @@ -483,4 +520,9 @@ bool IsLegacyCallInstruction(mlir::Operation* inst) { inst->getName().getStringRef().compare("_tf.LegacyCall") == 0; } +Status AddTensorFlowOpPrefix(std::string prefix) { + GlobalOpPrefixes()->insert(prefix); + return Status::OK(); +} + } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/utils/export_utils.h b/tensorflow/compiler/mlir/tensorflow/utils/export_utils.h index 32ed528bd0d..58fe39fa4e8 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/export_utils.h +++ b/tensorflow/compiler/mlir/tensorflow/utils/export_utils.h @@ -34,10 +34,17 @@ limitations under the License. #include "tensorflow/core/lib/core/status.h" #include "tensorflow/stream_executor/lib/statusor.h" +namespace mlir { +class ShapedType; +} // namespace mlir + namespace tensorflow { using stream_executor::port::StatusOr; +// Add custom op prefix for TensorFlow dialects. +Status AddTensorFlowOpPrefix(std::string); + // Maps an MLIR op name in the TensorFlow dialect or the TensorFlow control // dialect back into a TensorFlow valid op name. StatusOr GetTensorFlowOpName(llvm::StringRef); diff --git a/tensorflow/compiler/mlir/tensorflow/utils/import_utils.cc b/tensorflow/compiler/mlir/tensorflow/utils/import_utils.cc index 47c5d27767d..3d16352f78e 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/import_utils.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/import_utils.cc @@ -31,12 +31,17 @@ inline llvm::StringRef StringViewToRef(absl::string_view view) { } } // namespace -Status LoadProtoFromBuffer(absl::string_view input, - protobuf::MessageLite* proto) { +Status LoadProtoFromBuffer(absl::string_view input, protobuf::Message* proto) { // Attempt to parse as text. if (ParseTextProto(input, "", proto).ok()) return Status::OK(); // Else attempt to parse as binary. + return LoadProtoFromBuffer(input, static_cast(proto)); +} + +Status LoadProtoFromBuffer(absl::string_view input, + protobuf::MessageLite* proto) { + // Attempt to parse as binary. 
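The AddTensorFlowOpPrefix hook added to export_utils above lets an out-of-tree dialect reuse the same export path; a small sketch with a made-up "my_dialect." prefix:

    #include "tensorflow/compiler/mlir/tensorflow/utils/export_utils.h"

    // Sketch: register a hypothetical dialect prefix once at startup so that an
    // op named "my_dialect.Conv2D" maps back to the TF op name "Conv2D" when
    // GetTensorFlowOpName strips registered prefixes during export.
    tensorflow::Status RegisterMyDialectForExport() {
      return tensorflow::AddTensorFlowOpPrefix("my_dialect.");
    }
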
protobuf::io::ArrayInputStream binary_stream(input.data(), input.size()); if (proto->ParseFromZeroCopyStream(&binary_stream)) return Status::OK(); @@ -44,8 +49,8 @@ Status LoadProtoFromBuffer(absl::string_view input, return errors::InvalidArgument("Could not parse input proto"); } -Status LoadProtoFromFile(absl::string_view input_filename, - protobuf::MessageLite* proto) { +template +Status LoadProtoFromFileImpl(absl::string_view input_filename, T* proto) { const auto file_or_err = llvm::MemoryBuffer::getFileOrSTDIN(StringViewToRef(input_filename)); if (std::error_code error = file_or_err.getError()) { @@ -60,4 +65,14 @@ Status LoadProtoFromFile(absl::string_view input_filename, return LoadProtoFromBuffer(content, proto); } +Status LoadProtoFromFile(absl::string_view input_filename, + protobuf::Message* proto) { + return LoadProtoFromFileImpl(input_filename, proto); +} + +Status LoadProtoFromFile(absl::string_view input_filename, + protobuf::MessageLite* proto) { + return LoadProtoFromFileImpl(input_filename, proto); +} + } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/utils/import_utils.h b/tensorflow/compiler/mlir/tensorflow/utils/import_utils.h index 56cd188f393..ad1531dd449 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/import_utils.h +++ b/tensorflow/compiler/mlir/tensorflow/utils/import_utils.h @@ -24,13 +24,20 @@ namespace tensorflow { // Reads text (.pbtext) or binary (.pb) format of a proto message from the given // buffer. Returns error status of the file is not found or malformed proto. +// Note that text protos can only be parsed when full protobuf::Message protos +// are used, and will fail for protobuf::MessageLite protos. +Status LoadProtoFromBuffer(absl::string_view input, protobuf::Message* proto); Status LoadProtoFromBuffer(absl::string_view input, - tensorflow::protobuf::MessageLite* proto); + protobuf::MessageLite* proto); // Reads text (.pbtext) or binary (.pb) format of a proto message from the given // file path. Returns error status of the file is not found or malformed proto. +// Note that text protos can only be parsed when full protobuf::Message protos +// are used, and will fail for protobuf::MessageLite protos. Status LoadProtoFromFile(absl::string_view input_filename, - tensorflow::protobuf::MessageLite* proto); + protobuf::Message* proto); +Status LoadProtoFromFile(absl::string_view input_filename, + protobuf::MessageLite* proto); } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/utils/parse_text_proto.cc b/tensorflow/compiler/mlir/tensorflow/utils/parse_text_proto.cc index b616d34fdd8..1bf615de8c4 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/parse_text_proto.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/parse_text_proto.cc @@ -24,7 +24,6 @@ limitations under the License. namespace tensorflow { -#ifndef TENSORFLOW_LITE_PROTOS namespace { // Error collector that simply ignores errors reported. 
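As the header comments above spell out, text-format parsing is now only available for full protobuf::Message types; a tiny sketch, with an illustrative file path:

    #include "tensorflow/compiler/mlir/tensorflow/utils/import_utils.h"
    #include "tensorflow/core/framework/graph.pb.h"

    // Sketch: GraphDef is a full protobuf::Message on non-mobile builds, so
    // either a text .pbtxt or a binary .pb file parses here; a MessageLite-only
    // proto would resolve to the binary-only overload instead.
    tensorflow::Status LoadGraphDef(tensorflow::GraphDef* graph_def) {
      return tensorflow::LoadProtoFromFile("/tmp/graph.pbtxt", graph_def);
    }
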
class NoOpErrorCollector : public protobuf::io::ErrorCollector { @@ -32,7 +31,6 @@ class NoOpErrorCollector : public protobuf::io::ErrorCollector { void AddError(int line, int column, const std::string& message) override {} }; } // namespace -#endif // TENSORFLOW_LITE_PROTOS Status ConsumePrefix(absl::string_view str, absl::string_view prefix, absl::string_view* output) { @@ -45,8 +43,7 @@ Status ConsumePrefix(absl::string_view str, absl::string_view prefix, Status ParseTextProto(absl::string_view text_proto, absl::string_view prefix_to_strip, - protobuf::MessageLite* parsed_proto) { -#ifndef TENSORFLOW_LITE_PROTOS + protobuf::Message* parsed_proto) { protobuf::TextFormat::Parser parser; // Don't produce errors when attempting to parse text format as it would fail // when the input is actually a binary file. @@ -60,15 +57,11 @@ Status ParseTextProto(absl::string_view text_proto, } protobuf::io::ArrayInputStream input_stream(text_proto_without_prefix.data(), text_proto_without_prefix.size()); - if (parser.Parse(&input_stream, - tensorflow::down_cast(parsed_proto))) { + if (parser.Parse(&input_stream, parsed_proto)) { return Status::OK(); } parsed_proto->Clear(); return errors::InvalidArgument("Could not parse text proto: ", text_proto); -#else - return errors::Unavailable("Cannot parse text protos on mobile."); -#endif // TENSORFLOW_LITE_PROTOS } } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/utils/parse_text_proto.h b/tensorflow/compiler/mlir/tensorflow/utils/parse_text_proto.h index 5646f1378af..c1f1e3b111d 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/parse_text_proto.h +++ b/tensorflow/compiler/mlir/tensorflow/utils/parse_text_proto.h @@ -32,7 +32,12 @@ Status ConsumePrefix(absl::string_view str, absl::string_view prefix, // proto. Status ParseTextProto(absl::string_view text_proto, absl::string_view prefix_to_strip, - protobuf::MessageLite* parsed_proto); + protobuf::Message* parsed_proto); +inline Status ParseTextProto(absl::string_view /* text_proto */, + absl::string_view /* prefix_to_strip */, + protobuf::MessageLite* /* parsed_proto */) { + return errors::Unavailable("Cannot parse text protos on mobile."); +} } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.cc index 6cf2781e48d..282b7ad3139 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.cc @@ -26,9 +26,9 @@ limitations under the License. #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringRef.h" #include "llvm/ADT/iterator_range.h" #include "llvm/Support/FormatVariadic.h" +#include "mlir/IR/Attributes.h" // from @llvm-project #include "tensorflow/compiler/xla/array4d.h" #include "tensorflow/compiler/xla/service/computation_placer.h" #include "tensorflow/compiler/xla/xla_data.pb.h" @@ -39,6 +39,12 @@ limitations under the License. #include "tensorflow/stream_executor/lib/statusor.h" namespace tensorflow { + +const char* const kTPUReplicatedHost = "TPU_REPLICATED_HOST"; +const char* const kNumCoresPerReplicaAttr = "num_cores_per_replica"; +const char* const kTopologyAttr = "topology"; +const char* const kDeviceAssignmentAttr = "device_assignment"; + // Device coordinates are defined as (x, y, z, core), thus resulting in a rank 4 // topology. 
constexpr int kTPUTopologyRank = 4; @@ -46,8 +52,8 @@ constexpr int kTPUTopologyRank = 4; constexpr char kDeviceTPUSystem[] = "TPU_SYSTEM"; constexpr char kDeviceTPU[] = "TPU"; constexpr char kTPUReplicatedCore[] = "TPU_REPLICATED_CORE"; -constexpr char kTopologyAttr[] = "topology"; -constexpr char kDeviceAssignmentAttr[] = "device_assignment"; +constexpr char kBadIntArrayElementMsg[] = + "bad '{0}' attribute at index {1}, not an int"; using Device = DeviceNameUtils::ParsedName; using Devices = llvm::ArrayRef; @@ -164,12 +170,19 @@ std::string GetTPUCompilationDevice(Device system_device) { return DeviceNameUtils::ParsedNameToString(system_device); } +// Finds the host CPU device for a given TPU device. +std::string GetCPUHostDeviceForTPUDevice(Device tpu_device) { + tpu_device.type = DEVICE_CPU; + tpu_device.id = 0; + return DeviceNameUtils::ParsedNameToString(tpu_device); +} + // Determines execution devices when topology and device assignment are not // defined. This is a special case where a single core computation is replicated // to every core in the mesh. TPU devices are simply added to // `execution_devices` of one replica. `num_replicas` must be 1 or the total // number of TPU devices available, and `num_cores_per_replica` must be 1. -StatusOr GetFullMeshTPUExecutionDeviceAssignment( +StatusOr GetFullMeshTPUExecutionDeviceAssignment( int num_replicas, int num_cores_per_replica, llvm::ArrayRef> tpu_devices) { const int num_tasks = tpu_devices.size(); @@ -185,17 +198,18 @@ StatusOr GetFullMeshTPUExecutionDeviceAssignment( "'num_cores_per_replica' must be equal to 1, got ", num_cores_per_replica); - ExecutionDevices execution_devices; - execution_devices.reserve(num_replicas); + TPUDevicesAndHosts devices_and_hosts; + devices_and_hosts.reserve(num_replicas); for (int i = 0; i < num_replicas; ++i) { const int task = i / num_tpus_per_task; const int device = i % num_tpus_per_task; - execution_devices.push_back( - {tensorflow::DeviceNameUtils::ParsedNameToString( - tpu_devices[task][device])}); + const auto& tpu_device = tpu_devices[task][device]; + devices_and_hosts.push_back({TPUDeviceAndHost( + /*device=*/tensorflow::DeviceNameUtils::ParsedNameToString(tpu_device), + /*host=*/GetCPUHostDeviceForTPUDevice(tpu_device))}); } - return execution_devices; + return devices_and_hosts; } // Helper struct for keeping track of task and device for an associated TPU @@ -326,7 +340,7 @@ StatusOr> ParseTopologyAttr( // - number of device coordinates (in tuple 3) match number 'num_replicas' * // 'num_cores_per_replica' // - a TPU device associated with each device coordinate -StatusOr> +StatusOr> GetGeneralTPUExecutionDeviceAssignment( int num_replicas, int num_cores_per_replica, llvm::ArrayRef> tpu_devices, @@ -361,9 +375,9 @@ GetGeneralTPUExecutionDeviceAssignment( std::vector used_device_ids( location_to_id(bound_x - 1, bound_y - 1, bound_z - 1, bound_core - 1), false); - ExecutionDevices execution_devices( - num_replicas, - llvm::SmallVector(num_cores_per_replica, "")); + TPUDevicesAndHosts devices_and_hosts( + num_replicas, llvm::SmallVector( + num_cores_per_replica, TPUDeviceAndHost())); xla::DeviceAssignment device_assignment(num_replicas, num_cores_per_replica); int pos = 0; for (int replica = 0; replica < num_replicas; ++replica) { @@ -393,20 +407,43 @@ GetGeneralTPUExecutionDeviceAssignment( used_device_ids[device_id] = true; device_assignment(replica, logical_core) = device_id; - execution_devices[replica][logical_core] = - DeviceNameUtils::ParsedNameToString(tpu_devices[task][device]); + 
auto& device_and_host = devices_and_hosts[replica][logical_core]; + const auto& tpu_device = tpu_devices[task][device]; + device_and_host.device = DeviceNameUtils::ParsedNameToString(tpu_device); + device_and_host.host = GetCPUHostDeviceForTPUDevice(tpu_device); } } xla::DeviceAssignmentProto device_assignment_proto; TF_RETURN_IF_ERROR(device_assignment.Serialize(&device_assignment_proto)); - return std::pair( - std::move(execution_devices), std::move(device_assignment_proto)); + return std::pair( + std::move(devices_and_hosts), std::move(device_assignment_proto)); } } // anonymous namespace +StatusOr> GetDeviceCoordinates( + mlir::ArrayAttr device_assignment_attr) { + llvm::SmallVector device_coordinates; + device_coordinates.reserve(device_assignment_attr.size()); + + for (auto device_coordinate_and_idx : + llvm::enumerate(device_assignment_attr)) { + auto device_coordinate = + device_coordinate_and_idx.value().dyn_cast(); + if (!device_coordinate) + return errors::InvalidArgument( + llvm::formatv(kBadIntArrayElementMsg, kDeviceAssignmentAttr, + device_coordinate_and_idx.index()) + .str()); + + device_coordinates.push_back(device_coordinate.getInt()); + } + + return device_coordinates; +} + StatusOr GetTPUCompilationAndExecutionDevices( Devices devices, int num_replicas, int num_cores_per_replica, llvm::StringRef topology_attr, diff --git a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h index dd296a13f4b..6bb541ab683 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h +++ b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h @@ -22,6 +22,7 @@ limitations under the License. #include "llvm/ADT/Optional.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" +#include "mlir/IR/Attributes.h" // from @llvm-project #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/util/device_name_utils.h" @@ -30,32 +31,52 @@ limitations under the License. namespace tensorflow { using stream_executor::port::StatusOr; -// TPU devices to be used for execution (e.g. devices for TPUExecute ops). They -// are ordered by `num_replicas` followed by `num_cores_per_replica`. -using ExecutionDevices = - llvm::SmallVector, 8>; +extern const char* const kTPUReplicatedHost; +extern const char* const kNumCoresPerReplicaAttr; +extern const char* const kTopologyAttr; +extern const char* const kDeviceAssignmentAttr; -// TPU compilation device, execution devices, and optionally execution device -// IDs. Execution device IDs are populated if `topology` and `device_assignment` -// are provided. +// A TPU device for execution alongside its associated host CPU device. +struct TPUDeviceAndHost { + TPUDeviceAndHost() {} + TPUDeviceAndHost(llvm::StringRef device, llvm::StringRef host) + : device(device), host(host) {} + + std::string device; + std::string host; +}; + +// TPU devices to be used for execution (e.g. devices for TPUExecute ops) and +// their associated host CPU devices (for outside compilation). They are ordered +// by `num_replicas` followed by `num_cores_per_replica`. +using TPUDevicesAndHosts = + llvm::SmallVector, 8>; + +// TPU compilation device, execution and associated host devices, and optionally +// execution device IDs. Execution device IDs are populated if `topology` and +// `device_assignment` are provided. 
struct TPUDeviceAssignment { TPUDeviceAssignment(llvm::StringRef compilation_device, - ExecutionDevices&& execution_devices) + TPUDevicesAndHosts&& tpu_devices) : compilation_device(compilation_device), - execution_devices(std::move(execution_devices)) {} + tpu_devices(std::move(tpu_devices)) {} TPUDeviceAssignment(llvm::StringRef compilation_device, - ExecutionDevices&& execution_devices, + TPUDevicesAndHosts&& tpu_devices, xla::DeviceAssignmentProto&& xla_device_assignment) : compilation_device(compilation_device), - execution_devices(std::move(execution_devices)), + tpu_devices(std::move(tpu_devices)), xla_device_assignment(std::move(xla_device_assignment)) {} std::string compilation_device; - ExecutionDevices execution_devices; + TPUDevicesAndHosts tpu_devices; llvm::Optional xla_device_assignment; }; +// Extracts device coordinates from a device assignment attribute on an op. +StatusOr> GetDeviceCoordinates( + mlir::ArrayAttr device_assignment_attr); + // Finds the TPU compilation device and execution devices from `devices` for a // TPU computation subgraph. Compilation device is determined from looking up // all TPU_SYSTEM:0 devices and choosing the CPU device associated to the first diff --git a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc index 87319f2adeb..a70e93a0195 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc @@ -19,6 +19,8 @@ limitations under the License. #include #include "llvm/Support/FormatVariadic.h" +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/protobuf/tpu/topology.pb.h" @@ -323,30 +325,46 @@ TEST(TPURewriteDeviceUtilTest, ValidFullMeshDeviceAssignment) { TF_ASSERT_OK(status_or.status()); - auto& tpu_device_assignment = status_or.ValueOrDie(); + const auto& tpu_device_assignment = status_or.ValueOrDie(); EXPECT_EQ(tpu_device_assignment.compilation_device, "/job:worker/replica:0/task:0/device:CPU:0"); - auto& execution_devices = tpu_device_assignment.execution_devices; - ASSERT_EQ(execution_devices.size(), 8); - for (const auto& replica_execution_device : execution_devices) - ASSERT_EQ(replica_execution_device.size(), 1); + const auto& tpu_devices = tpu_device_assignment.tpu_devices; + ASSERT_EQ(tpu_devices.size(), 8); + for (const auto& replica_tpu_devices : tpu_devices) + ASSERT_EQ(replica_tpu_devices.size(), 1); - EXPECT_EQ(execution_devices[0][0], + EXPECT_EQ(tpu_devices[0][0].device, "/job:worker/replica:0/task:0/device:TPU:0"); - EXPECT_EQ(execution_devices[1][0], + EXPECT_EQ(tpu_devices[0][0].host, + "/job:worker/replica:0/task:0/device:CPU:0"); + EXPECT_EQ(tpu_devices[1][0].device, "/job:worker/replica:0/task:0/device:TPU:1"); - EXPECT_EQ(execution_devices[2][0], + EXPECT_EQ(tpu_devices[1][0].host, + "/job:worker/replica:0/task:0/device:CPU:0"); + EXPECT_EQ(tpu_devices[2][0].device, "/job:worker/replica:0/task:0/device:TPU:2"); - EXPECT_EQ(execution_devices[3][0], + EXPECT_EQ(tpu_devices[2][0].host, + "/job:worker/replica:0/task:0/device:CPU:0"); + EXPECT_EQ(tpu_devices[3][0].device, "/job:worker/replica:0/task:0/device:TPU:3"); - EXPECT_EQ(execution_devices[4][0], + EXPECT_EQ(tpu_devices[3][0].host, + "/job:worker/replica:0/task:0/device:CPU:0"); + 
EXPECT_EQ(tpu_devices[4][0].device, "/job:worker/replica:0/task:1/device:TPU:0"); - EXPECT_EQ(execution_devices[5][0], + EXPECT_EQ(tpu_devices[4][0].host, + "/job:worker/replica:0/task:1/device:CPU:0"); + EXPECT_EQ(tpu_devices[5][0].device, "/job:worker/replica:0/task:1/device:TPU:1"); - EXPECT_EQ(execution_devices[6][0], + EXPECT_EQ(tpu_devices[5][0].host, + "/job:worker/replica:0/task:1/device:CPU:0"); + EXPECT_EQ(tpu_devices[6][0].device, "/job:worker/replica:0/task:1/device:TPU:2"); - EXPECT_EQ(execution_devices[7][0], + EXPECT_EQ(tpu_devices[6][0].host, + "/job:worker/replica:0/task:1/device:CPU:0"); + EXPECT_EQ(tpu_devices[7][0].device, "/job:worker/replica:0/task:1/device:TPU:3"); + EXPECT_EQ(tpu_devices[7][0].host, + "/job:worker/replica:0/task:1/device:CPU:0"); EXPECT_FALSE(tpu_device_assignment.xla_device_assignment.hasValue()); } @@ -410,30 +428,46 @@ TEST(TPURewriteDeviceUtilTest, ValidGeneralDeviceAssignmentMesh2x2x2) { TF_ASSERT_OK(status_or.status()); - auto& tpu_device_assignment = status_or.ValueOrDie(); + const auto& tpu_device_assignment = status_or.ValueOrDie(); EXPECT_EQ(tpu_device_assignment.compilation_device, "/job:worker/replica:0/task:0/device:CPU:0"); - auto& execution_devices = tpu_device_assignment.execution_devices; - ASSERT_EQ(execution_devices.size(), 4); - for (const auto& replica_execution_device : execution_devices) - ASSERT_EQ(replica_execution_device.size(), 2); + const auto& tpu_devices = tpu_device_assignment.tpu_devices; + ASSERT_EQ(tpu_devices.size(), 4); + for (const auto& replica_tpu_devices : tpu_devices) + ASSERT_EQ(replica_tpu_devices.size(), 2); - EXPECT_EQ(execution_devices[0][0], + EXPECT_EQ(tpu_devices[0][0].device, "/job:worker/replica:0/task:0/device:TPU:0"); - EXPECT_EQ(execution_devices[0][1], + EXPECT_EQ(tpu_devices[0][0].host, + "/job:worker/replica:0/task:0/device:CPU:0"); + EXPECT_EQ(tpu_devices[0][1].device, "/job:worker/replica:0/task:1/device:TPU:3"); - EXPECT_EQ(execution_devices[1][0], + EXPECT_EQ(tpu_devices[0][1].host, + "/job:worker/replica:0/task:1/device:CPU:0"); + EXPECT_EQ(tpu_devices[1][0].device, "/job:worker/replica:0/task:0/device:TPU:1"); - EXPECT_EQ(execution_devices[1][1], + EXPECT_EQ(tpu_devices[1][0].host, + "/job:worker/replica:0/task:0/device:CPU:0"); + EXPECT_EQ(tpu_devices[1][1].device, "/job:worker/replica:0/task:1/device:TPU:2"); - EXPECT_EQ(execution_devices[2][0], + EXPECT_EQ(tpu_devices[1][1].host, + "/job:worker/replica:0/task:1/device:CPU:0"); + EXPECT_EQ(tpu_devices[2][0].device, "/job:worker/replica:0/task:0/device:TPU:3"); - EXPECT_EQ(execution_devices[2][1], + EXPECT_EQ(tpu_devices[2][0].host, + "/job:worker/replica:0/task:0/device:CPU:0"); + EXPECT_EQ(tpu_devices[2][1].device, "/job:worker/replica:0/task:1/device:TPU:0"); - EXPECT_EQ(execution_devices[3][0], + EXPECT_EQ(tpu_devices[2][1].host, + "/job:worker/replica:0/task:1/device:CPU:0"); + EXPECT_EQ(tpu_devices[3][0].device, "/job:worker/replica:0/task:0/device:TPU:2"); - EXPECT_EQ(execution_devices[3][1], + EXPECT_EQ(tpu_devices[3][0].host, + "/job:worker/replica:0/task:0/device:CPU:0"); + EXPECT_EQ(tpu_devices[3][1].device, "/job:worker/replica:0/task:1/device:TPU:1"); + EXPECT_EQ(tpu_devices[3][1].host, + "/job:worker/replica:0/task:1/device:CPU:0"); auto& xla_device_assignment = tpu_device_assignment.xla_device_assignment; ASSERT_TRUE(xla_device_assignment.hasValue()); @@ -511,23 +545,35 @@ TEST(TPURewriteDeviceUtilTest, ValidGeneralDeviceAssignmentMesh1x2x1x3) { EXPECT_EQ(tpu_device_assignment.compilation_device, 
"/job:worker/replica:0/task:0/device:CPU:0"); - auto& execution_devices = tpu_device_assignment.execution_devices; - ASSERT_EQ(execution_devices.size(), 2); - for (const auto& replica_execution_device : execution_devices) - ASSERT_EQ(replica_execution_device.size(), 3); + auto& tpu_devices = tpu_device_assignment.tpu_devices; + ASSERT_EQ(tpu_devices.size(), 2); + for (const auto& replica_tpu_devices : tpu_devices) + ASSERT_EQ(replica_tpu_devices.size(), 3); - EXPECT_EQ(execution_devices[0][0], + EXPECT_EQ(tpu_devices[0][0].device, "/job:worker/replica:0/task:1/device:TPU:1"); - EXPECT_EQ(execution_devices[0][1], + EXPECT_EQ(tpu_devices[0][0].host, + "/job:worker/replica:0/task:1/device:CPU:0"); + EXPECT_EQ(tpu_devices[0][1].device, "/job:worker/replica:0/task:1/device:TPU:0"); - EXPECT_EQ(execution_devices[0][2], + EXPECT_EQ(tpu_devices[0][1].host, + "/job:worker/replica:0/task:1/device:CPU:0"); + EXPECT_EQ(tpu_devices[0][2].device, "/job:worker/replica:0/task:2/device:TPU:0"); - EXPECT_EQ(execution_devices[1][0], + EXPECT_EQ(tpu_devices[0][2].host, + "/job:worker/replica:0/task:2/device:CPU:0"); + EXPECT_EQ(tpu_devices[1][0].device, "/job:worker/replica:0/task:2/device:TPU:1"); - EXPECT_EQ(execution_devices[1][1], + EXPECT_EQ(tpu_devices[1][0].host, + "/job:worker/replica:0/task:2/device:CPU:0"); + EXPECT_EQ(tpu_devices[1][1].device, "/job:worker/replica:0/task:0/device:TPU:0"); - EXPECT_EQ(execution_devices[1][2], + EXPECT_EQ(tpu_devices[1][1].host, + "/job:worker/replica:0/task:0/device:CPU:0"); + EXPECT_EQ(tpu_devices[1][2].device, "/job:worker/replica:0/task:0/device:TPU:1"); + EXPECT_EQ(tpu_devices[1][2].host, + "/job:worker/replica:0/task:0/device:CPU:0"); auto& xla_device_assignment = tpu_device_assignment.xla_device_assignment; ASSERT_TRUE(xla_device_assignment.hasValue()); @@ -552,5 +598,29 @@ TEST(TPURewriteDeviceUtilTest, ValidGeneralDeviceAssignmentMesh1x2x1x3) { EXPECT_EQ(computation_device_2.replica_device_ids(1), 3); } +TEST(TPURewriteDeviceUtilTest, TestGetDeviceCoordinates) { + mlir::MLIRContext context; + mlir::Builder builder(&context); + auto device_assignment_attr = builder.getI64ArrayAttr({1, 2, 3}); + auto status_or_device_coodinates = + GetDeviceCoordinates(device_assignment_attr); + ASSERT_TRUE(status_or_device_coodinates.ok()); + auto device_coordinates = status_or_device_coodinates.ConsumeValueOrDie(); + EXPECT_EQ(device_coordinates[0], 1); + EXPECT_EQ(device_coordinates[1], 2); + EXPECT_EQ(device_coordinates[2], 3); +} + +TEST(TPURewriteDeviceUtilTest, TestInvalidAttrForDeviceAssignmentDisallowed) { + mlir::MLIRContext context; + mlir::Builder builder(&context); + auto device_assignment_attr = builder.getF32ArrayAttr({1.0, 2.0, 3.0}); + auto status_or_device_coodinates = + GetDeviceCoordinates(device_assignment_attr); + ASSERT_TRUE(!status_or_device_coodinates.ok()); + EXPECT_EQ(status_or_device_coodinates.status().error_message(), + "bad 'device_assignment' attribute at index 0, not an int"); +} + } // anonymous namespace } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.cc index 1853183c3b4..083a5abf840 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.cc @@ -37,18 +37,9 @@ limitations under the License. 
namespace tensorflow { -const char* const kXlaShardingAttrName = "_XlaSharding"; const char* const kInputShardingAttr = "input_sharding_configuration"; const char* const kOutputShardingAttr = "output_sharding_configuration"; -llvm::Optional ParseShardingAttribute( - mlir::Operation* operation) { - const auto& sharding_attr = - operation->getAttrOfType(kXlaShardingAttrName); - if (!sharding_attr) return llvm::Optional(); - return sharding_attr.getValue(); -} - namespace { constexpr char kNumSplitAttr[] = "num_split"; @@ -211,23 +202,23 @@ mlir::LogicalResult HandleTileShardedInputs( } // namespace mlir::LogicalResult ExtractInputsForLogicalDevices( - const int num_cores_per_replica, mlir::tf_device::LaunchFuncOp launch_func, - mlir::OpBuilder* builder, + const int num_cores_per_replica, + mlir::tf_device::ClusterFuncOp cluster_func, mlir::OpBuilder* builder, llvm::SmallVectorImpl>* input_list) { // Initialize the input list for each logical devices. input_list->reserve(num_cores_per_replica); for (int i = 0; i < num_cores_per_replica; ++i) input_list->emplace_back(llvm::SmallVector()); - llvm::SmallVector launch_func_inputs( - launch_func.getOperands()); + llvm::SmallVector cluster_func_inputs( + cluster_func.getOperands()); auto sharding_attrs = - launch_func.getOperation()->getAttrOfType( + cluster_func.getOperation()->getAttrOfType( kInputShardingAttr); // If sharding attribute does not exist, then all inputs are placed on 0th // logical core by default. if (!sharding_attrs) { - (*input_list)[0] = launch_func_inputs; + (*input_list)[0] = cluster_func_inputs; return mlir::success(); } @@ -238,7 +229,7 @@ mlir::LogicalResult ExtractInputsForLogicalDevices( for (const auto& sharding_attr_and_index : llvm::enumerate(sharding_attrs)) { const auto& sharding_attr = sharding_attr_and_index.value(); const auto input_index = sharding_attr_and_index.index(); - const auto& input_value = launch_func_inputs[input_index]; + const auto& input_value = cluster_func_inputs[input_index]; xla::OpSharding sharding; sharding.ParseFromString( @@ -248,11 +239,11 @@ mlir::LogicalResult ExtractInputsForLogicalDevices( if (input_sharding_type == xla::OpSharding::OTHER) { llvm::SmallVector tiled_inputs; auto result = HandleTileShardedInputs( - launch_func.getLoc(), sharding, input_value, builder, &tiled_inputs); + cluster_func.getLoc(), sharding, input_value, builder, &tiled_inputs); if (mlir::failed(result)) return mlir::failure(); if (tiled_inputs.size() != num_cores_per_replica) - launch_func.emitError(llvm::formatv( + cluster_func.emitError(llvm::formatv( "incorrect {0}-th tiled input sharding received. 
" "Product of tile sharding splits({1}) must be equal to " "number of logical devices : {2}", @@ -274,36 +265,37 @@ mlir::LogicalResult ExtractInputsForLogicalDevices( } mlir::LogicalResult ParseAndValidateOutputSharding( - const int num_cores_per_replica, mlir::tf_device::LaunchFuncOp launch_func, + const int num_cores_per_replica, + mlir::tf_device::ClusterFuncOp cluster_func, mlir::SmallVector* output_sharding_list) { - output_sharding_list->reserve(launch_func.getNumResults()); + output_sharding_list->reserve(cluster_func.getNumResults()); const auto output_sharding_attrs = - launch_func.getOperation()->getAttrOfType( + cluster_func.getOperation()->getAttrOfType( kOutputShardingAttr); if (!output_sharding_attrs) - return launch_func.emitError( - "output_sharding_configuration missing from launch func"); + return cluster_func.emitError( + "output_sharding_configuration missing from cluster func"); - if (output_sharding_attrs.size() != launch_func.getNumResults()) - return launch_func.emitError("incorrect number of output sharding"); + if (output_sharding_attrs.size() != cluster_func.getNumResults()) + return cluster_func.emitError("incorrect number of output sharding"); for (auto output_sharding_and_index : llvm::enumerate(output_sharding_attrs)) { const auto& output_sharding = output_sharding_and_index.value(); const int sharding_index = output_sharding_and_index.index(); if (!output_sharding.isa()) - return launch_func.emitError(llvm::formatv( + return cluster_func.emitError(llvm::formatv( "non-string output sharding at index {0}", sharding_index)); xla::OpSharding sharding; if (!sharding.ParseFromString( output_sharding.cast().getValue().str())) - return launch_func.emitError("incorrect sharding format for outputs"); + return cluster_func.emitError("incorrect sharding format for outputs"); if (sharding.type() == xla::OpSharding::OTHER && sharding.tile_assignment_devices_size() != num_cores_per_replica) - return launch_func.emitError(llvm::formatv( + return cluster_func.emitError(llvm::formatv( "incorrect sharding format for outputs. Number of " "tiled outputs({0}) must match the number of logical " "devices({1})", @@ -312,7 +304,7 @@ mlir::LogicalResult ParseAndValidateOutputSharding( if (sharding.type() == xla::OpSharding::MAXIMAL && ((sharding.tile_assignment_devices(0) >= num_cores_per_replica) || (sharding.tile_assignment_devices(0) < 0))) - return launch_func.emitError(llvm::formatv( + return cluster_func.emitError(llvm::formatv( "incorrect sharding format for outputs. Maximal " "sharding should be assigned to device id in range " "[0, {0}). Currently assigned to {1}", @@ -332,15 +324,15 @@ bool IsAssignedToLogicalDevice(const int core_id, } // Returns the index of the return value of region in -// `tf_device.parallel_execute` that represents launch func output at -// index |launch_func_output_index|. Regions of parallel_execute may +// `tf_device.parallel_execute` that represents cluster func output at +// index |cluster_func_output_index|. Regions of parallel_execute may // have different return values depending on outside sharding // configuration. 
-int MapLaunchOutputIndexWithRegionOutputIndex( +int MapClusterOutputIndexWithRegionOutputIndex( llvm::ArrayRef output_sharding_config, const int core_id, - const int launch_func_output_index) { + const int cluster_func_output_index) { int region_output_index = 0; - for (int output_index = 0; output_index < launch_func_output_index; + for (int output_index = 0; output_index < cluster_func_output_index; ++output_index) { const auto& sharding = output_sharding_config[output_index]; if (sharding.type() != xla::OpSharding::MAXIMAL || @@ -353,8 +345,8 @@ int MapLaunchOutputIndexWithRegionOutputIndex( // Merges outputs from TPU computation for tile-sharded outputs. mlir::LogicalResult HandleTileShardedOutputs( - const int launch_func_output_index, const xla::OpSharding& sharding, - const mlir::Location& location, mlir::Value launch_func_output, + const int cluster_func_output_index, const xla::OpSharding& sharding, + const mlir::Location& location, mlir::Value cluster_func_output, mlir::tf_device::ParallelExecuteOp parallel_execute, mlir::OpBuilder* builder) { // Inject concat ops after parallel_execute to merge outputs from @@ -366,8 +358,8 @@ mlir::LogicalResult HandleTileShardedOutputs( llvm::SmallVector outputs_to_merge; outputs_to_merge.reserve(sharding.tile_assignment_devices_size()); for (const auto logical_device_id : sharding.tile_assignment_devices()) { - const int region_output_index = MapLaunchOutputIndexWithRegionOutputIndex( - sharding, logical_device_id, launch_func_output_index); + const int region_output_index = MapClusterOutputIndexWithRegionOutputIndex( + sharding, logical_device_id, cluster_func_output_index); const auto output_from_logical_device = parallel_execute.GetRegionOutputs( logical_device_id)[region_output_index]; outputs_to_merge.emplace_back(output_from_logical_device); @@ -402,30 +394,30 @@ mlir::LogicalResult HandleTileShardedOutputs( } assert(outputs_to_merge.size() == 1); - launch_func_output.replaceAllUsesWith(outputs_to_merge[0]); + cluster_func_output.replaceAllUsesWith(outputs_to_merge[0]); return mlir::success(); } mlir::LogicalResult ValidateAndGetTiledExecuteOutputShape( const mlir::Location& location, - const mlir::TensorType launch_func_output_type, + const mlir::TensorType cluster_func_output_type, const xla::OpSharding& output_sharding, mlir::Type* tiled_logical_computation_type) { auto new_output_shape = - llvm::to_vector<4>(launch_func_output_type.getShape()); + llvm::to_vector<4>(cluster_func_output_type.getShape()); for (auto dimension_and_output_splits : llvm::enumerate(output_sharding.tile_assignment_dimensions())) { const auto dimension_index = dimension_and_output_splits.index(); const auto output_splits = dimension_and_output_splits.value(); - const auto& output_shape = launch_func_output_type.getShape(); + const auto output_shape = cluster_func_output_type.getShape(); if (output_shape[dimension_index] == mlir::ShapedType::kDynamicSize) { - *tiled_logical_computation_type = launch_func_output_type; + *tiled_logical_computation_type = cluster_func_output_type; break; } auto output_shape_at_dim = - launch_func_output_type.getShape()[dimension_index]; + cluster_func_output_type.getShape()[dimension_index]; if (output_shape_at_dim % output_splits != 0) { mlir::emitError( location, @@ -441,7 +433,7 @@ mlir::LogicalResult ValidateAndGetTiledExecuteOutputShape( } *tiled_logical_computation_type = mlir::RankedTensorType::get( - new_output_shape, launch_func_output_type.getElementType()); + new_output_shape, 
cluster_func_output_type.getElementType()); return mlir::success(); } @@ -450,34 +442,34 @@ mlir::LogicalResult ValidateAndGetTiledExecuteOutputShape( mlir::LogicalResult GetOutputTypesForLogicalDeviceComputation( const int core_id, llvm::ArrayRef output_sharding_config, - mlir::tf_device::LaunchFuncOp launch_func, + mlir::tf_device::ClusterFuncOp cluster_func, llvm::SmallVectorImpl* output_types) { - output_types->reserve(launch_func.getNumResults()); + output_types->reserve(cluster_func.getNumResults()); - for (auto result_and_index : llvm::enumerate(launch_func.getResults())) { + for (auto result_and_index : llvm::enumerate(cluster_func.getResults())) { const auto output_index = result_and_index.index(); const auto& output_sharding = output_sharding_config[output_index]; const auto output_sharding_type = output_sharding.type(); - const auto& launch_func_output_type = + const auto cluster_func_output_type = result_and_index.value().getType().cast(); - // If output shape of launch func is statically known and output is tiled - // sharded, then the corresponding output shape of launch func must be + // If output shape of cluster func is statically known and output is tiled + // sharded, then the corresponding output shape of cluster func must be // evenly divisible number of shardings. if (output_sharding_type == xla::OpSharding::OTHER) { mlir::Type tiled_logical_computation_type; - if (launch_func_output_type.hasRank()) { + if (cluster_func_output_type.hasRank()) { auto result = ValidateAndGetTiledExecuteOutputShape( - launch_func.getLoc(), launch_func_output_type, output_sharding, + cluster_func.getLoc(), cluster_func_output_type, output_sharding, &tiled_logical_computation_type); if (mlir::failed(result)) return mlir::failure(); } else { - tiled_logical_computation_type = launch_func_output_type; + tiled_logical_computation_type = cluster_func_output_type; } output_types->emplace_back(tiled_logical_computation_type); } else if (output_sharding_type == xla::OpSharding::REPLICATED || IsAssignedToLogicalDevice(core_id, output_sharding)) { - output_types->emplace_back(launch_func_output_type); + output_types->emplace_back(cluster_func_output_type); } } @@ -487,17 +479,17 @@ mlir::LogicalResult GetOutputTypesForLogicalDeviceComputation( void RemapOutputsFromLogicalDevices( const mlir::Location& location, llvm::ArrayRef output_sharding_config, - mlir::tf_device::LaunchFuncOp launch_func, + mlir::tf_device::ClusterFuncOp cluster_func, mlir::tf_device::ParallelExecuteOp parallel_execute, mlir::OpBuilder* builder) { - for (auto result_and_index : llvm::enumerate(launch_func.getResults())) { + for (auto result_and_index : llvm::enumerate(cluster_func.getResults())) { const auto output_index = result_and_index.index(); - const auto& launch_func_output = result_and_index.value(); + const auto cluster_func_output = result_and_index.value(); const auto& output_sharding = output_sharding_config[output_index]; const auto output_sharding_type = output_sharding.type(); if (output_sharding_type == xla::OpSharding::OTHER) { HandleTileShardedOutputs(output_index, output_sharding, location, - launch_func_output, parallel_execute, builder); + cluster_func_output, parallel_execute, builder); continue; } @@ -506,13 +498,13 @@ void RemapOutputsFromLogicalDevices( logical_device_id = output_sharding.tile_assignment_devices(0); // For maximal sharding configuration, correctly remap outputs from - // parallel_execute region to users of the launch func. 
- const int region_output_index = MapLaunchOutputIndexWithRegionOutputIndex( + // parallel_execute region to users of the cluster func. + const int region_output_index = MapClusterOutputIndexWithRegionOutputIndex( output_sharding_config, logical_device_id, output_index); const auto output_from_logical_device = parallel_execute.GetRegionOutputs( logical_device_id)[region_output_index]; - launch_func_output.replaceAllUsesWith(output_from_logical_device); + cluster_func_output.replaceAllUsesWith(output_from_logical_device); } } @@ -531,7 +523,7 @@ llvm::SmallVector, 4> GetMetadataArgumentMapping( const auto& sharding = arg_and_idx.value().sharding(); const int64_t idx = arg_and_idx.index(); - const auto& sharding_type = sharding.type(); + const auto sharding_type = sharding.type(); if (sharding_type == xla::OpSharding::OTHER) { for (const auto& device : sharding.tile_assignment_devices()) input_mappings[device].push_back(idx); diff --git a/tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.h b/tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.h index 77bfd259cf6..69bc092927d 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.h +++ b/tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.h @@ -29,27 +29,23 @@ limitations under the License. namespace tensorflow { -extern const char* const kXlaShardingAttrName; extern const char* const kInputShardingAttr; extern const char* const kOutputShardingAttr; -// Parses "_XlaSharding" attribute from operation, if it exists. -llvm::Optional ParseShardingAttribute( - mlir::Operation* operation); - -// Parses "input_sharding_configuration" attribute and returns a list where -// i-th element is a list of mlir::Value's which represent inputs for the -// TPU computation correponding to i-th logical device. If the attribute -// does not exist, the all inputs are placed on logical core 0. +// Parses "input_sharding_configuration" attribute and returns a list where i-th +// element is a list of mlir::Value's which represent inputs for the TPU +// computation corresponding to i-th logical device. If the attribute does not +// exist, all inputs are placed on logical core 0. mlir::LogicalResult ExtractInputsForLogicalDevices( - const int num_cores_per_replica, mlir::tf_device::LaunchFuncOp launch_func, - mlir::OpBuilder* builder, + const int num_cores_per_replica, + mlir::tf_device::ClusterFuncOp cluster_func, mlir::OpBuilder* builder, llvm::SmallVectorImpl>* input_list); -// Extracts a list of OpSharding that represent output sharding configuration -// of `tf_device.launch`. +// Extracts a list of OpSharding that represent output sharding configuration of +// `tf_device.cluster`. mlir::LogicalResult ParseAndValidateOutputSharding( - const int num_cores_per_replica, mlir::tf_device::LaunchFuncOp launch_func, + const int num_cores_per_replica, + mlir::tf_device::ClusterFuncOp cluster_func, mlir::SmallVector* output_sharding_list); // Retrieves output types for TPUExecute op representing execution for provided @@ -57,15 +53,15 @@ mlir::LogicalResult ParseAndValidateOutputSharding( // different outputs depending on the output sharding configuration.
mlir::LogicalResult GetOutputTypesForLogicalDeviceComputation( const int core_id, llvm::ArrayRef output_sharding_config, - mlir::tf_device::LaunchFuncOp launch_func, + mlir::tf_device::ClusterFuncOp cluster_func, llvm::SmallVectorImpl* output_types); // Remaps outputs of `tf_device.parallel_execute` op that represent concurrent -// execution of the `tf_device.launch_func` with its users. +// execution of the `tf_device.cluster_func` with its users. void RemapOutputsFromLogicalDevices( const mlir::Location& location, llvm::ArrayRef output_sharding_config, - mlir::tf_device::LaunchFuncOp launch_func, + mlir::tf_device::ClusterFuncOp cluster_func, mlir::tf_device::ParallelExecuteOp parallel_execute, mlir::OpBuilder* builder); diff --git a/tensorflow/compiler/mlir/tf_mlir_translate_main.cc b/tensorflow/compiler/mlir/tf_mlir_translate_main.cc index 62b862f5e21..2e1528e0d60 100644 --- a/tensorflow/compiler/mlir/tf_mlir_translate_main.cc +++ b/tensorflow/compiler/mlir/tf_mlir_translate_main.cc @@ -104,26 +104,24 @@ int main(int argc, char** argv) { return 1; } + std::unordered_set tags = absl::StrSplit(saved_model_tags, ','); + std::vector exported_names_vector = + absl::StrSplit(saved_model_exported_names, ',', absl::SkipEmpty()); + absl::Span exported_names(exported_names_vector); + if (import_saved_model_object_graph) { - std::unordered_set tags = - absl::StrSplit(saved_model_tags, ','); - std::vector exported_names = - absl::StrSplit(saved_model_exported_names, ',', absl::SkipEmpty()); mlir::MLIRContext context; auto module = tensorflow::SavedModelObjectGraphToMlirImport( - input_filename, tags, absl::Span(exported_names), - &context); + input_filename, tags, exported_names, &context); if (!module) return 1; module->print(output->os()); } else if (import_saved_model_signature_defs) { - std::unordered_set tags = - absl::StrSplit(saved_model_tags, ','); mlir::MLIRContext context; auto module = tensorflow::SavedModelSignatureDefsToMlirImport( - input_filename, tags, &context); + input_filename, tags, exported_names, &context); if (!module) return 1; module->print(output->os()); diff --git a/tensorflow/compiler/mlir/tfjs/BUILD b/tensorflow/compiler/mlir/tfjs/BUILD index 2c4abb90abb..ac629ac4573 100644 --- a/tensorflow/compiler/mlir/tfjs/BUILD +++ b/tensorflow/compiler/mlir/tfjs/BUILD @@ -1,4 +1,5 @@ load("//third_party/mlir:tblgen.bzl", "gentbl") +load("//tensorflow:tensorflow.bzl", "tf_cc_binary") package( default_visibility = ["//visibility:public"], @@ -39,7 +40,7 @@ gentbl( "ir/tfjs_ops.td", "@llvm-project//mlir:OpBaseTdFiles", "@llvm-project//mlir:include/mlir/Interfaces/LoopLikeInterface.td", - "@llvm-project//mlir:include/mlir/Interfaces/SideEffects.td", + "@llvm-project//mlir:include/mlir/Interfaces/SideEffectInterfaces.td", ], ) @@ -77,3 +78,160 @@ cc_library( ], alwayslink = 1, ) + +gentbl( + name = "tfjs_optimize_inc_gen", + tbl_outs = [ + ( + "-gen-rewriters", + "transforms/generated_optimize.inc", + ), + ], + tblgen = "@llvm-project//mlir:mlir-tblgen", + td_file = "transforms/optimize_pattern.td", + td_srcs = [ + ":tfjs_ops_td_files", + "@llvm-project//mlir:StdOpsTdFiles", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_ops_td_files", + ], +) + +cc_library( + name = "tfjs_optimize", + srcs = [ + "transforms/generated_optimize.inc", + "transforms/optimize.cc", + ], + hdrs = [ + "transforms/passes.h", + ], + deps = [ + ":tensorflow_js", + ":tensorflow_js_dialect_registration", + "//tensorflow/compiler/mlir/tensorflow", + "@llvm-project//llvm:support", + 
"@llvm-project//mlir:Analysis", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:StandardOps", + "@llvm-project//mlir:Support", + ], + alwayslink = 1, +) + +cc_library( + name = "tensorflow_js_passes", + srcs = ["tf_tfjs_passes.cc"], + hdrs = [ + "tf_tfjs_passes.h", + ], + deps = [ + ":tfjs_optimize", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:decode_constant_pass", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_dialect_registration", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_passes", + "//tensorflow/compiler/mlir/tensorflow:tf_graph_optimization_pass", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Transforms", + ], +) + +cc_library( + name = "json_translate_lib", + srcs = [ + "translate/json_translate.cc", + ], + hdrs = [ + "translate/json_translate.h", + ], + deps = [ + ":tensorflow_js", + ":tensorflow_js_dialect_registration", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:convert_graphdef", + "//tensorflow/compiler/mlir/tensorflow:export_utils", + "//tensorflow/compiler/mlir/tensorflow:mlir_roundtrip_flags", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_dialect_registration", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/core:framework", + "//tensorflow/core:graph", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/status", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:Translation", + ], + alwayslink = 1, +) + +cc_library( + name = "tf_to_tfjs_json", + srcs = ["translate/tf_to_tfjs_json.cc"], + hdrs = [ + "translate/tf_to_tfjs_json.h", + ], + deps = [ + ":json_translate_lib", + ":tfjs_optimize", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:decode_constant_pass", + "//tensorflow/compiler/mlir/tensorflow:error_util", + "//tensorflow/compiler/mlir/tensorflow:tf_dialect_lib", + "//tensorflow/compiler/mlir/tensorflow:tf_dialect_passes", + "//tensorflow/compiler/mlir/tensorflow:translate_cl_options", + "//tensorflow/compiler/mlir/tensorflow:translate_lib", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/stream_executor/lib", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", + "@llvm-project//llvm:support", + "@llvm-project//mlir:AllPassesAndDialects", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Parser", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Support", + ], + alwayslink = 1, +) + +tf_cc_binary( + name = "json_translate", + deps = [ + ":json_translate_lib", + "@llvm-project//mlir:MlirTranslateMain", + ], +) + +filegroup( + name = "tf_tfjs_translate_main", + srcs = [ + "translate/tf_tfjs_translate.cc", + ], +) + +tf_cc_binary( + name = "tf_tfjs_translate", + srcs = [":tf_tfjs_translate_main"], + deps = [ + ":json_translate_lib", + ":tensorflow_js_passes", + ":tf_to_tfjs_json", + ":tfjs_optimize", + "//tensorflow/compiler/mlir:init_mlir", + "//tensorflow/compiler/mlir/tensorflow:translate_cl_options", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/platform:errors", + "//tensorflow/stream_executor/lib", + "@com_google_absl//absl/strings", + "@llvm-project//llvm:support", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Support", + ], +) diff --git a/tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.h 
b/tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.h index 5c1080b79ad..9c98c9b0e19 100644 --- a/tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.h +++ b/tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.h @@ -26,9 +26,9 @@ limitations under the License. #include "mlir/IR/Dialect.h" // from @llvm-project #include "mlir/IR/OpImplementation.h" // from @llvm-project #include "mlir/IR/StandardTypes.h" // from @llvm-project -#include "mlir/Interfaces/SideEffects.h" // from @llvm-project -#include "mlir/Support/Functional.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project + namespace mlir { namespace tfjs { diff --git a/tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.td b/tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.td index 172347bc0f5..134aa010d8c 100644 --- a/tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.td +++ b/tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.td @@ -23,7 +23,7 @@ limitations under the License. #define TFJS_DIALECT include "mlir/IR/OpBase.td" -include "mlir/Interfaces/SideEffects.td" +include "mlir/Interfaces/SideEffectInterfaces.td" //===----------------------------------------------------------------------===// // TensorFlow.js dialect definitions diff --git a/tensorflow/compiler/mlir/tfjs/tests/BUILD b/tensorflow/compiler/mlir/tfjs/tests/BUILD index 4faa8d2efe8..a4ebc997991 100644 --- a/tensorflow/compiler/mlir/tfjs/tests/BUILD +++ b/tensorflow/compiler/mlir/tfjs/tests/BUILD @@ -15,5 +15,6 @@ filegroup( data = [ "//tensorflow/compiler/mlir:tf-opt", "@llvm-project//llvm:FileCheck", + "@llvm-project//llvm:not", ], ) diff --git a/tensorflow/compiler/mlir/tfjs/tests/e2e/BUILD b/tensorflow/compiler/mlir/tfjs/tests/e2e/BUILD new file mode 100644 index 00000000000..5c8d37da2f0 --- /dev/null +++ b/tensorflow/compiler/mlir/tfjs/tests/e2e/BUILD @@ -0,0 +1,23 @@ +load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests") + +licenses(["notice"]) + +glob_lit_tests( + data = [ + ":test_utilities", + ], + driver = "@llvm-project//mlir:run_lit.sh", + test_file_exts = [ + "pbtxt", + ], +) + +# Bundle together all of the test utilities that are used by tests. 
+filegroup( + name = "test_utilities", + testonly = True, + data = [ + "//tensorflow/compiler/mlir/tfjs:tf_tfjs_translate", + "@llvm-project//llvm:FileCheck", + ], +) diff --git a/tensorflow/compiler/mlir/tfjs/tests/e2e/add.pbtxt b/tensorflow/compiler/mlir/tfjs/tests/e2e/add.pbtxt new file mode 100644 index 00000000000..f6a324fdc13 --- /dev/null +++ b/tensorflow/compiler/mlir/tfjs/tests/e2e/add.pbtxt @@ -0,0 +1,78 @@ +# RUN: tf_tfjs_translate %s -tf-input-arrays=input0,input1 -tf-input-data-types=DT_INT32,DT_INT32 -tf-input-shapes=10:10 -tf-output-arrays=Mul -o - | FileCheck %s --dump-input-on-failure +# Add two tensor<4xi32> inputs and return the result + +node { + name: "Add" + op: "Add" + input: "input0" + input: "input1" + attr { + key: "T" + value { + type: DT_INT32 + } + } +} +node { + name: "input0" + op: "Placeholder" + attr { + key: "dtype" + value { + type: DT_INT32 + } + } +} +node { + name: "input1" + op: "Placeholder" + attr { + key: "dtype" + value { + type: DT_INT32 + } + } +} +node { + name: "Mul" + op: "Mul" + input: "Add" + input: "Add" + attr { + key: "T" + value { + type: DT_INT32 + } + } +} +versions { + producer: 27 +} + +# CHECK: "name": "input0" +# CHECK-NEXT: "op": "Placeholder" +# CHECK: "type": "DT_INT32" +# CHECK: "name": "input1", +# CHECK-NEXT: "op": "Placeholder" +# CHECK: "type": "DT_INT32" +# CHECK: "name": "Add" +# CHECK-NEXT: "op": "AddV2" +# CHECK-NEXT: "input": +# CHECK-NEXT: "input0" +# CHECK-NEXT: "input1" +# CHECK: "type": "DT_INT32" +# CHECK: "name": "Mul1" +# CHECK-NEXT: "op": "Mul" +# CHECK-NEXT: "input": +# CHECK-NEXT: "Add" +# CHECK-NEXT: "Add" +# CHECK: "type": "DT_INT32" +# CHECK: "name": "Mul" +# CHECK-NEXT: "op": "_Retval" +# CHECK-NEXT: "input": +# CHECK-NEXT: "Mul1" +# CHECK: "type": "DT_INT32" +# CHECK: "library" +# CHECK: "versions" +# CHECK: "producer": 27 + diff --git a/tensorflow/compiler/mlir/tfjs/tests/e2e/prelu.pbtxt b/tensorflow/compiler/mlir/tfjs/tests/e2e/prelu.pbtxt new file mode 100644 index 00000000000..810db71f5e0 --- /dev/null +++ b/tensorflow/compiler/mlir/tfjs/tests/e2e/prelu.pbtxt @@ -0,0 +1,175 @@ +# RUN: tf_tfjs_translate %s -tf-input-arrays=input0 -tf-input-data-types=DT_FLOAT -tf-input-shapes=10 -tf-output-arrays=Add -tf-custom-opdefs="name: 'Prelu' input_arg: { name: 'x' type: DT_FLOAT } input_arg: { name: 'alpha' type: DT_FLOAT } output_arg: { name: 'c' type: DT_FLOAT }" -o - | FileCheck %s --dump-input-on-failure +# Add two tensor<4xi32> inputs and return the result + +node { + name: "input0" + op: "Placeholder" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "shape" + value { + shape { + dim { + size: 10 + } + } + } + } + experimental_debug_info { + } +} +node { + name: "alpha" + op: "Const" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + } + float_val: 0.5 + } + } + } + experimental_debug_info { + } +} +node { + name: "Relu" + op: "Relu" + input: "input0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + experimental_debug_info { + } +} +node { + name: "Neg" + op: "Neg" + input: "input0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + experimental_debug_info { + } +} +node { + name: "Relu1" + op: "Relu" + input: "Neg" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + experimental_debug_info { + } +} +node { + name: "Mul" + op: "Mul" + input: "alpha" + input: "Relu1" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + experimental_debug_info { + } +} +node { + name: 
"Add" + op: "Add" + input: "Relu" + input: "Mul" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + experimental_debug_info { + } +} +node { + name: "main" + op: "_Retval" + input: "Add" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "index" + value { + i: 0 + } + } +} +library { +} +versions { + producer: 344 +} + +# CHECK: "node": +# CHECK: "name": "input0", +# CHECK-NEXT: "op": "Placeholder", +# CHECK-NEXT: "attr": +# CHECK: "type": "DT_FLOAT" +# CHECK: "name": "Add.Relu.Neg.Relu1.Mul", +# CHECK-NEXT: "op": "Const", +# CHECK-NEXT: "attr": +# CHECK: "value": +# CHECK: "tensor": +# CHECK: "dtype": "DT_FLOAT", +# CHECK: "tensorShape": {}, +# CHECK: "floatVal": +# CHECK: -0.5 +# CHECK: "name": "Add.Relu.Neg.Relu1.Mul1", +# CHECK-NEXT: "op": "Prelu", +# CHECK-NEXT: "input": +# CHECK: "input0", +# CHECK: "Add.Relu.Neg.Relu1.Mul" +# CHECK: "attr": +# CHECK: "_output_shapes": +# CHECK: "list": +# CHECK: "shape": +# CHECK: "dim": +# CHECK: "size": "10" +# CHECK: "experimentalDebugInfo": {} +# CHECK: "name": "Add", +# CHECK-NEXT: "op": "_Retval", +# CHECK-NEXT: "input": +# CHECK: "Add.Relu.Neg.Relu1.Mul1" +# CHECK: "attr": +# CHECK: "T": +# CHECK: "type": "DT_FLOAT" +# CHECK: "library": {}, +# CHECK: "versions": +# CHECK: "producer": 344 + diff --git a/tensorflow/compiler/mlir/tfjs/tests/optimize.mlir b/tensorflow/compiler/mlir/tfjs/tests/optimize.mlir new file mode 100644 index 00000000000..1e249f17e45 --- /dev/null +++ b/tensorflow/compiler/mlir/tfjs/tests/optimize.mlir @@ -0,0 +1,29 @@ +// Run optimize pass only and check the results. +// RUN: tf-opt %s -tfjs-optimize | FileCheck %s --dump-input-on-failure + +// CHECK-LABEL: prelu_fusion +func @prelu_fusion(%arg0: tensor<2x3xf32>) -> tensor<2x3xf32> { + %alpha = constant dense<-0.2> : tensor<3xf32> + %0 = "tf.Relu"(%arg0) : (tensor<2x3xf32>) -> tensor<2x3xf32> + %1 = "tf.Neg"(%arg0) : (tensor<2x3xf32>) -> tensor<2x3xf32> + %2 = "tf.Relu"(%1) : (tensor<2x3xf32>) -> tensor<2x3xf32> + %3 = "tf.Mul"(%alpha, %2) : (tensor<3xf32>, tensor<2x3xf32>) -> tensor<2x3xf32> + %4 = "tf.AddV2"(%0, %3) : (tensor<2x3xf32>, tensor<2x3xf32>) -> tensor<2x3xf32> + return %4 : tensor<2x3xf32> + + // CHECK: %[[RESULT:[0-9].*]] = tfjs.Prelu +} + +// CHECK-LABEL: prelu_not_fused +// Rank of alpha should be one less than input for PReLU, which is not the case. +func @prelu_not_fused(%arg0: tensor<2x3xf32>) -> tensor<2x3xf32> { + %alpha = constant dense<-0.2> : tensor + %0 = "tf.Relu"(%arg0) : (tensor<2x3xf32>) -> tensor<2x3xf32> + %1 = "tf.Neg"(%arg0) : (tensor<2x3xf32>) -> tensor<2x3xf32> + %2 = "tf.Relu"(%1) : (tensor<2x3xf32>) -> tensor<2x3xf32> + %3 = "tf.Mul"(%alpha, %2) : (tensor, tensor<2x3xf32>) -> tensor<2x3xf32> + %4 = "tf.AddV2"(%0, %3) : (tensor<2x3xf32>, tensor<2x3xf32>) -> tensor<2x3xf32> + return %4 : tensor<2x3xf32> + + // CHECK: %[[RESULT:[0-9].*]] = "tf.Relu" +} diff --git a/tensorflow/compiler/mlir/tfjs/tf_tfjs_passes.cc b/tensorflow/compiler/mlir/tfjs/tf_tfjs_passes.cc new file mode 100644 index 00000000000..a445937570e --- /dev/null +++ b/tensorflow/compiler/mlir/tfjs/tf_tfjs_passes.cc @@ -0,0 +1,56 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/tfjs/tf_tfjs_passes.h" + +#include + +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Transforms/Passes.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" +#include "tensorflow/compiler/mlir/tfjs/transforms/passes.h" + +namespace mlir { +/// Create a pass to convert from the TFExecutor to the TF control dialect. +std::unique_ptr> +CreateTFExecutorToControlDialectConversion(); +} // namespace mlir + +namespace tensorflow { + +void AddTFToTFJSConversionPasses(mlir::OpPassManager* pm) { + // Then we pass the MLIR module through the TF standard pipeline, which for + mlir::TF::StandardPipelineOptions tf_options; + tf_options.enable_inliner = true; + mlir::TF::CreateTFStandardPipeline(*pm, tf_options); + + // freeze global tensors. + pm->addPass(mlir::tf_saved_model::CreateFreezeGlobalTensorsPass()); + + // TFJS dialect passes. + pm->addPass(mlir::tfjs::CreateOptimizePass()); + + // Canonicalize, CSE etc. + pm->addNestedPass(mlir::createCanonicalizerPass()); + pm->addNestedPass(mlir::createCSEPass()); + + // raise to executor dialect in order to use GraphDef converter + pm->addNestedPass( + mlir::CreateFunctionalToExecutorDialectConversionPass()); + pm->addNestedPass(mlir::CreateBreakUpIslandsPass()); +} + +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tfjs/tf_tfjs_passes.h b/tensorflow/compiler/mlir/tfjs/tf_tfjs_passes.h new file mode 100644 index 00000000000..92a13fd4607 --- /dev/null +++ b/tensorflow/compiler/mlir/tfjs/tf_tfjs_passes.h @@ -0,0 +1,28 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_TFJS_TF_TFJS_PASSES_H_ +#define TENSORFLOW_COMPILER_MLIR_TFJS_TF_TFJS_PASSES_H_ + +#include "mlir/IR/Module.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project + +namespace tensorflow { + +// Add the TF to TFJS passes into a pass_manager. 
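The declaration below is the public entry point for the pipeline assembled in tf_tfjs_passes.cc above. A caller wires it up roughly the way tf_tfjs_translate.cc does later in this change (sketch only; `context` and `module` are an existing MLIRContext and an imported mlir::ModuleOp):

    mlir::PassManager pm(&context);
    tensorflow::AddTFToTFJSConversionPasses(&pm);
    if (mlir::failed(pm.run(module))) {
      // Conversion failed; diagnostics were already emitted through the context.
    }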
+void AddTFToTFJSConversionPasses(mlir::OpPassManager* pm); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TFJS_TF_TFJS_PASSES_H_ diff --git a/tensorflow/compiler/mlir/tfjs/transforms/optimize.cc b/tensorflow/compiler/mlir/tfjs/transforms/optimize.cc new file mode 100644 index 00000000000..c03a68471bc --- /dev/null +++ b/tensorflow/compiler/mlir/tfjs/transforms/optimize.cc @@ -0,0 +1,64 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This transformation pass takes operations in TensorFlow dialect and +// optimizes them to resulting operations in TensorFlow.js dialect. + +#include + +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Matchers.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.h" + +namespace mlir { +namespace tfjs { + +//===----------------------------------------------------------------------===// +// The actual Optimize Pass. +namespace { + +// Optimize TFJS operations in functions. +struct Optimize : public PassWrapper { + void runOnFunction() override; +}; + +#include "tensorflow/compiler/mlir/tfjs/transforms/generated_optimize.inc" + +void Optimize::runOnFunction() { + OwningRewritePatternList patterns; + auto *ctx = &getContext(); + auto func = getFunction(); + + populateWithGenerated(ctx, &patterns); + applyPatternsAndFoldGreedily(func, patterns); +} +} // namespace + +// Creates an instance of the TensorFlow.js dialect Optimize pass. +std::unique_ptr> CreateOptimizePass() { + return std::make_unique(); +} + +static PassRegistration pass( + "tfjs-optimize", "Optimize within the TensorFlow.js dialect"); + +} // namespace tfjs +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tfjs/transforms/optimize_pattern.td b/tensorflow/compiler/mlir/tfjs/transforms/optimize_pattern.td new file mode 100644 index 00000000000..c5a059e5b6b --- /dev/null +++ b/tensorflow/compiler/mlir/tfjs/transforms/optimize_pattern.td @@ -0,0 +1,49 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +// This is the optimization pattern definition file for TensorFlow.js. + +include "tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.td" +include "mlir/IR/OpBase.td" +include "mlir/Dialect/StandardOps/IR/Ops.td" +include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td" + +// Checks if the value has only one user. +def HasOneUse : Constraint<CPred<"$0.hasOneUse()">>; + +// Constraint that makes sure both operands are the same operands. +// TODO(b/154826385): Reconsider once equal source pattern symbols are allowed. +def EqualOperands : Constraint<CPred<"$0 == $1">>; + +// Checks if the operand0's rank is one less than operand1's rank. +def PReluAlphaRankCheck : Constraint< + CPred<"$0.getType().cast<ShapedType>().getRank() == " + "$1.getType().cast<ShapedType>().getRank() - 1">>; + + +// PReLU pattern from Keras: +// f(x) = Relu(x) + (-alpha * Relu(-x)) +def : Pat<(TF_AddV2Op + (TF_ReluOp:$relu_out $input1), + (TF_MulOp:$mul_out + (TF_ReluOp (TF_NegOp:$input_neg_out $input2)), + $neg_alpha)), + (TFJS_PReluOp $input1, (TF_NegOp $neg_alpha)), + [(EqualOperands $input1, $input2), + (PReluAlphaRankCheck $neg_alpha, $input1), + (HasOneUse $relu_out), + (HasOneUse $mul_out), + (HasOneUse $input_neg_out) + ]>; diff --git a/tensorflow/compiler/mlir/tfjs/transforms/passes.h b/tensorflow/compiler/mlir/tfjs/transforms/passes.h new file mode 100644 index 00000000000..0da361810e2 --- /dev/null +++ b/tensorflow/compiler/mlir/tfjs/transforms/passes.h @@ -0,0 +1,35 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TFJS_TRANSFORMS_PASSES_H_ +#define TENSORFLOW_COMPILER_MLIR_TFJS_TRANSFORMS_PASSES_H_ + +#include <memory> + +namespace mlir { +class FuncOp; +template <typename T> +class OperationPass; + +namespace tfjs { + +// Creates an instance of the TensorFlow.js dialect Optimize pass. +std::unique_ptr<OperationPass<FuncOp>> CreateOptimizePass(); + +} // namespace tfjs + +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TFJS_TRANSFORMS_PASSES_H_ diff --git a/tensorflow/compiler/mlir/tfjs/translate/json_translate.cc b/tensorflow/compiler/mlir/tfjs/translate/json_translate.cc new file mode 100644 index 00000000000..7f4b8ffae09 --- /dev/null +++ b/tensorflow/compiler/mlir/tfjs/translate/json_translate.cc @@ -0,0 +1,105 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
+==============================================================================*/ + +#include "tensorflow/compiler/mlir/tfjs/translate/json_translate.h" + +#include +#include +#include + +#include "absl/container/flat_hash_set.h" +#include "absl/status/status.h" +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Module.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Translation.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.h" +#include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/export_utils.h" +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/status.h" + +using mlir::ModuleOp; +using mlir::TranslateFromMLIRRegistration; +using std::string; +using tensorflow::Status; +using xla::StatusOr; + +// Translates the given MLIR module in the TFJS dialect to TFJS JSON +// format. Returns true on success and false on failure. +// +bool tfjs::MlirToJSONTranslateFunction(ModuleOp module, + std::string* serialized_json) { + string json_output; + // Allow TF to treat TFJS ops as TF ops. + if (!tensorflow::AddTensorFlowOpPrefix("tfjs.").ok()) { + LOG(ERROR) << "Failed to add tfjs op prefix."; + return false; + } + tensorflow::GraphExportConfig confs; + confs.export_shapes = true; + confs.export_library = true; + tensorflow::FunctionLibraryDefinition flib_def( + tensorflow::OpRegistry::Global(), tensorflow::FunctionDefLibrary()); + absl::flat_hash_set<tensorflow::Node*> control_ret_nodes; + auto graph = absl::make_unique<tensorflow::Graph>(flib_def); + auto status = tensorflow::ConvertMlirToGraph(module, confs, &graph, &flib_def, + &control_ret_nodes); + if (!status.ok()) { + LOG(ERROR) << "Graph export failed: " << status; + return false; + } + auto graphdef = absl::make_unique<tensorflow::GraphDef>(); + graph->ToGraphDef(graphdef.get()); + + // Replace the _Arg nodes of the main function with Placeholder op.
+ auto nodes = graphdef->mutable_node(); + for (const auto& node : llvm::enumerate(*nodes)) { + if (node.value().op() == "_Arg") { + nodes->Mutable(node.index())->set_op("Placeholder"); + } + } + + tensorflow::protobuf::util::JsonPrintOptions json_options; + json_options.add_whitespace = true; + auto jsonStatus = tensorflow::protobuf::util::MessageToJsonString( + *graphdef, &json_output, json_options); + if (!jsonStatus.ok()) { + LOG(ERROR) << "Proto2Json failed: " << jsonStatus; + return false; + } + *serialized_json = std::move(json_output); + return true; +} + +static mlir::LogicalResult MlirToJSONFileTranslateFunction( + ModuleOp module, llvm::raw_ostream& output) { + std::string serialized_json; + if (!tfjs::MlirToJSONTranslateFunction(module, &serialized_json)) + return mlir::failure(); + + output << serialized_json; + return mlir::success(); +} + +static TranslateFromMLIRRegistration MLIRToJSONFileTranslate( + "mlir-to-tfjs-json", MlirToJSONFileTranslateFunction); diff --git a/tensorflow/compiler/mlir/tfjs/translate/json_translate.h b/tensorflow/compiler/mlir/tfjs/translate/json_translate.h new file mode 100644 index 00000000000..0a931f770ad --- /dev/null +++ b/tensorflow/compiler/mlir/tfjs/translate/json_translate.h @@ -0,0 +1,31 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_TFJS_TRANSLATE_JSON_TRANSLATE_H_ +#define TENSORFLOW_COMPILER_MLIR_TFJS_TRANSLATE_JSON_TRANSLATE_H_ + +#include <string> + +#include "mlir/IR/Module.h" // from @llvm-project +#include "tensorflow/core/lib/core/status.h" + +namespace tfjs { + +// Translates the given MLIR `module` into a JSON string. Returns true on +// success, false if the translation fails. +bool MlirToJSONTranslateFunction(mlir::ModuleOp module, + std::string* serialized_json); +} // namespace tfjs + +#endif // TENSORFLOW_COMPILER_MLIR_TFJS_TRANSLATE_JSON_TRANSLATE_H_ diff --git a/tensorflow/compiler/mlir/tfjs/translate/tf_tfjs_translate.cc b/tensorflow/compiler/mlir/tfjs/translate/tf_tfjs_translate.cc new file mode 100644 index 00000000000..e735a3c7b8c --- /dev/null +++ b/tensorflow/compiler/mlir/tfjs/translate/tf_tfjs_translate.cc @@ -0,0 +1,173 @@ + +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
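json_translate.h above exposes the translation as a plain bool-returning entry point, so it can also be called directly, outside the mlir-translate registration (sketch only; `module_op` is an illustrative mlir::ModuleOp already lowered to the TFJS dialect):

    std::string serialized_json;
    if (!tfjs::MlirToJSONTranslateFunction(module_op, &serialized_json)) {
      // Translation failed; details were logged via LOG(ERROR).
    }
    // On success, serialized_json holds the TF.js JSON graph.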
+==============================================================================*/ + +#include +#include + +#include "absl/strings/str_split.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/FormatVariadic.h" +#include "llvm/Support/InitLLVM.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/ToolOutputFile.h" +#include "mlir/IR/Diagnostics.h" // from @llvm-project +#include "mlir/IR/Function.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Module.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/FileUtilities.h" // from @llvm-project +#include "tensorflow/compiler/mlir/init_mlir.h" +#include "tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_cl.h" +#include "tensorflow/compiler/mlir/tfjs/tf_tfjs_passes.h" +#include "tensorflow/compiler/mlir/tfjs/transforms/passes.h" +#include "tensorflow/compiler/mlir/tfjs/translate/tf_to_tfjs_json.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/stream_executor/lib/statusor.h" + +using llvm::cl::opt; +using mlir::MLIRContext; +using stream_executor::port::StatusOr; + +// NOLINTNEXTLINE +opt input_file_name(llvm::cl::Positional, + llvm::cl::desc(""), + llvm::cl::init("-")); + +// NOLINTNEXTLINE +opt import_saved_model_object_graph( + "savedmodel-objectgraph-to-mlir", + llvm::cl::desc("Import a saved model to its MLIR representation"), + llvm::cl::value_desc("dir")); + +// NOLINTNEXTLINE +opt import_saved_model_signature_defs( + "savedmodel-signaturedefs-to-mlir", + llvm::cl::desc("Import a saved model V1 to its MLIR representation"), + llvm::cl::value_desc("dir")); + +// NOLINTNEXTLINE +opt saved_model_tags( + "tf-savedmodel-tags", + llvm::cl::desc("Tags used to indicate which MetaGraphDef to import, " + "separated by ','"), + llvm::cl::init("serve")); + +// NOLINTNEXTLINE +opt saved_model_exported_names( + "tf-savedmodel-exported-names", + llvm::cl::desc("Names to export from SavedModel, separated by ','. Empty " + "(the default) means export all."), + llvm::cl::init("")); + +// NOLINTNEXTLINE +opt output_file_name("o", llvm::cl::desc(""), + llvm::cl::value_desc("filename"), + llvm::cl::init("-")); +// NOLINTNEXTLINE +opt input_mlir( + "input-mlir", + llvm::cl::desc("Take input TensorFlow model in textual MLIR instead of " + "GraphDef format"), + llvm::cl::init(false), llvm::cl::Hidden); +// NOLINTNEXTLINE +opt output_mlir( + "output-mlir", + llvm::cl::desc("Output MLIR rather than JSON for the generated TFJS model"), + llvm::cl::init(false)); + +// The following approach allows injecting opdefs in addition +// to those that are already part of the global TF registry to be linked in +// prior to importing the graph. The primary goal is for support of custom ops. +// This is not intended to be a general solution for custom ops for the future +// but mainly for supporting older models like mobilenet_ssd. More appropriate +// mechanisms, such as op hints or using functions to represent composable ops +// like https://github.com/tensorflow/community/pull/113 should be encouraged +// going forward. +// NOLINTNEXTLINE +llvm::cl::list custom_opdefs( + "tf-custom-opdefs", llvm::cl::desc("List of custom opdefs when importing " + "graphdef")); + +// Debugging flag to print function mapping in the JSON. 
+// NOLINTNEXTLINE +static opt print_function_result_mapping( + "print-function-result-mapping", + llvm::cl::desc( + "Print the mapping of function result to json output buffer"), + llvm::cl::init(false)); + +enum TranslationStatus { kTrSuccess, kTrFailure }; + +static int PrintFunctionResultMapping(const std::string& result) { + std::cout << result << std::endl; + return kTrSuccess; +} + +int main(int argc, char** argv) { + tensorflow::InitMlir y(&argc, &argv); + + llvm::cl::ParseCommandLineOptions(argc, argv, + "TF GraphDef to TFJS JSON converter\n"); + + MLIRContext context; + llvm::SourceMgr source_mgr; + mlir::SourceMgrDiagnosticHandler sourceMgrHandler(source_mgr, &context); + + StatusOr module; + + if (import_saved_model_object_graph || import_saved_model_signature_defs) { + if (input_mlir) + module = tensorflow::errors::InvalidArgument( + "Importing saved model should not have input_mlir set"); + module = tensorflow::ImportSavedModel( + import_saved_model_object_graph, import_saved_model_signature_defs, + custom_opdefs, input_file_name, saved_model_tags, + saved_model_exported_names, &context); + } else { + module = tensorflow::LoadFromGraphdefOrMlirSource( + input_file_name, input_mlir, custom_opdefs, debug_info_file, + input_arrays, input_dtypes, input_shapes, output_arrays, + /*prune_unused_nodes=*/true, &source_mgr, &context); + } + + // If errors occur, the library call in the above already logged the error + // message. So we can just return here. + if (!module.ok()) return kTrFailure; + + mlir::PassManager pm(&context); + + tensorflow::AddTFToTFJSConversionPasses(&pm); + + std::string result; + auto status = tensorflow::ConvertTFOpsToTfjsJSON(module.ValueOrDie().get(), + output_mlir, &result, &pm); + if (!status.ok()) return kTrFailure; + + std::string error_msg; + auto output = mlir::openOutputFile(output_file_name, &error_msg); + if (output == nullptr) { + llvm::errs() << error_msg << '\n'; + return kTrFailure; + } + output->os() << result; + output->keep(); + + // Print out debugging info related to function mapping. + if (print_function_result_mapping) return PrintFunctionResultMapping(result); + return kTrSuccess; +} diff --git a/tensorflow/compiler/mlir/tfjs/translate/tf_to_tfjs_json.cc b/tensorflow/compiler/mlir/tfjs/translate/tf_to_tfjs_json.cc new file mode 100644 index 00000000000..7dc9ea049ba --- /dev/null +++ b/tensorflow/compiler/mlir/tfjs/translate/tf_to_tfjs_json.cc @@ -0,0 +1,152 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/mlir/tfjs/translate/tf_to_tfjs_json.h" + +#include +#include +#include +#include +#include + +#include "absl/strings/str_split.h" +#include "absl/strings/string_view.h" +#include "absl/types/span.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/raw_ostream.h" +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Module.h" // from @llvm-project +#include "mlir/Parser.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Support/FileUtilities.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" +#include "tensorflow/compiler/mlir/tfjs/translate/json_translate.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_def.pb.h" +#include "tensorflow/core/framework/op_def_builder.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/stream_executor/lib/statusor.h" + +namespace tensorflow { + +using mlir::MLIRContext; +using mlir::ModuleOp; +using mlir::OwningModuleRef; +using stream_executor::port::StatusOr; + +namespace { +tensorflow::Status RegisterCustomOps( + const std::vector& extra_tf_opdefs) { + for (const auto& tf_opdefs_string : extra_tf_opdefs) { + tensorflow::OpDef opdef; + if (!tensorflow::protobuf::TextFormat::ParseFromString(tf_opdefs_string, + &opdef)) { + LOG(ERROR) << "OpDef parsing failed for: " << tf_opdefs_string; + return errors::InvalidArgument("fail to parse extra OpDef"); + } + // Register extra opdefs. + tensorflow::OpRegistry::Global()->Register( + [opdef](tensorflow::OpRegistrationData* op_reg_data) -> Status { + *op_reg_data = tensorflow::OpRegistrationData(opdef); + return Status::OK(); + }); + } + return Status::OK(); +} +} // namespace + +StatusOr LoadFromGraphdefOrMlirSource( + const std::string& input_filename, bool input_mlir, + const std::vector& extra_tf_opdefs, + absl::string_view debug_info_file, absl::string_view input_arrays, + absl::string_view input_dtypes, absl::string_view input_shapes, + absl::string_view output_arrays, bool prune_unused_nodes, + llvm::SourceMgr* source_mgr, MLIRContext* context) { + // Set up the input file. 
+ std::string error_message; + auto file = mlir::openInputFile(input_filename, &error_message); + if (!file) { + llvm::errs() << error_message << "\n"; + return errors::InvalidArgument("fail to open input file"); + } + + if (input_mlir) { + source_mgr->AddNewSourceBuffer(std::move(file), llvm::SMLoc()); + return OwningModuleRef(mlir::parseSourceFile(*source_mgr, context)); + } + + TF_RETURN_IF_ERROR(RegisterCustomOps(extra_tf_opdefs)); + + return tensorflow::GraphdefToMlirTranslateFunction( + file->getBuffer(), debug_info_file, input_arrays, input_dtypes, + input_shapes, output_arrays, /*control_output_arrays=*/"", + prune_unused_nodes, /*convert_legacy_fed_inputs=*/true, + /*graph_as_function=*/false, /*upgrade_legacy=*/true, + /*enable_shape_inference=*/true, context); +} + +Status ConvertTFOpsToTfjsJSON(mlir::ModuleOp module, bool export_to_mlir, + std::string* result, + mlir::PassManager* pass_manager) { + mlir::StatusScopedDiagnosticHandler statusHandler(module.getContext(), + /*propagate=*/true); + if (failed(pass_manager->run(module))) { + return statusHandler.ConsumeStatus(); + } + + if (export_to_mlir) { + llvm::raw_string_ostream os(*result); + module.print(os); + return Status::OK(); + } + + return tfjs::MlirToJSONTranslateFunction(module, result) + ? Status::OK() + : statusHandler.ConsumeStatus(); +} + +StatusOr ImportSavedModel( + bool import_saved_model, bool import_saved_model_v1, + const std::vector& extra_tf_opdefs, + const std::string& input_filename, const std::string& saved_model_tags, + const std::string& saved_model_exported_names, mlir::MLIRContext* context) { + std::unordered_set tags = absl::StrSplit(saved_model_tags, ','); + std::vector exported_names_in_vector = + absl::StrSplit(saved_model_exported_names, ',', absl::SkipEmpty()); + absl::Span exported_names(exported_names_in_vector); + if (import_saved_model) { + auto module = tensorflow::SavedModelObjectGraphToMlirImport( + input_filename, tags, absl::Span(exported_names), context); + if (!module) + return tensorflow::errors::InvalidArgument("fail to open input file"); + TF_RETURN_IF_ERROR(RegisterCustomOps(extra_tf_opdefs)); + return module; + } else if (import_saved_model_v1) { + auto module = tensorflow::SavedModelSignatureDefsToMlirImport( + input_filename, tags, exported_names, context); + + if (!module) + return tensorflow::errors::InvalidArgument("fail to open input file"); + TF_RETURN_IF_ERROR(RegisterCustomOps(extra_tf_opdefs)); + return module; + } else { + return tensorflow::errors::InvalidArgument( + "Should be either saved model v1 or v2"); + } +} + +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tfjs/translate/tf_to_tfjs_json.h b/tensorflow/compiler/mlir/tfjs/translate/tf_to_tfjs_json.h new file mode 100644 index 00000000000..d68f0e7d46e --- /dev/null +++ b/tensorflow/compiler/mlir/tfjs/translate/tf_to_tfjs_json.h @@ -0,0 +1,63 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TFJS_TRANSLATE_TF_TO_TFJS_JSON_H_ +#define TENSORFLOW_COMPILER_MLIR_TFJS_TRANSLATE_TF_TO_TFJS_JSON_H_ + +#include +#include + +#include "absl/strings/string_view.h" +#include "llvm/Support/SourceMgr.h" +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Module.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "tensorflow/core/platform/status.h" +#include "tensorflow/stream_executor/lib/statusor.h" + +namespace tensorflow { + +// Load a TF model from a GraphDef definition or a TF control flow dialect MLIR +// source into a MLIR module. If `input_mlir` is true, load from a MLIR source +// file; otherwise, load from a GraphDef. +// Setting prune_unused_nodes to true, would prune unreachable nodes if +// output_arrays is specified. +stream_executor::port::StatusOr +LoadFromGraphdefOrMlirSource( + const std::string& input_filename, bool input_mlir, + const std::vector& extra_tf_opdefs, + absl::string_view debug_info_file, absl::string_view input_arrays, + absl::string_view input_dtypes, absl::string_view input_shapes, + absl::string_view output_arrays, bool prune_unused_nodes, + llvm::SourceMgr* source_mgr, mlir::MLIRContext* context); + +// Load Saved model (either v1 or v2) into MLIR. +stream_executor::port::StatusOr ImportSavedModel( + bool import_saved_model, bool import_saved_model_v1, + const std::vector& extra_tf_opdefs, + const std::string& input_filename, const std::string& saved_model_tags, + const std::string& saved_model_exported_names, mlir::MLIRContext* context); + +// Taking a MLIR module in TF executor dialect and a set of parameters, +// applies a set of passes to convert the module to TFJS dialect and +// serializes the result to JSON string. +// If `export_to_mlir` is true, the result is exported in MLIR text format, +// otherwise exported in JSON. 
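//
// Example of composing these entry points (editor's sketch, not part of the
// original patch; the wrapper function name and error handling are
// illustrative):
//
//   #include "mlir/Pass/PassManager.h"
//   #include "tensorflow/compiler/mlir/tfjs/tf_tfjs_passes.h"
//   #include "tensorflow/compiler/mlir/tfjs/translate/tf_to_tfjs_json.h"
//
//   tensorflow::Status GraphDefFileToTfjsJson(const std::string& path,
//                                             std::string* json) {
//     mlir::MLIRContext context;
//     llvm::SourceMgr source_mgr;
//     auto module = tensorflow::LoadFromGraphdefOrMlirSource(
//         path, /*input_mlir=*/false, /*extra_tf_opdefs=*/{},
//         /*debug_info_file=*/"", /*input_arrays=*/"", /*input_dtypes=*/"",
//         /*input_shapes=*/"", /*output_arrays=*/"",
//         /*prune_unused_nodes=*/true, &source_mgr, &context);
//     if (!module.ok()) return module.status();
//     mlir::PassManager pm(&context);
//     tensorflow::AddTFToTFJSConversionPasses(&pm);
//     return tensorflow::ConvertTFOpsToTfjsJSON(module.ValueOrDie().get(),
//                                               /*export_to_mlir=*/false,
//                                               json, &pm);
//   }
//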
+Status ConvertTFOpsToTfjsJSON(mlir::ModuleOp module, bool export_to_mlir, + std::string* result, + mlir::PassManager* pass_manager); +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TFJS_TRANSLATE_TF_TO_TFJS_JSON_H_ diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/BUILD b/tensorflow/compiler/mlir/tools/kernel_gen/BUILD new file mode 100644 index 00000000000..27a8dbd2809 --- /dev/null +++ b/tensorflow/compiler/mlir/tools/kernel_gen/BUILD @@ -0,0 +1,50 @@ +load("//tensorflow:tensorflow.bzl", "tf_cc_binary") +load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda") + +licenses(["notice"]) + +cc_library( + name = "cubin_creator", + srcs = ["cubin_creator.cc"], + hdrs = ["cubin_creator.h"], + copts = if_cuda(["-DGOOGLE_CUDA=1"]), + deps = [ + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + "@llvm-project//llvm:support", + "@llvm-project//mlir:AllPassesAndDialects", + "@llvm-project//mlir:GPUDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:LLVMDialect", + "@llvm-project//mlir:Parser", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:StandardOps", + "@llvm-project//mlir:TargetNVVMIR", + "@llvm-project//mlir:Transforms", + "//tensorflow/compiler/mlir/xla:hlo", + "//tensorflow/compiler/mlir/xla:lhlo", + "//tensorflow/compiler/mlir/xla:xla_legalize_tf", + "//tensorflow/compiler/mlir/xla:xla_materialize_broadcasts", # buildcleaner: keep + "//tensorflow/compiler/mlir/xla:xla_unfuse_batch_norm", # buildcleaner: keep + "//tensorflow/compiler/xla:debug_options_flags", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla/service/gpu:stream_executor_util", + "//tensorflow/compiler/xla/service/gpu:target_constants", + "//tensorflow/compiler/xla/service/gpu/llvm_gpu_backend", + "//tensorflow/compiler/xla/service/mlir_gpu:kernel_lowering", + "//tensorflow/core:cuda_libdevice_path", + "//tensorflow/core:lib", + ] + if_cuda(["//tensorflow/stream_executor/gpu:asm_compiler"]), +) + +tf_cc_binary( + name = "tf_to_cubin", + srcs = ["tf_to_cubin.cc"], + visibility = ["//tensorflow/core/kernels/cubin_headers:__pkg__"], + deps = [ + ":cubin_creator", + "//tensorflow/core:framework_internal", + "//tensorflow/core:lib", + "@com_google_absl//absl/strings", + ], +) diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.cc b/tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.cc new file mode 100644 index 00000000000..f47485d0214 --- /dev/null +++ b/tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.cc @@ -0,0 +1,270 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +//===- cubin_creator.cc -----------------------------------------*- C++ -*-===// +// +// This file implements the function to compile a TF kernel function to a cubin. 
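//
// Editor's note (not part of the original patch): GenerateCubinForTfCode in
// this file lowers a TF-dialect MLIR function through XLA HLO and LHLO to the
// GPU and NVVM dialects, translates the kernel to NVVM/LLVM IR, compiles it
// to PTX, and finally assembles a cubin with the stream executor's assembler.
// The standalone driver is built from the BUILD file above as
// //tensorflow/compiler/mlir/tools/kernel_gen:tf_to_cubin; a CUDA-enabled
// build (e.g. `bazel build --config=cuda ...`) is needed for the final
// assembly step.
//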
+// +//===----------------------------------------------------------------------===// +#include "tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.h" + +#include +#include +#include + +#include "absl/memory/memory.h" +#include "absl/strings/escaping.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/None.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Debug.h" +#include "mlir/Dialect/GPU/GPUDialect.h" // from @llvm-project +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" // from @llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/IR/Function.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Parser.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Target/NVVMIR.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project +#include "tensorflow/compiler/mlir/xla/ir/hlo_ops.h" +#include "tensorflow/compiler/mlir/xla/transforms/passes.h" +#include "tensorflow/compiler/mlir/xla/transforms/rewriters.h" +#include "tensorflow/compiler/xla/debug_options_flags.h" +#include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h" +#include "tensorflow/compiler/xla/service/gpu/stream_executor_util.h" +#include "tensorflow/compiler/xla/service/gpu/target_constants.h" +#include "tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.h" +#include "tensorflow/core/platform/cuda_libdevice_path.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/path.h" +#if GOOGLE_CUDA +#include "tensorflow/stream_executor/gpu/asm_compiler.h" +#endif + +namespace { +using tensorflow::Status; +using xla::InternalError; +using xla::StatusOr; + +StatusOr GetLibdeviceDir( + const xla::HloModuleConfig& hlo_module_config) { + for (const std::string& cuda_root : tensorflow::CandidateCudaRoots( + hlo_module_config.debug_options().xla_gpu_cuda_data_dir())) { + std::string libdevice_dir = + tensorflow::io::JoinPath(cuda_root, "nvvm", "libdevice"); + VLOG(2) << "Looking for libdevice at " << libdevice_dir; + if (tensorflow::Env::Default()->IsDirectory(libdevice_dir).ok()) { + VLOG(2) << "Found libdevice dir " << libdevice_dir; + return libdevice_dir; + } + } + return InternalError( + "Can't find libdevice directory ${CUDA_DIR}/nvvm/libdevice"); +} + +struct MaterializeBroadcastsPass + : public mlir::PassWrapper { + void runOnFunction() override { + mlir::ConversionTarget conversionTarget(getContext()); + mlir::OwningRewritePatternList conversionPatterns; + + // Consider the xla_hlo dialect legal for tests. + conversionTarget.addLegalDialect(); + // The conversion uses helpers from the Standard dialect. 
+ conversionTarget.addLegalDialect(); + + mlir::xla_hlo::SetupMaterializeBroadcastsLegality(&getContext(), + &conversionTarget); + mlir::xla_hlo::PopulateMaterializeBroadcastsPatterns(&getContext(), + &conversionPatterns); + + if (failed(applyPartialConversion(getFunction(), conversionTarget, + conversionPatterns))) { + return signalPassFailure(); + } + } +}; + +struct UnfuseBatchNormPass + : public mlir::PassWrapper { + void runOnFunction() override { + mlir::OwningRewritePatternList patterns; + mlir::xla_hlo::PopulateUnfuseBatchNormPatterns(&getContext(), &patterns); + mlir::applyPatternsAndFoldGreedily(getOperation(), patterns); + } +}; + +Status LowerTfOpToLhloWithDynamicShapes(mlir::ModuleOp module) { + mlir::PassManager pm(module.getContext()); + auto enable_if_vlog_is_on = [](mlir::Pass* pass, mlir::Operation* op) { + return VLOG_IS_ON(1); + }; + pm.enableIRPrinting(/*shouldPrintBeforePass=*/{}, + /*shouldPrintAfterPass=*/enable_if_vlog_is_on, + /*printModuleScope=*/false, + /*printAfterOnlyOnChange=*/false, llvm::dbgs()); + pm.addNestedPass(mlir::xla_hlo::createLegalizeTFPass(false)); + pm.addNestedPass( + absl::make_unique()); + pm.addNestedPass(absl::make_unique()); + pm.addPass(mlir::xla_hlo::createLegalizeToLhloPass()); + pm.addNestedPass(mlir::xla_lhlo::createLhloCopyRemovalPass()); + + if (failed(pm.run(module))) { + return InternalError("Lowering TF to LHLO failed."); + } + return Status::OK(); +} + +struct PropagateStaticKnowledge + : public mlir::PassWrapper> { + explicit PropagateStaticKnowledge(mlir::FunctionType type, + llvm::ArrayRef same_shape_) + : func_type(type), same_shape(same_shape_) {} + + void runOnOperation() override { + // We know due to tensorflow ABI that the offset is always 0 and that the + // innermost stride is always 1. To make this visible to the compiler, + // we insert constants into the code and replace usages accordingly. + // We do not change the signature so that we keep a somewhat stable ABI + // that is easy to undertand by tools. + mlir::LLVM::LLVMFuncOp func = getOperation(); + mlir::OpBuilder b(func.getBody()); + auto index_type = func.getArgument(3).getType(); + mlir::Value one = b.create( + func.getLoc(), index_type, b.getIntegerAttr(b.getIndexType(), 1)); + mlir::Value zero = b.create( + func.getLoc(), index_type, b.getIntegerAttr(b.getIndexType(), 0)); + uint32_t arg_pos = 0; + std::vector positions; + for (mlir::Type arg_type : func_type.getInputs()) { + positions.push_back(arg_pos); + func.getArgument(arg_pos + 2).replaceAllUsesWith(zero); + arg_pos += 3 + arg_type.cast().getRank() * 2; + func.getArgument(arg_pos - 1).replaceAllUsesWith(one); + } + + // If we have knowledge that some arguments have the same shape, we + // can use that here. Simply replace usages of the shape parameters within + // the function body to a single shape parameter. + if (!same_shape.empty()) { + auto first = same_shape.front(); + auto first_offset = positions.at(first); + mlir::ShapedType first_type = + func_type.getInput(first).cast(); + uint32_t rank = first_type.getRank(); + for (auto same : same_shape.drop_front(1)) { + uint32_t same_offset = positions.at(same); + auto same_type = func_type.getInput(same).cast(); + if (same_type.getRank() != rank) { + func.emitOpError() << "same shape constraints on arguments with " + "non-matching shapes: #" + << first << " and #" << same; + signalPassFailure(); + } + + for (uint32_t i = 0; i < 2 * rank; ++i) { + // Replace uses for second arg data with first arg. 
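          // Editor's note (not part of the original patch; inferred from the
          // argument arithmetic above): each memref argument is expanded into
          // 3 + 2 * rank kernel arguments -- allocated pointer, aligned
          // pointer, offset, then `rank` sizes followed by `rank` strides.
          // The `+ 3` below skips the two pointers and the offset, so `i`
          // walks the 2 * rank size/stride slots; for a rank-2 operand that
          // is 7 arguments in total.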
+ auto same_arg = func.getArgument(same_offset + 3 + i); + auto first_arg = func.getArgument(first_offset + 3 + i); + same_arg.replaceAllUsesWith(first_arg); + } + } + } + } + + mlir::FunctionType func_type; + llvm::ArrayRef same_shape; +}; + +Status PropagateStaticShapeKnowledgeToKernel( + mlir::ModuleOp module, llvm::ArrayRef same_shape) { + // Grab the original signature from the single function. + auto func = *module.getBody()->op_begin(); + + mlir::PassManager pm(module.getContext()); + auto enable_if_vlog_is_on = [](mlir::Pass*, mlir::Operation*) { + return VLOG_IS_ON(1); + }; + pm.enableIRPrinting(/*shouldPrintBeforePass=*/{}, + /*shouldPrintAfterPass=*/enable_if_vlog_is_on, + /*printModuleScope=*/false, + /*printAfterOnlyOnChange=*/false, llvm::dbgs()); + auto& kernel_pm = pm.nest<::mlir::gpu::GPUModuleOp>(); + kernel_pm.addNestedPass( + absl::make_unique(func.getType(), same_shape)); + + if (failed(pm.run(module))) { + return InternalError("Static knowledge propagation failed."); + } + return Status::OK(); +} +} // namespace + +StatusOr> tensorflow::kernel_gen::GenerateCubinForTfCode( + llvm::StringRef tf_code, std::pair compute_capability, + llvm::ArrayRef tile_sizes, llvm::ArrayRef same_shape, + llvm::ArrayRef unroll_factors) { + mlir::MLIRContext context; + context.allowUnregisteredDialects(); // TODO(b/152572127) + mlir::OwningModuleRef module = mlir::parseSourceString(tf_code, &context); + + TF_RETURN_IF_ERROR(LowerTfOpToLhloWithDynamicShapes(module.get())); + TF_RETURN_IF_ERROR( + xla::mlir_gpu::LowerLHLOToGPU(module.get(), tile_sizes, unroll_factors, + /*collapseParallelLoops=*/false)); + TF_RETURN_IF_ERROR(xla::mlir_gpu::LowerKernelBodiesToNVVM(module.get())); + // TODO(b/156985522): Figure out why we get a segfault when generating Tanh + // with 'same_shape' containing {0, 1}. We would also get the crash if we + // unconditionally call PropagateStaticShapeKnowledgeToKernel while + // 'same_shape' is empty. + if (!same_shape.empty()) { + TF_RETURN_IF_ERROR( + PropagateStaticShapeKnowledgeToKernel(module.get(), same_shape)); + } + + mlir::OwningModuleRef kernel_module = + xla::mlir_gpu::ExtractKernelModule(*module).ValueOrDie(); + auto llvmModule = mlir::translateModuleToNVVMIR(*kernel_module); + if (!llvmModule) { + return InternalError("Could not translate MLIR module to NVVM"); + } + + llvmModule->setModuleIdentifier("acme"); + llvmModule->setDataLayout(xla::gpu::nvptx::kDataLayout); + + xla::HloModuleConfig config; + config.set_debug_options(xla::GetDebugOptionsFromFlags()); + + TF_ASSIGN_OR_RETURN(std::string libdevice_dir, GetLibdeviceDir(config)); + TF_ASSIGN_OR_RETURN(std::string ptx, xla::gpu::nvptx::CompileToPtx( + llvmModule.get(), compute_capability, + config, libdevice_dir)); + VLOG(1) << ptx; + +#if GOOGLE_CUDA + return tensorflow::se::CompileGpuAsm( + std::get<0>(compute_capability), std::get<1>(compute_capability), + ptx.c_str(), xla::gpu::PtxOptsFromConfig(config)); +#else + return InternalError( + "GOOGLE_CUDA not defined. Did you specify --config=cuda ?"); +#endif +} diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.h b/tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.h new file mode 100644 index 00000000000..47626ba9d0d --- /dev/null +++ b/tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.h @@ -0,0 +1,42 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +//===- cubin_creator.h ------------------------------------------*- C++ -*-===// +// +// This file declares the function to compile a TF kernel function to a cubin. +// +//===----------------------------------------------------------------------===// +#ifndef TENSORFLOW_COMPILER_MLIR_TOOLS_KERNEL_GEN_CUBIN_CREATOR_H_ +#define TENSORFLOW_COMPILER_MLIR_TOOLS_KERNEL_GEN_CUBIN_CREATOR_H_ + +#include +#include + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/StringRef.h" +#include "tensorflow/compiler/xla/statusor.h" + +namespace tensorflow { +namespace kernel_gen { +xla::StatusOr> GenerateCubinForTfCode( + llvm::StringRef tf_code, + std::pair compute_capability = {7, 5}, + llvm::ArrayRef tile_sizes = {16, 64}, + llvm::ArrayRef same_shape = {}, + llvm::ArrayRef unroll_factors = {}); +} // namespace kernel_gen +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TOOLS_KERNEL_GEN_CUBIN_CREATOR_H_ diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tf_to_cubin.cc b/tensorflow/compiler/mlir/tools/kernel_gen/tf_to_cubin.cc new file mode 100644 index 00000000000..8edc567e777 --- /dev/null +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tf_to_cubin.cc @@ -0,0 +1,118 @@ +// Copyright 2020 The TensorFlow Runtime Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//===- tf_to_cubin.cc -------------------------------------------*- C++ -*-===// +// +// This file implements the entry point to compile a tf op to a cubin file. 
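//
// Example invocation (editor's sketch, not part of the original patch; the
// exact MLIR text accepted as input is an assumption):
//
//   tf_to_cubin --arch=75 --tile_sizes=16,64 --output=tanh.cubin \
//     'func @tanh(%arg0: tensor<?xf32>) -> tensor<?xf32> {
//        %0 = "tf.Tanh"(%arg0) : (tensor<?xf32>) -> tensor<?xf32>
//        return %0 : tensor<?xf32>
//      }'
//
// The remaining positional argument is the TF-dialect MLIR handed to
// GenerateCubinForTfCode, and --arch=75 maps to compute capability {7, 5}
// (sm_75) via architecture / 10 and architecture % 10.
//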
+// +//===----------------------------------------------------------------------===// +#include +#include +#include + +#include "absl/strings/numbers.h" +#include "absl/strings/str_split.h" +#include "absl/strings/string_view.h" +#include "tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/init_main.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/util/command_line_flags.h" + +namespace { +bool ParseStringList(std::string string_list, std::vector* result) { + result->clear(); + uint32_t item; + auto items = absl::StrSplit(string_list, ','); + for (const auto& item_str : items) { + if (!absl::SimpleAtoi(item_str, &item)) { + LOG(ERROR) << "Expected token " << item_str << " to be an integer"; + return false; + } + result->push_back(item); + } + return true; +} +} // namespace + +int main(int argc, char** argv) { + std::string output_file = "foo.bin"; + int32_t architecture = 50; + std::vector tile_sizes; + std::vector unroll_factors; + std::vector same_shape; + + auto parse_tile_sizes = [&tile_sizes](std::string tile_sizes_str) { + if (!ParseStringList(tile_sizes_str, &tile_sizes)) { + return false; + } + // Initialize with the default. + if (tile_sizes.empty()) { + tile_sizes.push_back(16); + tile_sizes.push_back(64); + } + return true; + }; + + auto parse_unroll_factors = + [&unroll_factors](std::string unroll_factors_str) { + return ParseStringList(unroll_factors_str, &unroll_factors); + }; + + auto parse_same_shape = [&same_shape](std::string same_shape_str) { + return ParseStringList(same_shape_str, &same_shape); + }; + + std::vector flag_list = { + tensorflow::Flag("output", &output_file, "output file"), + tensorflow::Flag("arch", &architecture, + "target architecture (e.g. 
50 for sm_50)"), + tensorflow::Flag("tile_sizes", parse_tile_sizes, "16,64", + "tile sizes to use"), + tensorflow::Flag("unroll_factors", parse_unroll_factors, "", + "factors to unroll by, separated by commas"), + tensorflow::Flag("same_shape", parse_same_shape, "", + "arguments with same shape, separated by commas"), + }; + bool parse_ok = tensorflow::Flags::Parse(&argc, argv, flag_list); + tensorflow::port::InitMain("usage", &argc, &argv); + if (!parse_ok) { + return 1; + } + + std::pair compute_capability(architecture / 10, + architecture % 10); + + auto cubin = tensorflow::kernel_gen::GenerateCubinForTfCode( + argv[1], compute_capability, tile_sizes, same_shape, unroll_factors); + + if (!cubin.ok()) { + LOG(ERROR) << cubin.status(); + return 1; + } + + std::vector cubin_data = cubin.ConsumeValueOrDie(); + + auto status = tensorflow::WriteStringToFile( + tensorflow::Env::Default(), output_file, + absl::string_view{reinterpret_cast(cubin_data.data()), + cubin_data.size()}); + + if (!status.ok()) { + LOG(ERROR) << status; + return 1; + } + + return 0; +} diff --git a/tensorflow/compiler/mlir/xla/BUILD b/tensorflow/compiler/mlir/xla/BUILD index 122692059bf..179a637ec7b 100644 --- a/tensorflow/compiler/mlir/xla/BUILD +++ b/tensorflow/compiler/mlir/xla/BUILD @@ -11,9 +11,10 @@ package_group( includes = ["//third_party/mlir:subpackages"], packages = [ "//babelfish/device/...", + "//learning/brain/experimental/dtensor/...", "//learning/brain/experimental/mlir/...", - "//learning/brain/experimental/swift_mlir/...", "//learning/brain/google/xla/kernels/...", + "//learning/brain/google/xla/mlir/...", "//learning/brain/swift/swift_mlir/...", "//learning/pathways/data_parallel/tf2xla/...", "//platforms/xla/...", @@ -22,7 +23,6 @@ package_group( "//tensorflow/compiler/xla/...", "//third_party/iree/...", "//third_party/mlir_edge/...", - "//third_party/tf_runtime/tools/tf_kernel_gen/...", ], ) @@ -31,25 +31,25 @@ exports_files(["ir/hlo_ops.td"]) filegroup( name = "hlo_ops_td_files", srcs = [ - "ir/hlo_client_ops.td", + "ir/chlo_ops.td", "ir/hlo_ops.td", "ir/hlo_ops_base.td", "ir/hlo_utils.td", "ir/lhlo_ops.td", "@llvm-project//mlir:OpBaseTdFiles", "@llvm-project//mlir:include/mlir/Interfaces/InferTypeOpInterface.td", - "@llvm-project//mlir:include/mlir/Interfaces/SideEffects.td", + "@llvm-project//mlir:include/mlir/Interfaces/SideEffectInterfaces.td", ], ) gentbl( - name = "hlo_client_ops_inc_gen", + name = "chlo_ops_inc_gen", tbl_outs = [ - ("-gen-op-decls", "ir/hlo_client_ops.h.inc"), - ("-gen-op-defs", "ir/hlo_client_ops.cc.inc"), + ("-gen-op-decls", "ir/chlo_ops.h.inc"), + ("-gen-op-defs", "ir/chlo_ops.cc.inc"), ], tblgen = "@llvm-project//mlir:mlir-tblgen", - td_file = "ir/hlo_client_ops.td", + td_file = "ir/chlo_ops.td", td_srcs = [ ":hlo_ops_td_files", ], @@ -132,12 +132,14 @@ cc_library( "transforms/legalize_tf_control_flow.cc", ], deps = [ + ":chlo_legalize_to_hlo", ":convert_op_folder", ":hlo", "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:lower_tf_lib", "//tensorflow/compiler/xla:xla_data_proto_cc", "//tensorflow/compiler/xla/client:padding", + "//tensorflow/compiler/xla/client:sharding_builder", "//tensorflow/core:framework", "//tensorflow/core/kernels:conv_grad_shape_utils", "@llvm-project//llvm:support", @@ -145,6 +147,7 @@ cc_library( "@llvm-project//mlir:Dialect", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Shape", "@llvm-project//mlir:StandardOps", "@llvm-project//mlir:Support", "@llvm-project//mlir:Transforms", @@ 
-162,6 +165,7 @@ cc_library( ":mlir_hlo_builder", "//tensorflow/compiler/mlir:op_or_arg_name_mapper", "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:convert_tensor", "//tensorflow/compiler/mlir/tensorflow:convert_type", "//tensorflow/compiler/mlir/tensorflow:export_tf_dialect_op", "//tensorflow/compiler/mlir/tensorflow:lower_tf_lib", @@ -183,11 +187,30 @@ cc_library( "@llvm-project//llvm:support", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", + "@llvm-project//mlir:StandardOps", "@llvm-project//mlir:Support", ], alwayslink = 1, ) +cc_library( + name = "xla_sink_constants_to_control_flow", + srcs = [ + "transforms/sink_constants_to_control_flow.cc", + ], + deps = [ + ":hlo", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:lower_tf_lib", + "@llvm-project//llvm:support", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:Transforms", + ], + alwayslink = 1, +) + cc_library( name = "map_xla_to_scalar_op", hdrs = ["transforms/map_xla_to_scalar_op.h"], @@ -236,8 +259,8 @@ cc_library( "@llvm-project//llvm:support", "@llvm-project//mlir:IR", "@llvm-project//mlir:LinalgOps", - "@llvm-project//mlir:LoopOps", "@llvm-project//mlir:Pass", + "@llvm-project//mlir:SCFDialect", "@llvm-project//mlir:StandardOps", "@llvm-project//mlir:Transforms", ], @@ -274,8 +297,8 @@ cc_library( "@llvm-project//mlir:GPUDialect", "@llvm-project//mlir:IR", "@llvm-project//mlir:LinalgOps", - "@llvm-project//mlir:LoopOps", "@llvm-project//mlir:Pass", + "@llvm-project//mlir:SCFDialect", "@llvm-project//mlir:StandardOps", "@llvm-project//mlir:Transforms", ], @@ -331,8 +354,6 @@ cc_library( srcs = ["transforms/buffer_assignment.cc"], hdrs = ["transforms/buffer_assignment.h"], deps = [ - ":hlo", - ":lhlo", "@com_google_absl//absl/memory", "@llvm-project//mlir:Analysis", "@llvm-project//mlir:IR", @@ -344,6 +365,26 @@ cc_library( alwayslink = 1, ) +cc_library( + name = "buffer_assignment_test", + srcs = ["transforms/buffer_assignment_test.cc"], + hdrs = [ + "transforms/buffer_assignment.h", + "transforms/passes.h", + ], + deps = [ + "@com_google_absl//absl/memory", + "@llvm-project//llvm:support", + "@llvm-project//mlir:Analysis", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:StandardOps", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:Transforms", + ], + alwayslink = 1, +) + gentbl( name = "xla_legalize_to_standard_inc_gen", tbl_outs = [ @@ -374,6 +415,28 @@ cc_library( alwayslink = 1, ) +cc_library( + name = "xla_hlo_to_lhlo_with_xla", + srcs = ["transforms/xla_hlo_to_lhlo_with_xla.cc"], + hdrs = ["transforms/xla_hlo_to_lhlo_with_xla.h"], + deps = [ + ":hlo", + ":hlo_utils", + ":lhlo", + ":mlir_hlo_to_hlo", + ":xla_dialect_registration", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla/service:buffer_assignment", + "//tensorflow/compiler/xla/service:hlo", + "@llvm-project//llvm:support", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:StandardOps", + ], + alwayslink = 1, +) + cc_library( name = "xla_legalize_to_standard", srcs = ["transforms/legalize_to_standard.cc"], @@ -452,17 +515,35 @@ cc_library( ) cc_library( - name = "xla_test_passes", + name = "chlo_legalize_to_hlo", srcs = [ - "transforms/materialize_broadcasts_pass.cc", - "transforms/unfuse_batch_norm_pass.cc", + "transforms/chlo_legalize_to_hlo.cc", ], deps = [ ":hlo", - 
":xla_materialize_broadcasts", - ":xla_unfuse_batch_norm", "@llvm-project//mlir:IR", + "@llvm-project//mlir:Shape", + "@llvm-project//mlir:Transforms", + ], +) + +cc_library( + name = "xla_test_passes", + srcs = [ + "transforms/chlo_legalize_to_hlo_pass.cc", + "transforms/materialize_broadcasts_pass.cc", + "transforms/test_infer_shaped_type_pass.cc", + "transforms/unfuse_batch_norm_pass.cc", + ], + deps = [ + ":chlo_legalize_to_hlo", # build-cleaner: keep + ":hlo", + ":xla_materialize_broadcasts", # build-cleaner: keep + ":xla_unfuse_batch_norm", # build-cleaner: keep + "@llvm-project//mlir:IR", + "@llvm-project//mlir:InferTypeOpInterface", "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Shape", "@llvm-project//mlir:StandardOps", "@llvm-project//mlir:Transforms", ], @@ -472,14 +553,16 @@ cc_library( cc_library( name = "hlo", srcs = [ - "ir/hlo_client_ops.cc", + "ir/broadcast_utils.cc", + "ir/chlo_ops.cc", "ir/hlo_ops.cc", "ir/hlo_ops.cc.inc", "ir/hlo_ops.h.inc", "ir/hlo_utils.cc", ], hdrs = [ - "ir/hlo_client_ops.h", + "ir/broadcast_utils.h", + "ir/chlo_ops.h", "ir/hlo_ops.h", "ir/hlo_utils.h", "transforms/passes.h", @@ -487,8 +570,8 @@ cc_library( ], includes = ["include"], deps = [ + ":chlo_ops_inc_gen", ":convert_op_folder", - ":hlo_client_ops_inc_gen", ":hlo_ops_base_inc_gen", ":hlo_ops_inc_gen", ":xla_canonicalize_inc_gen", @@ -498,6 +581,7 @@ cc_library( "@llvm-project//mlir:IR", "@llvm-project//mlir:InferTypeOpInterface", "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Shape", "@llvm-project//mlir:SideEffects", "@llvm-project//mlir:StandardOps", "@llvm-project//mlir:Support", @@ -516,12 +600,14 @@ cc_library( "ir/mlir_hlo_builder.h", ], deps = [ + ":attribute_importer", ":hlo", ":hlo_utils", ":type_to_shape", "//tensorflow/compiler/xla:comparison_util", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla/client:xla_builder", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:shape_inference", @@ -572,6 +658,7 @@ cc_library( ":hlo", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/core:lib", "@llvm-project//mlir:IR", ], alwayslink = 1, @@ -650,6 +737,7 @@ cc_library( "//tensorflow/compiler/xla/client/lib:slicing", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/core:framework", + "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", "//tensorflow/stream_executor/lib", "@llvm-project//llvm:support", @@ -686,6 +774,7 @@ cc_library( "hlo_module_importer.h", ], deps = [ + ":attribute_importer", ":hlo", ":hlo_utils", "//tensorflow/compiler/mlir/tensorflow:error_util", @@ -705,6 +794,18 @@ cc_library( ], ) +cc_library( + name = "attribute_importer", + srcs = ["attribute_importer.cc"], + hdrs = ["attribute_importer.h"], + deps = [ + ":hlo", + "//tensorflow/compiler/xla:xla_data_proto_cc", + "//tensorflow/core/platform:types", + "@llvm-project//mlir:IR", + ], +) + cc_library( name = "xla_mlir_translate", srcs = ["xla_mlir_translate.cc"], @@ -740,7 +841,7 @@ genrule( name = "operator_writer_inc", srcs = [ "@llvm-project//mlir:include/mlir/Interfaces/InferTypeOpInterface.td", - "@llvm-project//mlir:include/mlir/Interfaces/SideEffects.td", + "@llvm-project//mlir:include/mlir/Interfaces/SideEffectInterfaces.td", "@llvm-project//mlir:include/mlir/IR/OpBase.td", ":ir/hlo_ops.td", ":ir/hlo_ops_base.td", @@ -771,6 +872,8 @@ cc_library( ], deps = [ ":buffer_assignment", + ":buffer_assignment_test", + ":chlo_legalize_to_hlo", 
":hlo", ":hlo_legalize_to_lhlo", ":lhlo", @@ -780,6 +883,7 @@ cc_library( ":lhlo_legalize_to_gpu", ":lhlo_legalize_to_parallel_loops", ":xla_dialect_registration", + ":xla_hlo_to_lhlo_with_xla", ":xla_legalize_control_flow", ":xla_legalize_tf", ":xla_legalize_tf_with_tf2xla", @@ -787,6 +891,7 @@ cc_library( ":xla_legalize_to_standard", ":xla_lower", ":xla_materialize_broadcasts", + ":xla_sink_constants_to_control_flow", ":xla_test_passes", ], ) @@ -795,6 +900,8 @@ tf_cc_binary( name = "xla-opt", deps = [ ":all_xla_passes_for_testing", + "//tensorflow/compiler/jit:xla_cpu_jit", + "//tensorflow/compiler/jit:xla_gpu_jit", "//tensorflow/compiler/mlir:tf_mlir_opt_main", ], ) diff --git a/tensorflow/compiler/mlir/xla/attribute_importer.cc b/tensorflow/compiler/mlir/xla/attribute_importer.cc new file mode 100644 index 00000000000..201ec0d053f --- /dev/null +++ b/tensorflow/compiler/mlir/xla/attribute_importer.cc @@ -0,0 +1,124 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/xla/attribute_importer.h" + +#include + +namespace xla { + +static mlir::DenseIntElementsAttr Convert(llvm::ArrayRef elements, + mlir::Builder* builder) { + return mlir::DenseIntElementsAttr::get( + mlir::RankedTensorType::get(elements.size(), builder->getIntegerType(64)), + elements); +} + +mlir::ArrayAttr ConvertPrecisionConfig(const PrecisionConfig* config, + mlir::Builder* builder) { + if (!config) return {}; + + // TODO(b/129709049) The HLO text format elides this in the all DEFAULT + // case and the parser sticks it in. Maybe we should too. + llvm::SmallVector operand_precision_attrs; + + for (auto prec : config->operand_precision()) { + operand_precision_attrs.push_back( + builder->getStringAttr(PrecisionConfig_Precision_Name(prec))); + } + return builder->getArrayAttr(operand_precision_attrs); +} + +// Converts the gather dimensions to attributes. 
+mlir::xla_hlo::GatherDimensionNumbers ConvertGatherDimensionNumbers( + const xla::GatherDimensionNumbers& dnums, mlir::Builder* builder) { + std::vector offset_dims(dnums.offset_dims().begin(), + dnums.offset_dims().end()); + std::vector collapsed_slice_dims( + dnums.collapsed_slice_dims().begin(), dnums.collapsed_slice_dims().end()); + std::vector start_index_map(dnums.start_index_map().begin(), + dnums.start_index_map().end()); + return mlir::xla_hlo::GatherDimensionNumbers::get( + Convert(offset_dims, builder), Convert(collapsed_slice_dims, builder), + Convert(start_index_map, builder), + builder->getI64IntegerAttr(dnums.index_vector_dim()), + builder->getContext()); +} + +mlir::xla_hlo::ScatterDimensionNumbers ConvertScatterDimensionNumbers( + const xla::ScatterDimensionNumbers& dnums, mlir::Builder* builder) { + std::vector update_window_dims(dnums.update_window_dims().begin(), + dnums.update_window_dims().end()); + std::vector inserted_window_dims( + dnums.inserted_window_dims().begin(), dnums.inserted_window_dims().end()); + std::vector scatter_dims_to_operand_dims( + dnums.scatter_dims_to_operand_dims().begin(), + dnums.scatter_dims_to_operand_dims().end()); + return mlir::xla_hlo::ScatterDimensionNumbers::get( + Convert(update_window_dims, builder), + Convert(inserted_window_dims, builder), + Convert(scatter_dims_to_operand_dims, builder), + builder->getI64IntegerAttr(dnums.index_vector_dim()), + builder->getContext()); +} + +mlir::xla_hlo::DotDimensionNumbers ConvertDotDimensionNumbers( + const DotDimensionNumbers& dnums, mlir::Builder* builder) { + std::vector rhs_contracting_dimensions( + dnums.rhs_contracting_dimensions().begin(), + dnums.rhs_contracting_dimensions().end()); + std::vector lhs_contracting_dimensions( + dnums.lhs_contracting_dimensions().begin(), + dnums.lhs_contracting_dimensions().end()); + std::vector rhs_batch_dimensions( + dnums.rhs_batch_dimensions().begin(), dnums.rhs_batch_dimensions().end()); + std::vector lhs_batch_dimensions( + dnums.lhs_batch_dimensions().begin(), dnums.lhs_batch_dimensions().end()); + + // Push the attributes into our new DictionaryAttr. 
+ auto lhs_batch_dims_attr = Convert(lhs_batch_dimensions, builder); + auto rhs_batch_dims_attr = Convert(rhs_batch_dimensions, builder); + auto lhs_contracting_dims_attr = Convert(lhs_contracting_dimensions, builder); + auto rhs_contracting_dims_attr = Convert(rhs_contracting_dimensions, builder); + + return mlir::xla_hlo::DotDimensionNumbers::get( + lhs_batch_dims_attr, rhs_batch_dims_attr, lhs_contracting_dims_attr, + rhs_contracting_dims_attr, builder->getContext()); +} + +mlir::xla_hlo::ConvDimensionNumbers ConvertConvDimensionNumbers( + const xla::ConvolutionDimensionNumbers& dnums, mlir::Builder* builder) { + llvm::SmallVector input_spatial_dims( + dnums.input_spatial_dimensions().begin(), + dnums.input_spatial_dimensions().end()); + llvm::SmallVector kernel_spatial_dims( + dnums.kernel_spatial_dimensions().begin(), + dnums.kernel_spatial_dimensions().end()); + llvm::SmallVector output_spatial_dims( + dnums.output_spatial_dimensions().begin(), + dnums.output_spatial_dimensions().end()); + return mlir::xla_hlo::ConvDimensionNumbers::get( + builder->getI64IntegerAttr(dnums.input_batch_dimension()), + builder->getI64IntegerAttr(dnums.input_feature_dimension()), + Convert(input_spatial_dims, builder), + builder->getI64IntegerAttr(dnums.kernel_input_feature_dimension()), + builder->getI64IntegerAttr(dnums.kernel_output_feature_dimension()), + Convert(kernel_spatial_dims, builder), + builder->getI64IntegerAttr(dnums.output_batch_dimension()), + builder->getI64IntegerAttr(dnums.output_feature_dimension()), + Convert(output_spatial_dims, builder), builder->getContext()); +} + +} // namespace xla diff --git a/tensorflow/compiler/mlir/xla/attribute_importer.h b/tensorflow/compiler/mlir/xla/attribute_importer.h new file mode 100644 index 00000000000..9a7ae338334 --- /dev/null +++ b/tensorflow/compiler/mlir/xla/attribute_importer.h @@ -0,0 +1,49 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_XLA_ATTRIBUTE_IMPORTER_H_ +#define TENSORFLOW_COMPILER_MLIR_XLA_ATTRIBUTE_IMPORTER_H_ + +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "tensorflow/compiler/mlir/xla/ir/hlo_ops.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/platform/types.h" + +namespace xla { + +// Converts an XLA PrecisionConfig to the corresponding MLIR attribute. +mlir::ArrayAttr ConvertPrecisionConfig(const PrecisionConfig* config, + mlir::Builder* builder); + +// Converts the gather dimensions to attributes. +mlir::xla_hlo::GatherDimensionNumbers ConvertGatherDimensionNumbers( + const xla::GatherDimensionNumbers& dnums, mlir::Builder* builder); + +// Converts the scatter dimensions to attributes. 
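//
// Example (editor's sketch, not part of the original patch) of building a
// dimension-numbers proto and converting it to the MLIR attribute:
//
//   xla::ScatterDimensionNumbers dnums;
//   dnums.add_update_window_dims(1);
//   dnums.add_inserted_window_dims(0);
//   dnums.add_scatter_dims_to_operand_dims(0);
//   dnums.set_index_vector_dim(1);
//   mlir::MLIRContext context;
//   mlir::Builder builder(&context);
//   auto attr = xla::ConvertScatterDimensionNumbers(dnums, &builder);
//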
+mlir::xla_hlo::ScatterDimensionNumbers ConvertScatterDimensionNumbers( + const xla::ScatterDimensionNumbers& dnums, mlir::Builder* builder); + +// Converts the dot dimensions to attributes. +mlir::xla_hlo::DotDimensionNumbers ConvertDotDimensionNumbers( + const DotDimensionNumbers& dnums, mlir::Builder* builder); + +// Converts the conv dimensions to attributes. +mlir::xla_hlo::ConvDimensionNumbers ConvertConvDimensionNumbers( + const xla::ConvolutionDimensionNumbers& dnums, mlir::Builder* builder); + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_MLIR_XLA_ATTRIBUTE_IMPORTER_H_ diff --git a/tensorflow/compiler/mlir/xla/hlo_function_importer.cc b/tensorflow/compiler/mlir/xla/hlo_function_importer.cc index a49648b0b37..718db1597cf 100644 --- a/tensorflow/compiler/mlir/xla/hlo_function_importer.cc +++ b/tensorflow/compiler/mlir/xla/hlo_function_importer.cc @@ -29,6 +29,7 @@ limitations under the License. #include "mlir/IR/Region.h" // from @llvm-project #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" +#include "tensorflow/compiler/mlir/xla/attribute_importer.h" #include "tensorflow/compiler/mlir/xla/hlo_utils.h" #include "tensorflow/compiler/mlir/xla/ir/hlo_ops.h" #include "tensorflow/compiler/xla/protobuf_util.h" @@ -56,6 +57,7 @@ using mlir::Value; namespace xla { namespace { + // Note: This sanitization function causes an irreversible many-to-one mapping // and any solution to mitigate this would cause issues with the reverse // direction. Longterm solution is to add a function attribute to maintain the @@ -230,15 +232,19 @@ StatusOr HloFunctionImporter::ImportInstruction( #undef MakeAndReturnBatchNormOp case HloOpcode::kDot: { - attributes.push_back(ConvertPrecisionConfig(instruction)); + attributes.push_back(builder_->getNamedAttr( + "precision_config", + ConvertPrecisionConfig(&instruction->precision_config(), builder_))); // Consider consolidating DotOps together. 
if (DotIsDefault(instruction)) { MakeAndReturn(DotOp); } - attributes.push_back( - ConvertDotDimensionNumbers(instruction->dot_dimension_numbers())); + attributes.push_back(builder_->getNamedAttr( + "dot_dimension_numbers", + ConvertDotDimensionNumbers(instruction->dot_dimension_numbers(), + builder_))); MakeAndReturn(DotGeneralOp); } case HloOpcode::kCall: { @@ -278,8 +284,10 @@ StatusOr HloFunctionImporter::ImportInstruction( } case HloOpcode::kGather: { auto gather_instruction = Cast(instruction); - attributes.push_back(ConvertGatherDimensionNumbers( - gather_instruction->gather_dimension_numbers())); + attributes.push_back(builder_->getNamedAttr( + "dimension_numbers", + ConvertGatherDimensionNumbers( + gather_instruction->gather_dimension_numbers(), builder_))); std::vector slice_sizes( gather_instruction->gather_slice_sizes().begin(), @@ -296,9 +304,11 @@ StatusOr HloFunctionImporter::ImportInstruction( std::vector slice_sizes( instruction->dynamic_slice_sizes().begin(), instruction->dynamic_slice_sizes().end()); - attributes.push_back( - builder_->getNamedAttr("slice_sizes", Convert(slice_sizes))); - MakeAndReturn(DynamicSliceOp); + return func_builder + ->create( + loc, result_type, operands[0], + makeArrayRef(operands).drop_front(), Convert(slice_sizes)) + .getOperation(); } case HloOpcode::kDynamicUpdateSlice: { return func_builder @@ -343,8 +353,10 @@ StatusOr HloFunctionImporter::ImportInstruction( } case HloOpcode::kScatter: { auto scatter = Cast(instruction); - attributes.push_back( - ConvertScatterDimensionNumbers(scatter->scatter_dimension_numbers())); + attributes.push_back(builder_->getNamedAttr( + "scatter_dimension_numbers", + ConvertScatterDimensionNumbers(scatter->scatter_dimension_numbers(), + builder_))); attributes.push_back(builder_->getNamedAttr( "indices_are_sorted", builder_->getBoolAttr(scatter->indices_are_sorted()))); @@ -411,8 +423,8 @@ StatusOr HloFunctionImporter::ImportInstruction( TF_RETURN_IF_ERROR(GetMlirTypes( {instruction->true_computation()->root_instruction()}, &rets)); - auto op = func_builder->create( - loc, rets, operands, attributes); + auto op = func_builder->create(loc, rets, operands, + attributes); TF_RETURN_IF_ERROR(ImportComputation(instruction->true_computation(), &op.true_branch())); TF_RETURN_IF_ERROR(ImportComputation(instruction->false_computation(), @@ -575,15 +587,20 @@ StatusOr HloFunctionImporter::ImportInstruction( builder_->getNamedAttr("lhs_dilations", Convert(lhs_dilations))); attributes.push_back( builder_->getNamedAttr("rhs_dilations", Convert(rhs_dilations))); - attributes.push_back(ConvertConvDimensionNumbers( - instruction->convolution_dimension_numbers())); + attributes.push_back(builder_->getNamedAttr( + "dimension_numbers", + ConvertConvDimensionNumbers( + instruction->convolution_dimension_numbers(), builder_))); attributes.push_back(builder_->getNamedAttr( "feature_group_count", builder_->getI64IntegerAttr(instruction->feature_group_count()))); attributes.push_back(builder_->getNamedAttr( "batch_group_count", builder_->getI64IntegerAttr(instruction->batch_group_count()))); - attributes.push_back(ConvertPrecisionConfig(instruction)); + attributes.push_back(builder_->getNamedAttr( + "precision_config", + ConvertPrecisionConfig(&instruction->precision_config(), builder_))); + MakeAndReturn(ConvOp); } @@ -715,20 +732,6 @@ StatusOr HloFunctionImporter::GetMlirValue(HloInstruction* instruction) { "Unable to find value for input: ", instruction->ToString())); } -mlir::NamedAttribute 
HloFunctionImporter::ConvertPrecisionConfig( - HloInstruction* instruction) { - // TODO(b/129709049) The HLO text format elides this in the all DEFAULT - // case and the parser sticks it in. Maybe we should too. - llvm::SmallVector operand_precision_attrs; - - for (auto prec : instruction->precision_config().operand_precision()) { - operand_precision_attrs.push_back( - builder_->getStringAttr(PrecisionConfig_Precision_Name(prec))); - } - return builder_->getNamedAttr( - "precision_config", builder_->getArrayAttr(operand_precision_attrs)); -} - mlir::NamedAttribute HloFunctionImporter::ConvertComparisonDirection( HloInstruction* instruction) { return builder_->getNamedAttr( @@ -749,10 +752,10 @@ mlir::DenseIntElementsAttr HloFunctionImporter::ConvertDimensions( } mlir::DenseIntElementsAttr HloFunctionImporter::Convert( - llvm::ArrayRef op_dimensions) { + llvm::ArrayRef elements) { return DenseIntElementsAttr::get( - RankedTensorType::get(op_dimensions.size(), builder_->getIntegerType(64)), - op_dimensions); + RankedTensorType::get(elements.size(), builder_->getIntegerType(64)), + elements); } mlir::NamedAttribute HloFunctionImporter::ConvertPadding( @@ -764,86 +767,6 @@ mlir::NamedAttribute HloFunctionImporter::ConvertPadding( return builder_->getNamedAttr("padding", attr); } -mlir::NamedAttribute HloFunctionImporter::ConvertDotDimensionNumbers( - const DotDimensionNumbers& dnums) { - std::vector rhs_contracting_dimensions( - dnums.rhs_contracting_dimensions().begin(), - dnums.rhs_contracting_dimensions().end()); - std::vector lhs_contracting_dimensions( - dnums.lhs_contracting_dimensions().begin(), - dnums.lhs_contracting_dimensions().end()); - std::vector rhs_batch_dimensions( - dnums.rhs_batch_dimensions().begin(), dnums.rhs_batch_dimensions().end()); - std::vector lhs_batch_dimensions( - dnums.lhs_batch_dimensions().begin(), dnums.lhs_batch_dimensions().end()); - - // Push the attributes into our new DictionaryAttr. 
- auto lhs_batch_dims_attr = Convert(lhs_batch_dimensions); - auto rhs_batch_dims_attr = Convert(rhs_batch_dimensions); - auto lhs_contracting_dims_attr = Convert(lhs_contracting_dimensions); - auto rhs_contracting_dims_attr = Convert(rhs_contracting_dimensions); - - auto attr = mlir::xla_hlo::DotDimensionNumbers::get( - lhs_batch_dims_attr, rhs_batch_dims_attr, lhs_contracting_dims_attr, - rhs_contracting_dims_attr, context_); - return builder_->getNamedAttr("dot_dimension_numbers", attr); -} - -mlir::NamedAttribute HloFunctionImporter::ConvertConvDimensionNumbers( - const xla::ConvolutionDimensionNumbers& dnums) { - llvm::SmallVector input_spatial_dims( - dnums.input_spatial_dimensions().begin(), - dnums.input_spatial_dimensions().end()); - llvm::SmallVector kernel_spatial_dims( - dnums.kernel_spatial_dimensions().begin(), - dnums.kernel_spatial_dimensions().end()); - llvm::SmallVector output_spatial_dims( - dnums.output_spatial_dimensions().begin(), - dnums.output_spatial_dimensions().end()); - auto attr = mlir::xla_hlo::ConvDimensionNumbers::get( - builder_->getI64IntegerAttr(dnums.input_batch_dimension()), - builder_->getI64IntegerAttr(dnums.input_feature_dimension()), - Convert(input_spatial_dims), - builder_->getI64IntegerAttr(dnums.kernel_input_feature_dimension()), - builder_->getI64IntegerAttr(dnums.kernel_output_feature_dimension()), - Convert(kernel_spatial_dims), - builder_->getI64IntegerAttr(dnums.output_batch_dimension()), - builder_->getI64IntegerAttr(dnums.kernel_output_feature_dimension()), - Convert(output_spatial_dims), context_); - return builder_->getNamedAttr("dimension_numbers", attr); -} - -mlir::NamedAttribute HloFunctionImporter::ConvertGatherDimensionNumbers( - const xla::GatherDimensionNumbers& dnums) { - std::vector offset_dims(dnums.offset_dims().begin(), - dnums.offset_dims().end()); - std::vector collapsed_slice_dims( - dnums.collapsed_slice_dims().begin(), dnums.collapsed_slice_dims().end()); - std::vector start_index_map(dnums.start_index_map().begin(), - dnums.start_index_map().end()); - auto attr = mlir::xla_hlo::GatherDimensionNumbers::get( - Convert(offset_dims), Convert(collapsed_slice_dims), - Convert(start_index_map), - builder_->getI64IntegerAttr(dnums.index_vector_dim()), context_); - return builder_->getNamedAttr("dimension_numbers", attr); -} - -mlir::NamedAttribute HloFunctionImporter::ConvertScatterDimensionNumbers( - const xla::ScatterDimensionNumbers& dnums) { - std::vector update_window_dims(dnums.update_window_dims().begin(), - dnums.update_window_dims().end()); - std::vector inserted_window_dims( - dnums.inserted_window_dims().begin(), dnums.inserted_window_dims().end()); - std::vector scatter_dims_to_operand_dims( - dnums.scatter_dims_to_operand_dims().begin(), - dnums.scatter_dims_to_operand_dims().end()); - auto attr = mlir::xla_hlo::ScatterDimensionNumbers::get( - Convert(update_window_dims), Convert(inserted_window_dims), - Convert(scatter_dims_to_operand_dims), - builder_->getI64IntegerAttr(dnums.index_vector_dim()), context_); - return builder_->getNamedAttr("scatter_dimension_numbers", attr); -} - mlir::NamedAttribute HloFunctionImporter::ConvertSourceTargetPairs( const std::vector>& source_target_pairs) { diff --git a/tensorflow/compiler/mlir/xla/hlo_function_importer.h b/tensorflow/compiler/mlir/xla/hlo_function_importer.h index 5dfa0adac82..14b6d309e94 100644 --- a/tensorflow/compiler/mlir/xla/hlo_function_importer.h +++ b/tensorflow/compiler/mlir/xla/hlo_function_importer.h @@ -89,9 +89,6 @@ class HloFunctionImporter { // 
Returns the Mlir Value for the corresponding HloInstruction. StatusOr GetMlirValue(xla::HloInstruction* instruction); - // Converts an XLA PrecisionConfig to the corresponding MLIR attribute. - mlir::NamedAttribute ConvertPrecisionConfig(xla::HloInstruction* instruction); - // Converts an XLA ComparisonDirection to the corresponding MLIR attribute. mlir::NamedAttribute ConvertComparisonDirection( xla::HloInstruction* instruction); @@ -101,28 +98,12 @@ class HloFunctionImporter { llvm::ArrayRef op_dimensions); // Converts Array ref to an DenseIntElementsAttr. - mlir::DenseIntElementsAttr Convert(llvm::ArrayRef op_dimensions); + mlir::DenseIntElementsAttr Convert(llvm::ArrayRef elements); // Converts Array ref to padding attribute. Input is a flattened list of // padding low and padding high for each of the spatial dimensions. mlir::NamedAttribute ConvertPadding(llvm::ArrayRef padding); - // Converts the dot dimensions to attribute. - mlir::NamedAttribute ConvertDotDimensionNumbers( - const DotDimensionNumbers& dnums); - - // Converts the conv dimensions to attributes. - mlir::NamedAttribute ConvertConvDimensionNumbers( - const xla::ConvolutionDimensionNumbers& dnums); - - // Converts the gather dimensions to attributes. - mlir::NamedAttribute ConvertGatherDimensionNumbers( - const xla::GatherDimensionNumbers& dnums); - - // Converts the scatter dimensions to attributes. - mlir::NamedAttribute ConvertScatterDimensionNumbers( - const xla::ScatterDimensionNumbers& dnums); - // Converts replica groups to attribute mlir::NamedAttribute ConvertReplicaGroups( const std::vector& replica_groups); diff --git a/tensorflow/compiler/mlir/xla/hlo_utils.cc b/tensorflow/compiler/mlir/xla/hlo_utils.cc index dfed190ba1e..dc801f64ede 100644 --- a/tensorflow/compiler/mlir/xla/hlo_utils.cc +++ b/tensorflow/compiler/mlir/xla/hlo_utils.cc @@ -22,6 +22,8 @@ limitations under the License. #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/IR/TypeUtilities.h" // from @llvm-project #include "tensorflow/compiler/xla/literal.h" +#include "tensorflow/core/lib/bfloat16/bfloat16.h" +#include "tensorflow/core/platform/logging.h" namespace xla { namespace { @@ -41,6 +43,31 @@ template type, llvm::makeArrayRef(data_span.data(), data_span.size())); } +mlir::APFloat ConvertToAPFloat(bfloat16 val) { + // bfloat16 values are stored as double in MLIR. + return llvm::APFloat(static_cast(val)); +} + +mlir::APFloat ConvertToAPFloat(half val) { + llvm::APFloat single_val = llvm::APFloat(static_cast(val)); + bool loses_info = false; + CHECK_EQ(single_val.convert(llvm::APFloat::IEEEhalf(), + llvm::APFloat::rmTowardZero, &loses_info), + llvm::APFloat::opOK); + CHECK(!loses_info); + return single_val; +} + +template +::mlir::DenseElementsAttr CreateDenseAttrFrom16BitFloat( + const ShapedType& type, const LiteralBase& literal) { + auto data_span = literal.data(); + llvm::SmallVector vals; + vals.reserve(data_span.size()); + for (CppType val : data_span) vals.push_back(ConvertToAPFloat(val)); + return ::mlir::DenseElementsAttr::get(type, vals); +} + StatusOr> GetPermutationIfAvailable( const Shape& shape, mlir::Builder builder) { if (!shape.has_layout() || @@ -83,12 +110,15 @@ StatusOr CreateDenseElementsAttrFromLiteral( ConvertTensorShapeToType( literal.shape(), builder)); + // TODO(hinsu): Support remaining XLA primitive types. 
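The 16-bit float handling added above relies on the fact that every bfloat16 (and fp16) value can be widened to float or double exactly, which is why the bfloat16 path simply casts to double and the fp16 path can CHECK that narrowing back to IEEEhalf loses no information. A self-contained illustration of that property for bfloat16, which is just the upper 16 bits of an IEEE float32 (plain C++, illustrative only, not part of the patch):

// Illustrative only: bfloat16 is the upper 16 bits of a float32, so decoding
// (widening) is exact; it is the narrowing direction that rounds or truncates.
#include <cassert>
#include <cstdint>
#include <cstring>
#include <iostream>

static uint16_t FloatToBfloat16Truncate(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  return static_cast<uint16_t>(bits >> 16);  // keep sign, exponent, top mantissa bits
}

static float Bfloat16ToFloat(uint16_t b) {
  uint32_t bits = static_cast<uint32_t>(b) << 16;
  float f;
  std::memcpy(&f, &bits, sizeof(f));
  return f;
}

int main() {
  float original = 3.140625f;  // exactly representable in bfloat16
  uint16_t b = FloatToBfloat16Truncate(original);
  float widened = Bfloat16ToFloat(b);
  // Widening a bfloat16 back to float (or double) reproduces it exactly.
  assert(widened == original);
  std::cout << widened << "\n";
  return 0;
}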
auto element_type = literal.shape().element_type(); switch (element_type) { case PrimitiveType::PRED: return CreateDenseAttrFromLiteral(type, literal); case PrimitiveType::F16: - return CreateDenseAttrFromLiteral(type, literal); + return CreateDenseAttrFrom16BitFloat(type, literal); + case PrimitiveType::BF16: + return CreateDenseAttrFrom16BitFloat(type, literal); case PrimitiveType::F32: return CreateDenseAttrFromLiteral(type, literal); case PrimitiveType::F64: @@ -101,6 +131,18 @@ StatusOr CreateDenseElementsAttrFromLiteral( return CreateDenseAttrFromLiteral(type, literal); case PrimitiveType::S64: return CreateDenseAttrFromLiteral(type, literal); + case PrimitiveType::U8: + return CreateDenseAttrFromLiteral(type, literal); + case PrimitiveType::U16: + return CreateDenseAttrFromLiteral(type, literal); + case PrimitiveType::U32: + return CreateDenseAttrFromLiteral(type, literal); + case PrimitiveType::U64: + return CreateDenseAttrFromLiteral(type, literal); + case PrimitiveType::C64: + return CreateDenseAttrFromLiteral(type, literal); + case PrimitiveType::C128: + return CreateDenseAttrFromLiteral(type, literal); default: return tensorflow::errors::Internal( absl::StrCat("Unsupported type: ", PrimitiveType_Name(element_type))); @@ -137,6 +179,14 @@ StatusOr ConvertPrimitiveTypeToMLIRType(PrimitiveType element_type, return builder.getIntegerType(32); case PrimitiveType::S64: return builder.getIntegerType(64); + case PrimitiveType::U8: + return builder.getIntegerType(8, /*isSigned=*/false); + case PrimitiveType::U16: + return builder.getIntegerType(16, /*isSigned=*/false); + case PrimitiveType::U32: + return builder.getIntegerType(32, /*isSigned=*/false); + case PrimitiveType::U64: + return builder.getIntegerType(64, /*isSigned=*/false); case PrimitiveType::C64: return mlir::ComplexType::get(builder.getF32Type()); case PrimitiveType::C128: diff --git a/tensorflow/compiler/mlir/xla/ir/broadcast_utils.cc b/tensorflow/compiler/mlir/xla/ir/broadcast_utils.cc new file mode 100644 index 00000000000..2f77b7da114 --- /dev/null +++ b/tensorflow/compiler/mlir/xla/ir/broadcast_utils.cc @@ -0,0 +1,74 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/mlir/xla/ir/broadcast_utils.h" + +#include + +#include "llvm/ADT/Sequence.h" +#include "llvm/ADT/SmallVector.h" +#include "mlir/Dialect/Shape/IR/Shape.h" // from @llvm-project +#include "mlir/IR/Diagnostics.h" // from @llvm-project +#include "mlir/IR/StandardTypes.h" // from @llvm-project + +namespace mlir { +namespace xla { + +bool IsLegalNumpyRankedBroadcast(Value lhs, Value rhs, + DenseIntElementsAttr broadcast_dims) { + RankedTensorType lhs_type = lhs.getType().dyn_cast(); + RankedTensorType rhs_type = rhs.getType().dyn_cast(); + if (!lhs_type || !rhs_type) return false; + if (lhs_type.getRank() == rhs_type.getRank()) return true; + + // Otherwise, verify that broadcast_dims strictly performs left-padding. + auto smaller_rank = std::min(lhs_type.getRank(), rhs_type.getRank()); + auto larger_rank = std::max(lhs_type.getRank(), rhs_type.getRank()); + + if (smaller_rank != broadcast_dims.getNumElements()) { + return false; + } + auto expected_extents = + llvm::seq(larger_rank - smaller_rank, larger_rank); + return std::equal(expected_extents.begin(), expected_extents.end(), + broadcast_dims.getIntValues().begin()); +} + +Value ComputeBinaryElementwiseBroadcastingResultExtents(Location loc, Value lhs, + Value rhs, + OpBuilder& builder) { + auto lhs_type = lhs.getType().dyn_cast(); + auto rhs_type = rhs.getType().dyn_cast(); + if (!lhs_type || !rhs_type) { + emitError(loc) << "shape computation for broadcasting elementwise ops " + << "is only implemented for ranked tensors"; + return nullptr; + } + + int64_t result_rank = std::max(lhs_type.getRank(), rhs_type.getRank()); + auto shape_type = shape::ShapeType::get(builder.getContext()); + Value lhs_shape_v = + builder.createOrFold(loc, shape_type, lhs); + Value rhs_shape_v = + builder.createOrFold(loc, shape_type, rhs); + Value result_shape_v = builder.createOrFold( + loc, shape_type, lhs_shape_v, rhs_shape_v, nullptr /* error */); + return builder.createOrFold( + loc, RankedTensorType::get({result_rank}, builder.getIndexType()), + result_shape_v); +} + +} // namespace xla +} // namespace mlir diff --git a/tensorflow/compiler/mlir/xla/ir/broadcast_utils.h b/tensorflow/compiler/mlir/xla/ir/broadcast_utils.h new file mode 100644 index 00000000000..7c5b5e3311c --- /dev/null +++ b/tensorflow/compiler/mlir/xla/ir/broadcast_utils.h @@ -0,0 +1,49 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_XLA_IR_BROADCAST_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_XLA_IR_BROADCAST_UTILS_H_ + +// Utilities relating to implementing HLO broadcasting. +// Note: This file should not depend on any non-MLIR TensorFlow libraries. 
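The two utilities implemented above, IsLegalNumpyRankedBroadcast and ComputeBinaryElementwiseBroadcastingResultExtents, encode the "numpy" convention: when ranks differ, broadcast_dims must map the lower-ranked operand onto the trailing dimensions of the higher-ranked one, and each result extent is the larger of the two aligned extents (size-1 dimensions stretch). The second utility emits shape-dialect ops so this happens at runtime; the standalone sketch below (plain C++, hypothetical names, not part of the patch) shows the same two rules computed directly on static shapes:

// Illustrative only: prefix-padding legality check and broadcasted result
// extents for two ranked shapes, mirroring the semantics described above.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

using Shape = std::vector<int64_t>;

// broadcast_dims maps dimensions of the lower-ranked shape into the
// higher-ranked one; "numpy" style requires it to be the trailing range.
static bool IsPrefixPaddedBroadcast(const Shape& lhs, const Shape& rhs,
                                    const std::vector<int64_t>& broadcast_dims) {
  int64_t smaller = std::min(lhs.size(), rhs.size());
  int64_t larger = std::max(lhs.size(), rhs.size());
  if (static_cast<int64_t>(broadcast_dims.size()) != smaller) return false;
  for (int64_t i = 0; i < smaller; ++i)
    if (broadcast_dims[i] != larger - smaller + i) return false;
  return true;
}

// Result extents under numpy broadcasting; assumes the shapes are compatible
// (equal, or one of them 1, in every aligned dimension).
static Shape BroadcastedExtents(Shape lhs, Shape rhs) {
  if (lhs.size() < rhs.size()) std::swap(lhs, rhs);
  Shape result = lhs;
  size_t offset = lhs.size() - rhs.size();
  for (size_t i = 0; i < rhs.size(); ++i)
    result[offset + i] = std::max(lhs[offset + i], rhs[i]);
  return result;
}

int main() {
  Shape a = {4, 1, 8}, b = {8};
  std::cout << IsPrefixPaddedBroadcast(a, b, {2}) << "\n";      // 1 (legal)
  for (int64_t e : BroadcastedExtents(a, b)) std::cout << e << ' ';  // 4 1 8
  std::cout << "\n";
  return 0;
}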
+ +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "mlir/Interfaces/InferTypeOpInterface.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project + +namespace mlir { +namespace xla { + +// Checks whether the given operand types and broadcast_dims attr represent a +// legal combination for "numpy" style broadcasting (where 1-dims are prepended +// to the smaller ranked operand until it is of the same rank as the larger). +// See: https://docs.scipy.org/doc/numpy/reference/ufuncs.html +bool IsLegalNumpyRankedBroadcast(Value lhs, Value rhs, + DenseIntElementsAttr broadcast_dims); + +// Emits shape dialect ops to compute the result shape for a broadcasting +// binary elementwise op which broadcasts according to "numpy" semantics +// (see above), returning an extents tensor of the resulting shape. +Value ComputeBinaryElementwiseBroadcastingResultExtents(Location loc, Value lhs, + Value rhs, + OpBuilder& builder); + +} // namespace xla +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_XLA_IR_BROADCAST_UTILS_H_ diff --git a/tensorflow/compiler/mlir/xla/ir/chlo_ops.cc b/tensorflow/compiler/mlir/xla/ir/chlo_ops.cc new file mode 100644 index 00000000000..26db4549a2a --- /dev/null +++ b/tensorflow/compiler/mlir/xla/ir/chlo_ops.cc @@ -0,0 +1,278 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/xla/ir/chlo_ops.h" + +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/Diagnostics.h" // from @llvm-project +#include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "tensorflow/compiler/mlir/xla/ir/broadcast_utils.h" + +namespace mlir { +namespace xla_chlo { + +template +static LogicalResult Verify(T op) { + return success(); +} + +//===----------------------------------------------------------------------===// +// BinaryOps +//===----------------------------------------------------------------------===// + +namespace { +// Gets the resulting type from a broadcast between two types. 
+static Type GetBroadcastType(Type x, Type y, Type element_type, + DenseIntElementsAttr broadcast_dimensions_attr) { + auto x_ranked = x.dyn_cast(); + auto y_ranked = y.dyn_cast(); + if (!x_ranked || !y_ranked) { + return UnrankedTensorType::get(element_type); + } + + auto shape_x = x_ranked.getShape(); + auto shape_y = y_ranked.getShape(); + + if (shape_x.size() == shape_y.size()) { + llvm::SmallVector out_shape(shape_x.size()); + for (int i = 0; i < shape_x.size(); i++) { + auto x_val = shape_x[i]; + auto y_val = shape_y[i]; + if (x_val == -1 || y_val == -1) { + out_shape[i] = -1; + } else { + out_shape[i] = std::max(x_val, y_val); + } + } + return RankedTensorType::get(out_shape, element_type); + } + + auto shape_large = shape_x.size() > shape_y.size() ? shape_x : shape_y; + auto shape_small = shape_x.size() <= shape_y.size() ? shape_x : shape_y; + + llvm::SmallVector broadcast_dimensions; + if (broadcast_dimensions_attr) { + // Explicit broadcast dimensions. + for (const APInt& int_value : broadcast_dimensions_attr.getIntValues()) { + broadcast_dimensions.push_back(int_value.getSExtValue()); + } + if (broadcast_dimensions.size() != shape_small.size()) { + // Signal illegal broadcast_dimensions as unranked. + return UnrankedTensorType::get(element_type); + } + } else { + // If no broadcast dimensions, assume "numpy" broadcasting. + broadcast_dimensions = llvm::to_vector<4>(llvm::seq( + shape_large.size() - shape_small.size(), shape_large.size())); + } + + llvm::SmallVector out_shape(shape_large.begin(), + shape_large.end()); + + // Update according to the broadcast dimensions. + for (auto index_pair : llvm::enumerate(broadcast_dimensions)) { + auto old_value = out_shape[index_pair.value()]; + auto new_value = shape_small[index_pair.index()]; + if (old_value != -1 && (new_value == -1 || new_value > old_value)) { + out_shape[index_pair.value()] = new_value; + } + } + + return RankedTensorType::get(out_shape, element_type); +} + +LogicalResult InferBroadcastBinaryOpReturnTypeComponents( + MLIRContext* context, Optional location, ValueRange operands, + DictionaryAttr attributes, Type element_type, + SmallVectorImpl& inferedReturnShapes) { + // Find broadcast_dimensions. + DenseIntElementsAttr broadcast_dimensions = + attributes.get("broadcast_dimensions") + .dyn_cast_or_null(); + + ShapedType lhs_type = operands[0].getType().dyn_cast(); + ShapedType rhs_type = operands[1].getType().dyn_cast(); + if (!lhs_type || !rhs_type || + lhs_type.getElementType() != rhs_type.getElementType()) { + return emitOptionalError(location, "mismatched operand types"); + } + if (!element_type) element_type = lhs_type.getElementType(); + Type result_type = + GetBroadcastType(lhs_type, rhs_type, element_type, broadcast_dimensions); + + if (auto ranked_result_type = result_type.dyn_cast()) { + inferedReturnShapes.emplace_back(ranked_result_type.getShape(), + element_type); + return success(); + } + + // TODO(laurenzo): This should be constructing with `element_type` but that + // constructor variant needs to be added upstream. + inferedReturnShapes.emplace_back(/* element_type */); + return success(); +} + +LogicalResult ReifyBroadcastBinaryOpReturnTypeShapes( + OpBuilder& builder, Operation* op, + SmallVectorImpl& reifiedReturnShapes) { + auto loc = op->getLoc(); + auto lhs = op->getOperand(0); + auto rhs = op->getOperand(1); + + // Check for "numpy"-style rank broadcast. 
+ auto broadcast_dimensions = op->getAttr("broadcast_dimensions") + .dyn_cast_or_null(); + if (broadcast_dimensions && + !xla::IsLegalNumpyRankedBroadcast(lhs, rhs, broadcast_dimensions)) { + // Note: It is unclear whether the general specification of explicit + // broadcast_dimensions on binary ops is a feature we want to carry + // forward. While it can technically be implemented for ranked-dynamic, + // it is incompatible with unranked inputs. If this warning is emitted + // in real programs, it is an indication that the feature should be + // implemented versus just falling back on the more standard definition + // of numpy-like prefix-padding. + return op->emitWarning() + << "unsupported non prefix-padded dynamic rank " + << "broadcast_dimensions = " << broadcast_dimensions; + } + + Value computed_shape = xla::ComputeBinaryElementwiseBroadcastingResultExtents( + loc, lhs, rhs, builder); + if (!computed_shape) return failure(); + reifiedReturnShapes.push_back(computed_shape); + return success(); +} +} // namespace + +//===----------------------------------------------------------------------===// +// BroadcastComplexOp (has custom type inference due to different result type). +//===----------------------------------------------------------------------===// + +LogicalResult BroadcastComplexOp::inferReturnTypeComponents( + MLIRContext* context, Optional location, ValueRange operands, + DictionaryAttr attributes, RegionRange regions, + SmallVectorImpl& inferedReturnShapes) { + ShapedType lhs_type = operands[0].getType().dyn_cast(); + if (!lhs_type) { + return emitOptionalError(location, "expected ShapedType"); + } + Type element_type = ComplexType::get(lhs_type.getElementType()); + return InferBroadcastBinaryOpReturnTypeComponents(context, location, operands, + attributes, element_type, + inferedReturnShapes); +} +LogicalResult BroadcastComplexOp::reifyReturnTypeShapes( + OpBuilder& builder, SmallVectorImpl& reifiedReturnShapes) { + return ReifyBroadcastBinaryOpReturnTypeShapes(builder, getOperation(), + reifiedReturnShapes); +} + +//===----------------------------------------------------------------------===// +// BroadcastCompareOp (has custom type inference due to different result type). +//===----------------------------------------------------------------------===// + +void BroadcastCompareOp::build(OpBuilder& builder, OperationState& result, + Value lhs, Value rhs, + DenseIntElementsAttr broadcast_dimensions, + StringAttr comparison_direction) { + auto new_type = GetBroadcastType(lhs.getType(), rhs.getType(), + builder.getI1Type(), broadcast_dimensions); + build(builder, result, new_type, lhs, rhs, broadcast_dimensions, + comparison_direction); +} + +LogicalResult BroadcastCompareOp::inferReturnTypeComponents( + MLIRContext* context, Optional location, ValueRange operands, + DictionaryAttr attributes, RegionRange regions, + SmallVectorImpl& inferedReturnShapes) { + Type element_type = IntegerType::get(1, context); + return InferBroadcastBinaryOpReturnTypeComponents(context, location, operands, + attributes, element_type, + inferedReturnShapes); +} +LogicalResult BroadcastCompareOp::reifyReturnTypeShapes( + OpBuilder& builder, SmallVectorImpl& reifiedReturnShapes) { + return ReifyBroadcastBinaryOpReturnTypeShapes(builder, getOperation(), + reifiedReturnShapes); +} + +//===----------------------------------------------------------------------===// +// Macros for method definitions that are common to most broadcasting ops. 
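The GetBroadcastType helper above generalizes the numpy rule: when an explicit broadcast_dimensions attribute is present, each dimension of the lower-ranked operand is mapped to a chosen dimension of the higher-ranked one, and dynamic extents (-1) propagate. A standalone sketch of that mapping and of its update rule (plain C++, hypothetical names, not the dialect's actual API):

// Illustrative only: result extents when the smaller operand's dimensions are
// mapped into the larger one via explicit broadcast dimensions; -1 is dynamic.
#include <cstdint>
#include <iostream>
#include <vector>

using Shape = std::vector<int64_t>;

static Shape BroadcastWithExplicitDims(const Shape& large, const Shape& small,
                                       const std::vector<int64_t>& broadcast_dims) {
  Shape out = large;
  for (size_t i = 0; i < broadcast_dims.size(); ++i) {
    int64_t target = broadcast_dims[i];
    int64_t old_value = out[target];
    int64_t new_value = small[i];
    // An extent that is already dynamic stays dynamic; otherwise the mapped
    // extent wins when it is dynamic or larger (same rule as GetBroadcastType).
    if (old_value != -1 && (new_value == -1 || new_value > old_value)) {
      out[target] = new_value;
    }
  }
  return out;
}

int main() {
  // Map a rank-2 operand onto dimensions 0 and 2 of a rank-3 operand.
  Shape large = {1, 5, -1}, small = {4, 8};
  for (int64_t e : BroadcastWithExplicitDims(large, small, {0, 2}))
    std::cout << e << ' ';  // prints: 4 5 -1
  std::cout << "\n";
  return 0;
}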
+//===----------------------------------------------------------------------===// + +#define BROADCAST_INFER_SHAPE_TYPE_OP_DEFS(Op) \ + LogicalResult Op::inferReturnTypeComponents( \ + MLIRContext* context, Optional location, ValueRange operands, \ + DictionaryAttr attributes, RegionRange regions, \ + SmallVectorImpl& inferedReturnShapes) { \ + return InferBroadcastBinaryOpReturnTypeComponents( \ + context, location, operands, attributes, /*element_type=*/nullptr, \ + inferedReturnShapes); \ + } \ + LogicalResult Op::reifyReturnTypeShapes( \ + OpBuilder& builder, SmallVectorImpl& reifiedReturnShapes) { \ + return ReifyBroadcastBinaryOpReturnTypeShapes(builder, getOperation(), \ + reifiedReturnShapes); \ + } + +#define BROADCAST_BINARY_OP_DEFS(Op) \ + void Op::build(OpBuilder& builder, OperationState& result, Value left, \ + Value right, DenseIntElementsAttr broadcast_dimensions) { \ + auto type = GetBroadcastType( \ + left.getType().cast(), right.getType().cast(), \ + getElementTypeOrSelf(right.getType()), broadcast_dimensions); \ + return Op::build(builder, result, type, left, right, \ + broadcast_dimensions); \ + } \ + BROADCAST_INFER_SHAPE_TYPE_OP_DEFS(Op) + +BROADCAST_BINARY_OP_DEFS(BroadcastAddOp); +BROADCAST_BINARY_OP_DEFS(BroadcastAndOp); +BROADCAST_BINARY_OP_DEFS(BroadcastAtan2Op); +BROADCAST_BINARY_OP_DEFS(BroadcastDivOp); +BROADCAST_BINARY_OP_DEFS(BroadcastMaxOp); +BROADCAST_BINARY_OP_DEFS(BroadcastMinOp); +BROADCAST_BINARY_OP_DEFS(BroadcastMulOp); +BROADCAST_BINARY_OP_DEFS(BroadcastOrOp); +BROADCAST_BINARY_OP_DEFS(BroadcastPowOp); +BROADCAST_BINARY_OP_DEFS(BroadcastRemOp); +BROADCAST_BINARY_OP_DEFS(BroadcastShiftLeftOp); +BROADCAST_BINARY_OP_DEFS(BroadcastShiftRightArithmeticOp); +BROADCAST_BINARY_OP_DEFS(BroadcastShiftRightLogicalOp); +BROADCAST_BINARY_OP_DEFS(BroadcastSubOp); +BROADCAST_BINARY_OP_DEFS(BroadcastXorOp); + +#undef BROADCAST_INFER_SHAPE_TYPE_OP_DEFS +#undef BROADCAST_BINARY_OP_DEFS + +#define GET_OP_CLASSES +#include "tensorflow/compiler/mlir/xla/ir/chlo_ops.cc.inc" + +//===----------------------------------------------------------------------===// +// xla_chlo Dialect Constructor +//===----------------------------------------------------------------------===// + +XlaHloClientDialect::XlaHloClientDialect(MLIRContext* context) + : Dialect(getDialectNamespace(), context) { + addOperations< +#define GET_OP_LIST +#include "tensorflow/compiler/mlir/xla/ir/chlo_ops.cc.inc" + >(); +} + +} // namespace xla_chlo +} // namespace mlir diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_client_ops.h b/tensorflow/compiler/mlir/xla/ir/chlo_ops.h similarity index 72% rename from tensorflow/compiler/mlir/xla/ir/hlo_client_ops.h rename to tensorflow/compiler/mlir/xla/ir/chlo_ops.h index 405b1ffb12e..a5337907579 100644 --- a/tensorflow/compiler/mlir/xla/ir/hlo_client_ops.h +++ b/tensorflow/compiler/mlir/xla/ir/chlo_ops.h @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TENSORFLOW_COMPILER_MLIR_XLA_IR_HLO_CLIENT_OPS_H_ -#define TENSORFLOW_COMPILER_MLIR_XLA_IR_HLO_CLIENT_OPS_H_ +#ifndef TENSORFLOW_COMPILER_MLIR_XLA_IR_CHLO_OPS_H_ +#define TENSORFLOW_COMPILER_MLIR_XLA_IR_CHLO_OPS_H_ #include "llvm/ADT/StringRef.h" #include "mlir/IR/Dialect.h" // from @llvm-project @@ -24,21 +24,22 @@ limitations under the License. 
#include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/IR/Types.h" // from @llvm-project -#include "mlir/Interfaces/SideEffects.h" // from @llvm-project +#include "mlir/Interfaces/InferTypeOpInterface.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project namespace mlir { -namespace xla_hlo_client { +namespace xla_chlo { class XlaHloClientDialect : public Dialect { public: explicit XlaHloClientDialect(MLIRContext *context); - static StringRef getDialectNamespace() { return "xla_hlo_client"; } + static StringRef getDialectNamespace() { return "xla_chlo"; } }; #define GET_OP_CLASSES -#include "tensorflow/compiler/mlir/xla/ir/hlo_client_ops.h.inc" +#include "tensorflow/compiler/mlir/xla/ir/chlo_ops.h.inc" -} // namespace xla_hlo_client +} // namespace xla_chlo } // namespace mlir -#endif // TENSORFLOW_COMPILER_MLIR_XLA_IR_HLO_CLIENT_OPS_H_ +#endif // TENSORFLOW_COMPILER_MLIR_XLA_IR_CHLO_OPS_H_ diff --git a/tensorflow/compiler/mlir/xla/ir/chlo_ops.td b/tensorflow/compiler/mlir/xla/ir/chlo_ops.td new file mode 100644 index 00000000000..febc99f6b72 --- /dev/null +++ b/tensorflow/compiler/mlir/xla/ir/chlo_ops.td @@ -0,0 +1,370 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Defines "client" aligned HLO ops. +// These ops are not necessarily orthogonal or optimized for transformation but +// for ease of expression in certain cases deemed important for client +// libraries (i.e. implicit broadcasting, helper ops, etc). +// This dialect is considered to exist in addition to augment the xla_hlo +// dialect for ergonomic needs, not duplicate/replace it. +// +// The typical use of this dialect is for client libraries to be able to emit +// less constrained ops and rely on the conversion framework to lower any +// xla_chlo ops to canonical xla_hlo ops. +// +// See: https://www.tensorflow.org/xla/operation_semantics + +#ifndef CHLO_OPS +#define CHLO_OPS + +include "mlir/IR/OpBase.td" +include "mlir/Interfaces/InferTypeOpInterface.td" +include "mlir/Interfaces/SideEffectInterfaces.td" +include "tensorflow/compiler/mlir/xla/ir/hlo_ops_base.td" + +def HLOClient_Dialect : Dialect { + let name = "xla_chlo"; + let cppNamespace = "xla_chlo"; + let summary = [{ + XLA Client HLO Ops + }]; + + let description = [{ + This dialect contains ops that align closely with the API surface area + of the XlaBuilder C++ API, where such ops have semantics that go beyond + what exists in the lower level dialects (such as `xla_hlo`). Essentially, + whenever the client library uses syntactic sugar or composition + of multiple ops for an API call, this dialect tries to model the API call + and provide conversion patterns to fully materialize into lower level + dialects. 
+ }]; +} + +class HLOClient_Op traits> : + Op { + // TODO(b/129012527) Much of this custom verification should be expressed as + // type constraints. + let verifier = [{ return Verify(*this); }]; +} + +//===----------------------------------------------------------------------===// +// XLA binary elementwise op definitions. +// From the client perspective, each of these support both explicit rank +// broadcasting (via the broadcast_dimensions attribute) and implicit degenerate +// shape broadcasting. +// +// These correspond to operations in the xla_hlo dialect without the +// "broadcast_" prefix, except that those ops require same-shaped operands and +// results. +// +// See: +// https://www.tensorflow.org/xla/operation_semantics#element-wise_binary_arithmetic_operations +// https://www.tensorflow.org/xla/broadcasting +//===----------------------------------------------------------------------===// + +class HLOClient_BroadcastBinaryElementwiseOp< + string mnemonic, list traits> : + HLOClient_Op])> { + let arguments = (ins + HLO_Tensor:$lhs, + HLO_Tensor:$rhs, + // Explicit rank-broadcast dimension mappings. Defaults to "numpy" prefix + // padded rank-broadcast semantics if omitted. + OptionalAttr:$broadcast_dimensions + ); + + let builders = [OpBuilder< + "OpBuilder &builder, OperationState &result, Value left, Value right, " + "DenseIntElementsAttr broadcast_dimensions" + >]; + + let results = (outs HLO_Tensor); + + let assemblyFormat = [{ + $lhs `,` $rhs attr-dict `:` + `(` type($lhs) `,` type($rhs) `)` `->` type(results) + }]; + + let extraClassDeclaration = [{ + // TODO(laurenzo): It isn't clear to me why reifyReturnShapes does not + // have its declaration generated by DeclareOpInterfaceMethods. + LogicalResult reifyReturnTypeShapes( + OpBuilder& builder, SmallVectorImpl& reifiedReturnShapes); + }]; +} + +def HLOClient_BroadcastAddOp : HLOClient_BroadcastBinaryElementwiseOp<"broadcast_add", + [Commutative, NoSideEffect, SameOperandsAndResultElementType]> { + string summary = "Addition operator (with optional broadcasting)"; + + string description = [{ + Returns `lhs + rhs` element-wise. + + See + https://www.tensorflow.org/xla/operation_semantics#element-wise_binary_arithmetic_operations. + }]; +} + +def HLOClient_BroadcastAtan2Op : HLOClient_BroadcastBinaryElementwiseOp< + "broadcast_atan2", + [NoSideEffect, SameOperandsAndResultElementType]> { + string summary = "Atan2 operator (with optional broadcasting)"; + + string description = [{ + Returns `atan2(lhs/rhs)` element-wise. + + See + https://www.tensorflow.org/xla/operation_semantics#element-wise_binary_arithmetic_operations. + }]; +} + +def HLOClient_BroadcastDivOp : HLOClient_BroadcastBinaryElementwiseOp< + "broadcast_divide", + [NoSideEffect, SameOperandsAndResultElementType]> { + string summary = "Division operator (with optional broadcasting)"; + + string description = [{ + Returns `lhs / rhs` element-wise. + + See + https://www.tensorflow.org/xla/operation_semantics#element-wise_binary_arithmetic_operations. + }]; +} + +def HLOClient_BroadcastMaxOp : HLOClient_BroadcastBinaryElementwiseOp< + "broadcast_maximum", + [Commutative, NoSideEffect, SameOperandsAndResultElementType]> { + string summary = "Maximum operator (with optional broadcasting)"; + + string description = [{ + Returns `max(lhs, rhs)` element-wise. + + See + https://www.tensorflow.org/xla/operation_semantics#element-wise_binary_arithmetic_operations. 
+ }]; +} + +def HLOClient_BroadcastMinOp : HLOClient_BroadcastBinaryElementwiseOp< + "broadcast_minimum", + [Commutative, NoSideEffect, SameOperandsAndResultElementType]> { + string summary = "Minimum operator (with optional broadcasting)"; + + string description = [{ + Returns `min(lhs, rhs)` element-wise. + + See + https://www.tensorflow.org/xla/operation_semantics#element-wise_binary_arithmetic_operations. + }]; +} + +def HLOClient_BroadcastMulOp : HLOClient_BroadcastBinaryElementwiseOp< + "broadcast_multiply", + [Commutative, NoSideEffect, SameOperandsAndResultElementType]> { + string summary = "Multiplication operator (with optional broadcasting)"; + + string description = [{ + Returns `lhs * rhs` element-wise. + + See + https://www.tensorflow.org/xla/operation_semantics#element-wise_binary_arithmetic_operations. + }]; +} + +def HLOClient_BroadcastPowOp : HLOClient_BroadcastBinaryElementwiseOp< + "broadcast_power", + [NoSideEffect, SameOperandsAndResultElementType]> { + string summary = "Power operator (with optional broadcasting)"; + + string description = [{ + Returns `lhs ^ rhs` element-wise. + + See + https://www.tensorflow.org/xla/operation_semantics#element-wise_binary_arithmetic_operations. + }]; +} + +def HLOClient_BroadcastRemOp : HLOClient_BroadcastBinaryElementwiseOp< + "broadcast_remainder", + [NoSideEffect, SameOperandsAndResultElementType]> { + string summary = "Remainder operator (with optional broadcasting)"; + + string description = [{ + Returns `lhs % rhs` element-wise. + + See + https://www.tensorflow.org/xla/operation_semantics#element-wise_binary_arithmetic_operations. + }]; +} + +def HLOClient_BroadcastShiftLeftOp : HLOClient_BroadcastBinaryElementwiseOp< + "broadcast_shift_left", + [NoSideEffect, SameOperandsAndResultElementType]> { + string summary = "Shift left operator (with optional broadcasting)"; + + string description = [{ + Returns `lhs << rhs` element-wise. + + See + https://www.tensorflow.org/xla/operation_semantics#element-wise_binary_arithmetic_operations. + }]; +} + +def HLOClient_BroadcastShiftRightArithmeticOp : HLOClient_BroadcastBinaryElementwiseOp< + "broadcast_shift_right_arithmetic", + [NoSideEffect, SameOperandsAndResultElementType]> { + string summary = "Shift right arithmetic operator (with optional broadcasting)"; + + string description = [{ + Returns `lhs >> rhs` element-wise. + + See + https://www.tensorflow.org/xla/operation_semantics#element-wise_binary_arithmetic_operations. + }]; +} + +def HLOClient_BroadcastShiftRightLogicalOp : HLOClient_BroadcastBinaryElementwiseOp< + "broadcast_shift_right_logical", + [NoSideEffect, SameOperandsAndResultElementType]> { + string summary = "Shift right logical operator (with optional broadcasting)"; + + string description = [{ + Returns `lhs >> rhs` element-wise. + + See + https://www.tensorflow.org/xla/operation_semantics#element-wise_binary_arithmetic_operations. + }]; +} + +def HLOClient_BroadcastSubOp : HLOClient_BroadcastBinaryElementwiseOp< + "broadcast_subtract", + [NoSideEffect, SameOperandsAndResultElementType]> { + string summary = "Subtraction operator (with optional broadcasting)"; + + string description = [{ + Returns `lhs - rhs` element-wise. + + See + https://www.tensorflow.org/xla/operation_semantics#element-wise_binary_arithmetic_operations. + }]; +} + +//===----------------------------------------------------------------------===// +// XLA binary elementwise op definitions. +// The same description as the arithmetic binary elementwise ops applies. 
+//===----------------------------------------------------------------------===// + +class HLOClient_BroadcastBinaryLogicalElementwiseOp : + HLOClient_BroadcastBinaryElementwiseOp< + mnemonic, [Commutative, NoSideEffect]> { + let arguments = (ins + HLO_PredOrIntTensor:$lhs, + HLO_PredOrIntTensor:$rhs, + // Explicit rank-broadcast dimension mappings. Defaults to "numpy" prefix + // padded rank-broadcast semantics if omitted. + OptionalAttr:$broadcast_dimensions + ); +} + +def HLOClient_BroadcastAndOp: HLOClient_BroadcastBinaryLogicalElementwiseOp< + "broadcast_and"> { + string summary = "Logical and operator (with optional broadcasting)"; + + string description = [{ + Returns `logical_and(lhs, rhs)` element-wise. + + See + https://www.tensorflow.org/xla/operation_semantics#element-wise_binary_arithmetic_operations. + }]; +} + +def HLOClient_BroadcastOrOp: HLOClient_BroadcastBinaryLogicalElementwiseOp< + "broadcast_or"> { + string summary = "Logical or operator (with optional broadcasting)"; + + string description = [{ + Returns `logical_or(lhs, rhs)` element-wise. + + See + https://www.tensorflow.org/xla/operation_semantics#element-wise_binary_arithmetic_operations. + }]; +} + +def HLOClient_BroadcastXorOp : HLOClient_BroadcastBinaryLogicalElementwiseOp< + "broadcast_xor"> { + string summary = "Logical xor operator (with optional broadcasting)"; + + string description = [{ + Returns `logical_xor(lhs, rhs)` element-wise. + + See + https://www.tensorflow.org/xla/operation_semantics#element-wise_binary_arithmetic_operations. + }]; +} + +//===----------------------------------------------------------------------===// +// Broadcasting complex op +//===----------------------------------------------------------------------===// + +def HLOClient_BroadcastComplexOp : HLOClient_BroadcastBinaryElementwiseOp< + "broadcast_complex", [NoSideEffect]> { + string summary = "Complex operator (with optional broadcasting)"; + + string description = [{ + Performs element-wise conversion of a pair of real and imaginary values to + a complex value. + }]; + + let arguments = (ins + HLO_FpTensor:$lhs, + HLO_FpTensor:$rhs, + // Explicit rank-broadcast dimension mappings. Defaults to "numpy" prefix + // padded rank-broadcast semantics if omitted. + OptionalAttr:$broadcast_dimensions + ); + let results = (outs HLO_ComplexTensor); +} + +//===----------------------------------------------------------------------===// +// Broadcasting compare op +//===----------------------------------------------------------------------===// + +def HLOClient_BroadcastCompareOp : HLOClient_BroadcastBinaryElementwiseOp< + "broadcast_compare", [NoSideEffect]> { + string summary = "Compare operator (with optional broadcasting)"; + + string description = [{ + Compares `lhs` and `rhs` elementwise according to `comparison_direction`. + + See + https://www.tensorflow.org/xla/operation_semantics#element-wise_comparison_operations. 
+ }]; + + let arguments = (ins + HLO_Tensor:$lhs, + HLO_Tensor:$rhs, + OptionalAttr:$broadcast_dimensions, + HLO_ComparisonDirectionAttr:$comparison_direction + ); + let results = (outs HLO_PredTensor); + + let builders = [OpBuilder< + "OpBuilder &builder, OperationState &result, Value lhs, Value rhs, " + "DenseIntElementsAttr broadcast_dimensions, StringAttr comparison_direction" + >]; +} + +#endif // CHLO_OPS diff --git a/tensorflow/compiler/mlir/xla/ir/dialect_registration.cc b/tensorflow/compiler/mlir/xla/ir/dialect_registration.cc index bafbc1ac9a9..2d1bc8d4359 100644 --- a/tensorflow/compiler/mlir/xla/ir/dialect_registration.cc +++ b/tensorflow/compiler/mlir/xla/ir/dialect_registration.cc @@ -13,12 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/compiler/mlir/xla/ir/hlo_client_ops.h" +#include "tensorflow/compiler/mlir/xla/ir/chlo_ops.h" #include "tensorflow/compiler/mlir/xla/ir/hlo_ops.h" #include "tensorflow/compiler/mlir/xla/ir/lhlo_ops.h" // Static initialization for XLA dialect registration. static mlir::DialectRegistration xla_hlo_ops; -static mlir::DialectRegistration - xla_hlo_client_ops; +static mlir::DialectRegistration + xla_chlo_ops; static mlir::DialectRegistration xla_lhlo_ops; diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_client_ops.cc b/tensorflow/compiler/mlir/xla/ir/hlo_client_ops.cc deleted file mode 100644 index 921c4f069ec..00000000000 --- a/tensorflow/compiler/mlir/xla/ir/hlo_client_ops.cc +++ /dev/null @@ -1,127 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/compiler/mlir/xla/ir/hlo_client_ops.h" - -#include "mlir/IR/TypeUtilities.h" // from @llvm-project - -namespace mlir { -namespace xla_hlo_client { - -template -static LogicalResult Verify(T op) { - return success(); -} - -//===----------------------------------------------------------------------===// -// BinaryOps -//===----------------------------------------------------------------------===// - -namespace { -// Gets the resulting type from a broadcast between two types. 
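The DialectRegistration globals in dialect_registration.cc above rely on the usual C++ static-initialization idiom: constructing a namespace-scope object performs the registration before main() runs, so nothing has to call a register function explicitly. A generic, self-contained illustration of that pattern (plain C++ with a hypothetical registry; this is not MLIR's actual mechanism):

// Illustrative only: objects with static storage duration run their
// constructors at program start-up, which is how the dialect registrations
// above take effect without any explicit call from main().
#include <iostream>
#include <string>
#include <vector>

static std::vector<std::string>& Registry() {
  static std::vector<std::string> names;  // constructed on first use
  return names;
}

struct RegisterDialect {
  explicit RegisterDialect(std::string name) {
    Registry().push_back(std::move(name));
  }
};

// Analogous to the static mlir::DialectRegistration objects above.
static RegisterDialect xla_hlo_demo("xla_hlo");
static RegisterDialect xla_chlo_demo("xla_chlo");

int main() {
  for (const auto& name : Registry()) std::cout << name << "\n";
  return 0;
}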
-static Type GetBroadcastType(Builder* builder, Type x, Type y, - Type element_type, - DenseIntElementsAttr broadcast_dimensions) { - auto x_ranked = x.dyn_cast(); - auto y_ranked = y.dyn_cast(); - if (!x_ranked || !y_ranked) { - return UnrankedTensorType::get(element_type); - } - - auto shape_x = x_ranked.getShape(); - auto shape_y = y_ranked.getShape(); - - if (shape_x.size() == shape_y.size()) { - llvm::SmallVector out_shape(shape_x.size()); - for (int i = 0; i < shape_x.size(); i++) { - auto x_val = shape_x[i]; - auto y_val = shape_y[i]; - if (x_val == -1 || y_val == -1) { - out_shape[i] = -1; - } else { - out_shape[i] = std::max(x_val, y_val); - } - } - return RankedTensorType::get(out_shape, element_type); - } - - // Return unranked tensor for invalid broadcast dimensions. - if (!broadcast_dimensions) return UnrankedTensorType::get(element_type); - - auto shape_large = shape_x.size() > shape_y.size() ? shape_x : shape_y; - auto shape_small = shape_x.size() <= shape_y.size() ? shape_x : shape_y; - - llvm::SmallVector out_shape(shape_large.begin(), - shape_large.end()); - - // Update according to the broadcast dimensions. - for (auto index_pair : llvm::enumerate(broadcast_dimensions.getIntValues())) { - auto old_value = out_shape[index_pair.value().getSExtValue()]; - auto new_value = shape_small[index_pair.index()]; - if (old_value != -1 && (new_value == -1 || new_value > old_value)) { - out_shape[index_pair.value().getSExtValue()] = new_value; - } - } - - return RankedTensorType::get(out_shape, element_type); -} -} // namespace - -#define BINARY_BUILDER(Op) \ - void Op::build(Builder* builder, OperationState& result, Value left, \ - Value right, DenseIntElementsAttr broadcast_dimensions) { \ - auto type = GetBroadcastType(builder, left.getType().cast(), \ - right.getType().cast(), \ - getElementTypeOrSelf(right.getType()), \ - broadcast_dimensions); \ - return Op::build(builder, result, type, left, right, \ - broadcast_dimensions); \ - } - -BINARY_BUILDER(AddOp); -BINARY_BUILDER(AndOp); -BINARY_BUILDER(Atan2Op); -BINARY_BUILDER(DivOp); -BINARY_BUILDER(MaxOp); -BINARY_BUILDER(MinOp); -BINARY_BUILDER(MulOp); -BINARY_BUILDER(OrOp); -BINARY_BUILDER(PowOp); -BINARY_BUILDER(RemOp); -BINARY_BUILDER(ShiftLeftOp); -BINARY_BUILDER(ShiftRightArithmeticOp); -BINARY_BUILDER(ShiftRightLogicalOp); -BINARY_BUILDER(SubOp); -BINARY_BUILDER(XorOp); - -#undef BINARY_BUILDER - -#define GET_OP_CLASSES -#include "tensorflow/compiler/mlir/xla/ir/hlo_client_ops.cc.inc" - -//===----------------------------------------------------------------------===// -// xla_hlo_client Dialect Constructor -//===----------------------------------------------------------------------===// - -XlaHloClientDialect::XlaHloClientDialect(MLIRContext* context) - : Dialect(getDialectNamespace(), context) { - addOperations< -#define GET_OP_LIST -#include "tensorflow/compiler/mlir/xla/ir/hlo_client_ops.cc.inc" - >(); -} - -} // namespace xla_hlo_client -} // namespace mlir diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_client_ops.td b/tensorflow/compiler/mlir/xla/ir/hlo_client_ops.td deleted file mode 100644 index 48b765f2299..00000000000 --- a/tensorflow/compiler/mlir/xla/ir/hlo_client_ops.td +++ /dev/null @@ -1,134 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -// Defines "client" aligned HLO ops. -// These ops are not necessarily orthogonal or optimized for transformation but -// for ease of expression in certain cases deemed important for client -// libraries (i.e. implicit broadcasting, helper ops, etc). -// This dialect is considered to exist in addition to augment the xla_hlo -// dialect for ergonomic needs, not duplicate/replace it. -// -// The typical use of this dialect is for client libraries to be able to emit -// less constrained ops and rely on the conversion framework to lower any -// xla_hlo_client ops to canonical xla_hlo ops. -// -// See: https://www.tensorflow.org/xla/operation_semantics - -#ifndef HLO_CLIENT_OPS -#define HLO_CLIENT_OPS - -include "mlir/IR/OpBase.td" -include "mlir/Interfaces/SideEffects.td" -include "tensorflow/compiler/mlir/xla/ir/hlo_ops_base.td" - -def HLOClient_Dialect : Dialect { - let name = "xla_hlo_client"; - let cppNamespace = "xla_hlo_client"; -} - -class HLOClient_Op traits> : - Op { - // TODO(b/129012527) Much of this custom verification should be expressed as - // type constraints. - let verifier = [{ return Verify(*this); }]; -} - -//===----------------------------------------------------------------------===// -// XLA binary elementwise op definitions. -// From the client perspective, each of these support both explicit rank -// broadcasting (via the broadcast_dimensions attribute) and implicit degenerate -// shape broadcasting. -// -// These have 1:1 correspondence with same-named ops in the xla_hlo dialect; -// however, those operations do not support broadcasting. 
-// -// See: -// https://www.tensorflow.org/xla/operation_semantics#element-wise_binary_arithmetic_operations -// https://www.tensorflow.org/xla/broadcasting -//===----------------------------------------------------------------------===// - -class HLOClient_BinaryElementwiseOp traits> : - HLOClient_Op { - let arguments = (ins - HLO_Tensor:$lhs, - HLO_Tensor:$rhs, - OptionalAttr:$broadcast_dimensions - ); - - let builders = [OpBuilder< - "Builder *builder, OperationState &result, Value left, Value right, " - "DenseIntElementsAttr broadcast_dimensions" - >]; - - let results = (outs HLO_Tensor); - let parser = [{ return mlir::impl::parseOneResultSameOperandTypeOp(parser, result); }]; - let printer = [{ return mlir::impl::printOneResultOp(getOperation(), p); }]; -} - -def HLOClient_AddOp : HLOClient_BinaryElementwiseOp<"add", - [Commutative, NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_AddOp; - -def HLOClient_Atan2Op : HLOClient_BinaryElementwiseOp<"atan2", - [NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_Atan2Op; - -def HLOClient_DivOp : HLOClient_BinaryElementwiseOp<"divide", - [NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_DivOp; - -def HLOClient_MaxOp : HLOClient_BinaryElementwiseOp<"maximum", - [Commutative, NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_MaxOp; - -def HLOClient_MinOp : HLOClient_BinaryElementwiseOp<"minimum", - [Commutative, NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_MinOp; - -def HLOClient_MulOp : HLOClient_BinaryElementwiseOp<"multiply", - [Commutative, NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_MulOp; - -def HLOClient_PowOp : HLOClient_BinaryElementwiseOp<"pow", - [NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_PowOp; - -def HLOClient_RemOp : HLOClient_BinaryElementwiseOp<"remainder", - [NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_RemOp; - -def HLOClient_ShiftLeftOp : HLOClient_BinaryElementwiseOp<"shift_left", - [NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_ShiftLeftOp; - -def HLOClient_ShiftRightArithmeticOp : HLOClient_BinaryElementwiseOp<"shift_right_arithmetic", - [NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_ShiftRightArithmeticOp; - -def HLOClient_ShiftRightLogicalOp : HLOClient_BinaryElementwiseOp<"shift_right_logical", - [NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_ShiftRightLogicalOp; - -def HLOClient_SubOp : HLOClient_BinaryElementwiseOp<"subtract", - [NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_SubOp; - -//===----------------------------------------------------------------------===// -// XLA binary elementwise op definitions. -// The same description as the arithmetic binary elementwise ops applies. 
-//===----------------------------------------------------------------------===// - -class HLOClient_BinaryLogicalElementwiseOp : - HLOClient_BinaryElementwiseOp { - let arguments = (ins - HLO_PredOrIntTensor:$lhs, - HLO_PredOrIntTensor:$rhs, - OptionalAttr:$broadcast_dimensions - ); -} - -def HLOClient_AndOp: HLOClient_BinaryLogicalElementwiseOp<"and">, BASE_HLO_AndOp; -def HLOClient_OrOp: HLOClient_BinaryLogicalElementwiseOp<"or">, BASE_HLO_OrOp; -def HLOClient_XorOp : HLOClient_BinaryLogicalElementwiseOp<"xor">, BASE_HLO_XorOp; - -#endif // HLO_CLIENT_OPS diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc b/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc index a60ebd76d0e..03928467cff 100644 --- a/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc +++ b/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc @@ -22,6 +22,7 @@ limitations under the License. #include #include +#include #include "absl/container/flat_hash_set.h" #include "llvm/ADT/APFloat.h" @@ -30,6 +31,7 @@ limitations under the License. #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" +#include "llvm/ADT/iterator_range.h" #include "llvm/Support/Casting.h" #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/MathExtras.h" @@ -159,23 +161,15 @@ DenseIntElementsAttr BuildConvPaddingAttrs( //===----------------------------------------------------------------------===// static void Print(ConstOp op, OpAsmPrinter* printer) { - // Use short form only if the result type matches type of attribute 'value'. - bool use_short_form = op.value().getType() == op.getType(); - // Print op name. *printer << op.getOperationName(); - // If short form, elide attribute value while printing the attribute - // dictionary. + // Elide attribute value while printing the attribute dictionary. SmallVector elided_attrs; - if (use_short_form) elided_attrs.push_back("value"); + elided_attrs.push_back("value"); printer->printOptionalAttrDict(op.getAttrs(), elided_attrs); - if (use_short_form) { - *printer << ' ' << op.value(); - } else { - *printer << " : " << op.getType(); - } + *printer << ' ' << op.value(); } static ParseResult ParseConstOp(OpAsmParser* parser, OperationState* result) { @@ -205,7 +199,8 @@ OpFoldResult ConstOp::fold(ArrayRef operands) { } // Builds a constant op with the specified attribute `value`. 
-void ConstOp::build(Builder* builder, OperationState& result, Attribute value) { +void ConstOp::build(OpBuilder& builder, OperationState& result, + Attribute value) { Type type; if (auto elemAttr = value.dyn_cast()) { type = elemAttr.getType(); @@ -271,7 +266,7 @@ static LogicalResult Verify(IotaOp op) { // AbsOp //===----------------------------------------------------------------------===// -void AbsOp::build(Builder* builder, OperationState& result, Value operand) { +void AbsOp::build(OpBuilder& builder, OperationState& result, Value operand) { auto shaped_type = operand.getType().cast(); Type new_type; if (!shaped_type.getElementType().isa()) { @@ -322,7 +317,7 @@ static LogicalResult Verify(CollectivePermuteOp op) { // ConvertOp //===----------------------------------------------------------------------===// -void ConvertOp::build(Builder* builder, OperationState& result, Value operand, +void ConvertOp::build(OpBuilder& builder, OperationState& result, Value operand, Type result_element_ty) { Type result_ty; Type operand_ty = operand.getType(); @@ -337,6 +332,10 @@ void ConvertOp::build(Builder* builder, OperationState& result, Value operand, OpFoldResult ConvertOp::fold(ArrayRef operands) { if (getOperand().getType() == getResult().getType()) return getOperand(); + // If the result has non-static shape, a convert op is necessary to go from + // static shape to non-static shape. + if (!getResult().getType().cast().hasStaticShape()) return {}; + // If the operand is constant, we can do the conversion now. if (auto elementsAttr = operands.front().dyn_cast_or_null()) { return xla::ConvertElementsAttr(elementsAttr, @@ -555,6 +554,19 @@ static LogicalResult Verify(BroadcastInDimOp op) { return success(); } +OpFoldResult BroadcastInDimOp::fold(ArrayRef) { + auto type = getType().cast(); + if (type != getOperand().getType()) { + return nullptr; + } + auto broadcast_values = broadcast_dimensions().getValues(); + if (!std::equal(broadcast_values.begin(), broadcast_values.end(), + llvm::seq(0, type.getRank()).begin())) { + return nullptr; + } + return getOperand(); +} + //===----------------------------------------------------------------------===// // ScalarsToDimensionTensorOp //===----------------------------------------------------------------------===// @@ -725,7 +737,7 @@ static LogicalResult Verify(ClampOp op) { // ComplexOp //===----------------------------------------------------------------------===// -void ComplexOp::build(Builder* builder, OperationState& state, Value lhs, +void ComplexOp::build(OpBuilder& builder, OperationState& state, Value lhs, Value rhs) { auto type = lhs.getType(); auto element_ty = ComplexType::get(getElementTypeOrSelf(type)); @@ -770,7 +782,7 @@ Type CreateRealType(Type type) { } } // namespace -void ImagOp::build(Builder* builder, OperationState& state, Value val) { +void ImagOp::build(OpBuilder& builder, OperationState& state, Value val) { build(builder, state, CreateRealType(val.getType()), val); } @@ -783,7 +795,7 @@ OpFoldResult ImagOp::fold(ArrayRef operands) { return {}; } -void RealOp::build(Builder* builder, OperationState& state, Value val) { +void RealOp::build(OpBuilder& builder, OperationState& state, Value val) { build(builder, state, CreateRealType(val.getType()), val); } @@ -800,9 +812,102 @@ OpFoldResult RealOp::fold(ArrayRef operands) { // ConcatenateOp //===----------------------------------------------------------------------===// +namespace { +class ConcatenateOperandRemoval : public OpRewritePattern { + public: + using 
OpRewritePattern::OpRewritePattern; + LogicalResult matchAndRewrite(ConcatenateOp op, + PatternRewriter& rewriter) const override { + auto axis = op.dimension().getLimitedValue(); + llvm::SmallVector new_operands; + for (auto operand : op.getOperands()) { + auto ty = operand.getType().cast(); + if (ty.getDimSize(axis) != 0) { + new_operands.push_back(operand); + } + } + + if (!new_operands.empty() && new_operands.size() < op.getNumOperands()) { + rewriter.replaceOpWithNewOp(op, op.getResult().getType(), + new_operands, op.dimension()); + return success(); + } + + return failure(); + } +}; +} // namespace + +void ConcatenateOp::getCanonicalizationPatterns( + OwningRewritePatternList& results, MLIRContext* context) { + results.insert(context); +} + +template +static Attribute foldConcatenateHelper(ConcatenateOp* op, + ArrayRef operands) { + auto axis = op->dimension().getLimitedValue(); + auto type = op->getType().cast(); + + SmallVector values; + auto shape = type.getShape(); + + size_t top_size = 1; + for (int i = 0; i < axis; i++) { + top_size = top_size * shape[i]; + } + + for (size_t i = 0; i < top_size; i++) { + for (auto operand : operands) { + DenseElementsAttr attr = operand.cast(); + size_t bottom_size = attr.getNumElements() / top_size; + auto iter = attr.getValues().begin() + i * bottom_size; + values.append(iter, iter + bottom_size); + } + } + + return DenseElementsAttr::get(type, values); +} + +static Attribute foldConcatenate(ConcatenateOp* op, + ArrayRef operands) { + for (auto operand : operands) { + if (!operand) return {}; + } + + auto type = op->getResult().getType().cast(); + auto etype = type.getElementType(); + if (etype.isa()) { + return foldConcatenateHelper(op, operands); + } + + if (etype.isa()) { + return foldConcatenateHelper(op, operands); + } + + return {}; +} + OpFoldResult ConcatenateOp::fold(ArrayRef operands) { if (getNumOperands() == 1) return getOperand(0); - return {}; + + ShapedType type = getResult().getType().cast(); + if (!type.hasStaticShape()) return {}; + + auto axis = dimension().getLimitedValue(); + if (auto attr = foldConcatenate(this, operands)) { + return attr; + } + + llvm::SmallVector new_operands; + for (auto operand : getOperands()) { + auto ty = operand.getType().cast(); + if (ty.getDimSize(axis) != 0) { + return {}; + } + } + + return DenseElementsAttr::get(type, ArrayRef()); } static LogicalResult Verify(ConcatenateOp op) { @@ -832,15 +937,106 @@ static LogicalResult Verify(ConcatenateOp op) { return success(); } +//===----------------------------------------------------------------------===// +// DynamicReshapeOp +//===----------------------------------------------------------------------===// + +static LogicalResult Verify(DynamicReshapeOp op) { + auto result_type = op.result().getType().dyn_cast(); + auto output_shape_type = + op.output_shape().getType().dyn_cast(); + if (result_type && output_shape_type && output_shape_type.hasStaticShape() && + output_shape_type.getDimSize(0) != result_type.getRank()) { + return op.emitError() << "output should have a rank equal to the number of " + "elements in output_shape"; + } + return success(); +} + +namespace { +class DynamicReshapeOpNotActuallyDynamic + : public OpRewritePattern { + public: + using OpRewritePattern::OpRewritePattern; + LogicalResult matchAndRewrite(DynamicReshapeOp op, + PatternRewriter& rewriter) const override { + auto type = op.result().getType().dyn_cast(); + if (!type || !type.hasStaticShape()) { + return rewriter.notifyMatchFailure(op, "requires static shape 
tensor"); + } + rewriter.replaceOpWithNewOp(op, op.getType(), op.operand()); + return success(); + } +}; +} // namespace + +void DynamicReshapeOp::getCanonicalizationPatterns( + OwningRewritePatternList& results, MLIRContext* context) { + results.insert(context); +} + //===----------------------------------------------------------------------===// // DynamicSliceOp //===----------------------------------------------------------------------===// +namespace { +// Canonicalizes DynamicSlice ops that can be replaced instead with Slice ops. +// This canonicalization is applied the case when the `begin` input values are +// compile time constants and thus can be made into a tensor. +struct DynamicSliceToSlice : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(DynamicSliceOp dynamic_slice, + PatternRewriter& rewriter) const override { + Value input = dynamic_slice.operand(); + auto input_tensor = input.getType().dyn_cast(); + if (!input_tensor) return failure(); + + SmallVector temp_start_indices; + for (Value start : dynamic_slice.start_indices()) { + APInt val; + if (!matchPattern(start, m_ConstantInt(&val))) { + return failure(); + } + temp_start_indices.push_back(*(val.getRawData())); + } + + // At this point we've determined that the start indices are all constants; + // pack them into a single tensor. + auto loc = dynamic_slice.getLoc(); + int64_t input_rank = input_tensor.getRank(); + auto slice_start_indices = + GetI64ElementsAttr(temp_start_indices, &rewriter); + DenseIntElementsAttr slice_limits = BuildSliceLimits( + slice_start_indices, dynamic_slice.slice_sizes(), &rewriter); + DenseIntElementsAttr slice_strides = + GetI64ElementsAttr(SmallVector(input_rank, 1), &rewriter); + auto result = rewriter.create(loc, input, slice_start_indices, + slice_limits, slice_strides); + rewriter.replaceOp(dynamic_slice, {result}); + return success(); + } +}; + +} // namespace + void DynamicSliceOp::getCanonicalizationPatterns( OwningRewritePatternList& results, MLIRContext* context) { results.insert(context); } +// Verifies that the number of slice sizes and the number of start indices match +static LogicalResult Verify(DynamicSliceOp op) { + int num_slice_sizes = op.slice_sizes().getNumElements(); + int num_start_indices = op.start_indices().size(); + if (num_start_indices != num_slice_sizes) { + return op.emitOpError() + << "has mismatched number of slice sizes (" << num_slice_sizes + << ") and number of start indices (" << num_start_indices << ")"; + } + return success(); +} + //===----------------------------------------------------------------------===// // InfeedOp //===----------------------------------------------------------------------===// @@ -969,36 +1165,27 @@ static LogicalResult Verify(RecvOp op) { OpFoldResult CopyOp::fold(ArrayRef operands) { return getOperand(); } -//===----------------------------------------------------------------------===// -// ReshapeOp -//===----------------------------------------------------------------------===// - -OpFoldResult ReshapeOp::fold(ArrayRef operands) { - if (getOperand().getType() == getType()) { - return getOperand(); - } - - if (auto prev_op = - dyn_cast_or_null(getOperand().getDefiningOp())) { - setOperand(prev_op.getOperand()); - return getResult(); - } - - if (auto elements = operands.front().dyn_cast_or_null()) { - return elements.reshape(getResult().getType().cast()); - } - - return {}; -} - //===----------------------------------------------------------------------===// // ReverseOp 
//===----------------------------------------------------------------------===// OpFoldResult ReverseOp::fold(ArrayRef operands) { + auto input = operand(); + // No dimensions to reverse. - if (dimensions().getNumElements() == 0) return operand(); - return nullptr; + if (dimensions().getNumElements() == 0) return input; + + llvm::SmallVector new_dims; + new_dims.reserve(dimensions().getNumElements()); + + auto shaped_type = input.getType().cast(); + for (auto dim : dimensions().getValues()) { + if (shaped_type.getDimSize(dim.getLimitedValue()) != 1) { + return nullptr; + } + } + + return input; } //===----------------------------------------------------------------------===// @@ -1027,7 +1214,7 @@ static TensorType GetReduceResultType(Type operand_ty, return RankedTensorType::get(shape, element_ty); } -void ReduceOp::build(Builder* builder, OperationState& state, +void ReduceOp::build(OpBuilder& builder, OperationState& state, ValueRange operands, ValueRange init_values, DenseIntElementsAttr dimensions) { SmallVector result_ty; @@ -1035,7 +1222,7 @@ void ReduceOp::build(Builder* builder, OperationState& state, for (Value operand : operands) { result_ty.push_back( - GetReduceResultType(operand.getType(), dimensions, builder)); + GetReduceResultType(operand.getType(), dimensions, &builder)); } build(builder, state, result_ty, operands, init_values, dimensions); } @@ -1066,7 +1253,7 @@ static LogicalResult Verify(SelectOp op) { // the return type based on operand type. LogicalResult SelectOp::inferReturnTypes( MLIRContext*, Optional location, ValueRange operands, - ArrayRef attributes, RegionRange regions, + DictionaryAttr attributes, RegionRange regions, SmallVectorImpl& inferredReturnTypes) { auto x_type = operands[1].getType(); auto y_type = operands[2].getType(); @@ -1171,117 +1358,205 @@ static LogicalResult Verify(PadOp op) { //===----------------------------------------------------------------------===// static LogicalResult Verify(ReshapeOp op) { - auto operand_ty = op.operand().getType().cast(); + // If the operand type is dynamically shaped there is nothing to verify. + auto operand_ty = op.operand().getType().cast(); if (!operand_ty || !operand_ty.hasStaticShape()) return success(); - int64_t num_input_elements = operand_ty.getNumElements(); - auto out_ty = op.getType().cast(); - if (out_ty && out_ty.hasStaticShape()) { - int64_t num_output_elements = out_ty.getNumElements(); - if (num_input_elements != num_output_elements) - return op.emitOpError() - << "number of output elements (" << num_output_elements - << ") doesn't match expected number of elements (" - << num_input_elements << ")"; - } + // If the operand type is statically shaped (not required) the number of + // elements must match that of the result type. 
+ auto result_ty = op.getType().cast(); + assert(result_ty && result_ty.hasStaticShape() && + "result type must be statically shaped"); + int64_t num_result_elements = result_ty.getNumElements(); + int64_t num_operand_elements = operand_ty.getNumElements(); + if (num_result_elements != num_operand_elements) + return op.emitOpError() + << "number of output elements (" << num_result_elements + << ") doesn't match expected number of elements (" + << num_operand_elements << ")"; + return success(); } +OpFoldResult ReshapeOp::fold(ArrayRef operands) { + if (getOperand().getType() == getType()) { + return getOperand(); + } + + if (auto prev_op = + dyn_cast_or_null(getOperand().getDefiningOp())) { + setOperand(prev_op.getOperand()); + return getResult(); + } + + if (auto elements = operands.front().dyn_cast_or_null()) { + return elements.reshape(getResult().getType().cast()); + } + + return {}; +} + //===----------------------------------------------------------------------===// // BinaryOps //===----------------------------------------------------------------------===// namespace { -// Gets the resulting type from a broadcast between two types. -static Type GetBroadcastType(Builder* builder, Type x, Type y, - Type element_type, - DenseIntElementsAttr broadcast_dimensions) { + +// Updates the element type of a (presumed) tensor type 'x', returning either +// a permuted UnrankedTensorType or RankedTensorType. +static Type UpdateResultElementType(Builder* builder, Type x, + Type element_type) { auto x_ranked = x.dyn_cast(); - auto y_ranked = y.dyn_cast(); - if (!x_ranked || !y_ranked) { + if (!x_ranked) { return UnrankedTensorType::get(element_type); } auto shape_x = x_ranked.getShape(); - auto shape_y = y_ranked.getShape(); - - if (shape_x.size() == shape_y.size()) { - llvm::SmallVector out_shape(shape_x.size()); - for (int i = 0; i < shape_x.size(); i++) { - auto x_val = shape_x[i]; - auto y_val = shape_y[i]; - if (x_val == -1 || y_val == -1) { - out_shape[i] = -1; - } else { - out_shape[i] = std::max(x_val, y_val); - } - } - return RankedTensorType::get(out_shape, element_type); - } - - // Return unranked tensor for invalid broadcast dimensions. - if (!broadcast_dimensions) return UnrankedTensorType::get(element_type); - - auto shape_large = shape_x.size() > shape_y.size() ? shape_x : shape_y; - auto shape_small = shape_x.size() <= shape_y.size() ? shape_x : shape_y; - - llvm::SmallVector out_shape(shape_large.begin(), - shape_large.end()); - - // Update according to the broadcast dimensions. 
- for (auto index_pair : llvm::enumerate(broadcast_dimensions.getIntValues())) { - auto old_value = out_shape[index_pair.value().getSExtValue()]; - auto new_value = shape_small[index_pair.index()]; - if (old_value != -1 && (new_value == -1 || new_value > old_value)) { - out_shape[index_pair.value().getSExtValue()] = new_value; - } - } - - return RankedTensorType::get(out_shape, element_type); + return RankedTensorType::get(shape_x, element_type); } } // namespace -#define BINARY_BUILDER(Op) \ - void Op::build(Builder* builder, OperationState& result, Value left, \ - Value right, DenseIntElementsAttr broadcast_dimensions) { \ - auto type = GetBroadcastType(builder, left.getType().cast(), \ - right.getType().cast(), \ - getElementTypeOrSelf(right.getType()), \ - broadcast_dimensions); \ - return Op::build(builder, result, type, left, right, \ - broadcast_dimensions); \ +template +static Attribute BinaryFolder(Op* op, ArrayRef attrs) { + if (!attrs[0] || !attrs[1]) return {}; + + DenseElementsAttr lhs = attrs[0].dyn_cast(); + DenseElementsAttr rhs = attrs[1].dyn_cast(); + if (!lhs || !rhs) return {}; + + ShapedType type = op->getType().template cast(); + if (!type.hasStaticShape()) { + return {}; } -BINARY_BUILDER(AddOp); -BINARY_BUILDER(AndOp); -BINARY_BUILDER(Atan2Op); -BINARY_BUILDER(DivOp); -BINARY_BUILDER(MaxOp); -BINARY_BUILDER(MinOp); -BINARY_BUILDER(MulOp); -BINARY_BUILDER(OrOp); -BINARY_BUILDER(PowOp); -BINARY_BUILDER(RemOp); -BINARY_BUILDER(ShiftLeftOp); -BINARY_BUILDER(ShiftRightArithmeticOp); -BINARY_BUILDER(ShiftRightLogicalOp); -BINARY_BUILDER(SubOp); -BINARY_BUILDER(XorOp); + Type etype = type.getElementType(); -#undef BINARY_BUILDER + // Evaluate for integer values. + if (!etype.isa()) { + return {}; + } + + SmallVector values; + values.reserve(lhs.getNumElements()); + for (const auto zip : + llvm::zip(lhs.getValues(), rhs.getValues())) { + values.push_back(Convert()(std::get<0>(zip), std::get<1>(zip))); + } + + return DenseElementsAttr::get(type, values); +} + +#define BINARY_FOLDER(Op, Func) \ + OpFoldResult Op::fold(ArrayRef attrs) { \ + if (getElementTypeOrSelf(getType()).isa()) \ + return BinaryFolder>(this, attrs); \ + if (getElementTypeOrSelf(getType()).isa()) \ + return BinaryFolder>(this, attrs); \ + return {}; \ + } + +BINARY_FOLDER(AddOp, std::plus); +BINARY_FOLDER(SubOp, std::minus); +BINARY_FOLDER(MulOp, std::multiplies); + +#undef BINARY_FOLDER //===----------------------------------------------------------------------===// // SliceOp //===----------------------------------------------------------------------===// -void SliceOp::build(Builder* builder, OperationState& result, Value operand, +void SliceOp::build(OpBuilder& builder, OperationState& result, Value operand, DenseIntElementsAttr start_indices, DenseIntElementsAttr limit_indices, DenseIntElementsAttr strides) { - return build( - builder, result, - InferOutputTypes(builder, operand, start_indices, limit_indices, strides), - operand, start_indices, limit_indices, strides); + return build(builder, result, + InferOutputTypes(&builder, operand, start_indices, limit_indices, + strides), + operand, start_indices, limit_indices, strides); +} + +template +static void SliceElements(I values, ArrayRef sizes, + ArrayRef starts, ArrayRef limits, + ArrayRef strides, + llvm::SmallVectorImpl* out_values) { + assert(starts.size() == limits.size()); + assert(starts.size() == strides.size()); + if (starts.empty()) return; + + int64_t start = starts.front(); + int64_t limit = limits.front(); + int64_t stride = 
strides.front(); + if (starts.size() == 1) { + for (int i = start; i < limit; i += stride) { + out_values->push_back(*(values + i)); + } + return; + } + + for (; start < limit; start += stride) { + auto begin = values + start * sizes.front(); + SliceElements(begin, sizes.drop_front(), starts.drop_front(), + limits.drop_front(), strides.drop_front(), out_values); + } +} + +template +static Attribute FoldSlice(SliceOp* op, I values) { + auto start = llvm::to_vector<6>(op->start_indices().getValues()); + auto limit = llvm::to_vector<6>(op->limit_indices().getValues()); + auto stride = llvm::to_vector<6>(op->strides().getValues()); + + auto result_type = op->operand().getType().cast(); + if (!result_type.hasStaticShape()) return {}; + + auto shape = result_type.getShape(); + int64_t count = result_type.getNumElements(); + // Compute the striding for each dimension. + llvm::SmallVector sizes; + sizes.reserve(shape.size()); + for (auto v : shape) { + count = count / v; + sizes.push_back(count); + } + + llvm::SmallVector out_values; + out_values.reserve(result_type.getNumElements()); + SliceElements(values, sizes, start, limit, stride, &out_values); + + return DenseElementsAttr::get(op->getResult().getType().cast(), + out_values); +} + +OpFoldResult SliceOp::fold(ArrayRef operands) { + // Check if the SliceOp is a NoOp operation. + auto operand_shape = getOperand().getType().cast().getShape(); + auto result_type = getResult().getType().cast(); + auto result_shape = result_type.getShape(); + + if (result_type.hasStaticShape() && (operand_shape == result_shape)) { + return getOperand(); + } + + if (operands.empty() || !operands.front()) return {}; + + // Evaluate for statically valued inputs. + DenseElementsAttr elements = operands.front().dyn_cast(); + if (!elements) return {}; + + auto etype = elements.getType().getElementType(); + if (etype.isa()) { + return FoldSlice( + this, elements.getIntValues().begin()); + } else if (etype.isa()) { + return FoldSlice< + llvm::mapped_iterator>, + APFloat>(this, elements.getFloatValues().begin()); + } + + return {}; } // Returns output dimension size for slice result for the given arguments. 
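Note on the SliceOp folder added in the hunk above: it first derives row-major strides from the operand's static shape (dividing the remaining element count by each extent in turn) and then recursively copies the elements selected by each (start, limit, stride) triple. Below is a minimal, self-contained sketch of that same traversal in plain C++, independent of MLIR and using hypothetical names; it illustrates the idea rather than reproducing the patch's code.

#include <cstdint>
#include <iostream>
#include <vector>

// Recursively copies the elements selected by (starts, limits, strides) from a
// row-major buffer whose per-dimension strides are given in `sizes`.
static void SliceElementsSketch(const int64_t* values,
                                const std::vector<int64_t>& sizes,
                                const std::vector<int64_t>& starts,
                                const std::vector<int64_t>& limits,
                                const std::vector<int64_t>& strides, size_t dim,
                                std::vector<int64_t>* out) {
  if (dim + 1 == starts.size()) {
    for (int64_t i = starts[dim]; i < limits[dim]; i += strides[dim])
      out->push_back(values[i]);
    return;
  }
  for (int64_t i = starts[dim]; i < limits[dim]; i += strides[dim])
    SliceElementsSketch(values + i * sizes[dim], sizes, starts, limits, strides,
                        dim + 1, out);
}

int main() {
  // A 2x3 row-major tensor: [[0, 1, 2], [3, 4, 5]].
  std::vector<int64_t> data = {0, 1, 2, 3, 4, 5};
  std::vector<int64_t> shape = {2, 3};

  // Row-major strides, computed the same way as in the folder above: start
  // from the element count and divide by each extent in turn -> {3, 1}.
  std::vector<int64_t> sizes;
  int64_t count = static_cast<int64_t>(data.size());
  for (int64_t extent : shape) {
    count /= extent;
    sizes.push_back(count);
  }

  // Slice [0:2:1, 1:3:1] of the tensor, i.e. columns 1..2 of both rows.
  std::vector<int64_t> out;
  SliceElementsSketch(data.data(), sizes, {0, 1}, {2, 3}, {1, 1}, 0, &out);
  for (int64_t v : out) std::cout << v << " ";  // prints: 1 2 4 5
  std::cout << "\n";
}

Running the sketch prints "1 2 4 5", the [0:2, 1:3] slice of the 2x3 example tensor, which mirrors what the folder materializes as a DenseElementsAttr.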
@@ -1328,16 +1603,16 @@ Type SliceOp::InferOutputTypes(Builder* builder, Value operand, // SortOp //===----------------------------------------------------------------------===// -void SortOp::build(Builder* builder, OperationState& state, ValueRange operands, - int64_t dimension, bool is_stable) { +void SortOp::build(OpBuilder& builder, OperationState& state, + ValueRange operands, int64_t dimension, bool is_stable) { state.addOperands(operands); - state.addAttribute("dimension", builder->getI64IntegerAttr(dimension)); - state.addAttribute("is_stable", builder->getBoolAttr(dimension)); + state.addAttribute("dimension", builder.getI64IntegerAttr(dimension)); + state.addAttribute("is_stable", builder.getBoolAttr(dimension)); SmallVector element_types; element_types.reserve(operands.size()); for (Value operand : operands) element_types.push_back(operand.getType()); - state.addTypes(builder->getTupleType(element_types)); + state.addTypes(builder.getTupleType(element_types)); state.addRegion(); } @@ -1512,24 +1787,24 @@ static LogicalResult Verify(TriangularSolveOp op) { // GetTupleElementOp //===----------------------------------------------------------------------===// -void GetTupleElementOp::build(Builder* builder, OperationState& result, +void GetTupleElementOp::build(OpBuilder& builder, OperationState& result, Value tuple, int32_t index) { if (auto tuple_type = tuple.getType().dyn_cast()) { auto element_type = tuple_type.getType(index); build(builder, result, element_type, tuple, - builder->getI32IntegerAttr(index)); + builder.getI32IntegerAttr(index)); return; } build(builder, result, tuple.getType(), tuple, - builder->getI32IntegerAttr(index)); + builder.getI32IntegerAttr(index)); } //===----------------------------------------------------------------------===// // TupleOp //===----------------------------------------------------------------------===// -void TupleOp::build(Builder* builder, OperationState& result, +void TupleOp::build(OpBuilder& builder, OperationState& result, ValueRange values) { SmallVector types; types.reserve(values.size()); @@ -1537,7 +1812,7 @@ void TupleOp::build(Builder* builder, OperationState& result, types.push_back(val.getType()); } - build(builder, result, builder->getTupleType(types), values); + build(builder, result, builder.getTupleType(types), values); } //===----------------------------------------------------------------------===// @@ -1553,13 +1828,11 @@ void UnaryEinsumOp::getCanonicalizationPatterns( // CompareOp //===----------------------------------------------------------------------===// -void CompareOp::build(Builder* builder, OperationState& result, Value lhs, - Value rhs, DenseIntElementsAttr broadcast_dimensions, - StringAttr comparison_direction) { - auto new_type = GetBroadcastType(builder, lhs.getType(), rhs.getType(), - builder->getI1Type(), broadcast_dimensions); - build(builder, result, new_type, lhs, rhs, broadcast_dimensions, - comparison_direction); +void CompareOp::build(OpBuilder& builder, OperationState& result, Value lhs, + Value rhs, StringAttr comparison_direction) { + auto new_type = + UpdateResultElementType(&builder, lhs.getType(), builder.getI1Type()); + build(builder, result, new_type, lhs, rhs, comparison_direction); } #define GET_OP_CLASSES diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_ops.h b/tensorflow/compiler/mlir/xla/ir/hlo_ops.h index 02f36836f5e..9725a0684f6 100644 --- a/tensorflow/compiler/mlir/xla/ir/hlo_ops.h +++ b/tensorflow/compiler/mlir/xla/ir/hlo_ops.h @@ -29,8 +29,7 @@ limitations under the 
License. #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/IR/Types.h" // from @llvm-project #include "mlir/Interfaces/InferTypeOpInterface.h" // from @llvm-project -#include "mlir/Interfaces/SideEffects.h" // from @llvm-project -#include "mlir/Support/Functional.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project namespace mlir { class OpBuilder; diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_ops.td b/tensorflow/compiler/mlir/xla/ir/hlo_ops.td index abfc42b20d9..99801f1618e 100644 --- a/tensorflow/compiler/mlir/xla/ir/hlo_ops.td +++ b/tensorflow/compiler/mlir/xla/ir/hlo_ops.td @@ -23,7 +23,7 @@ limitations under the License. include "mlir/IR/OpBase.td" include "mlir/Interfaces/InferTypeOpInterface.td" -include "mlir/Interfaces/SideEffects.td" +include "mlir/Interfaces/SideEffectInterfaces.td" include "tensorflow/compiler/mlir/xla/ir/hlo_ops_base.td" include "tensorflow/compiler/mlir/xla/ir/hlo_utils.td" @@ -46,8 +46,9 @@ class HLO_Op traits> : // XLA nullary op definitions. //===----------------------------------------------------------------------===// -def HLO_ConstOp : HLO_Op<"constant", [ConstantLike, NoSideEffect]>, - BASE_HLO_ConstOp { +def HLO_ConstOp : HLO_Op<"constant", + [ConstantLike, NoSideEffect, AllTypesMatch<["value", "output"]>]>, + BASE_HLO_ConstOp { let arguments = (ins ElementsAttr:$value ); @@ -57,7 +58,7 @@ def HLO_ConstOp : HLO_Op<"constant", [ConstantLike, NoSideEffect]>, ); let builders = [OpBuilder< - "Builder *builder, OperationState &result, Attribute value" + "OpBuilder &builder, OperationState &result, Attribute value" >]; let printer = [{ return Print(*this, &p); }]; @@ -94,6 +95,7 @@ def HLO_CreateTokenOp : HLO_Op<"create_token", [NoSideEffect]> { // XLA unary elementwise op definitions. //===----------------------------------------------------------------------===// // See https://www.tensorflow.org/xla/operation_semantics#element-wise_unary_functions + class HLO_UnaryElementwiseOp traits, Type TensorType>: HLO_Op { @@ -102,8 +104,7 @@ class HLO_UnaryElementwiseOp traits, let extraClassDeclaration = [{ static LogicalResult inferReturnTypeComponents( MLIRContext* context, Optional location, - ValueRange operands, ArrayRef attributes, - RegionRange regions, + ValueRange operands, DictionaryAttr attributes, RegionRange regions, SmallVectorImpl& inferedReturnShapes) { return failure(); } @@ -117,9 +118,10 @@ class HLO_UnaryElementwiseOp traits, // Abs supports complex to real, so element type is not guaranteed to match. 
def HLO_AbsOp: HLO_UnaryElementwiseOp<"abs", - [NoSideEffect, SameOperandsAndResultShape], HLO_Tensor>, BASE_HLO_AbsOp { + [NoSideEffect, SameOperandsAndResultShape], + TensorOf<[HLO_SInt, AnyFloat, HLO_Complex]>>, BASE_HLO_AbsOp { let builders = [OpBuilder< - "Builder *builder, OperationState &result, Value operand" + "OpBuilder &builder, OperationState &result, Value operand" >]; } @@ -131,7 +133,7 @@ def HLO_ConvertOp : HLO_UnaryElementwiseOp< BASE_HLO_ConvertOp { let builders = [OpBuilder< - "Builder *, OperationState &tblgen_state, Value operand, " + "OpBuilder &, OperationState &tblgen_state, Value operand, " "Type result_element_ty" >]; @@ -159,6 +161,16 @@ def HLO_Expm1Op: HLO_UnaryElementwiseOp<"exponential_minus_one", def HLO_FloorOp: HLO_UnaryElementwiseOp<"floor", [NoSideEffect, SameOperandsAndResultType], HLO_FpTensor>, BASE_HLO_FloorOp; +def HLO_ImagOp: HLO_Op< + "imag", [NoSideEffect, SameOperandsAndResultShape]>, BASE_HLO_ImagOp { + let builders = [OpBuilder< + "OpBuilder &, OperationState &tblgen_state, Value val">]; + + let arguments = (ins HLO_ComplexTensor); + let results = (outs HLO_FpTensor); + let hasFolder = 1; +} + def HLO_IsFiniteOp: HLO_UnaryElementwiseOp<"is_finite", [NoSideEffect, SameOperandsAndResultShape], HLO_Tensor>, BASE_HLO_IsFiniteOp { @@ -186,6 +198,16 @@ def HLO_PopulationCountOp: HLO_UnaryElementwiseOp<"popcnt", [NoSideEffect, SameOperandsAndResultType], HLO_IntTensor>, BASE_HLO_PopulationCountOp; +def HLO_RealOp: HLO_Op< + "real", [NoSideEffect, SameOperandsAndResultShape]>, BASE_HLO_RealOp { + let builders = [OpBuilder< + "OpBuilder &, OperationState &tblgen_state, Value val">]; + + let arguments = (ins HLO_ComplexTensor); + let results = (outs HLO_FpTensor); + let hasFolder = 1; +} + def HLO_RoundOp: HLO_UnaryElementwiseOp<"round_nearest_afz", [NoSideEffect, SameOperandsAndResultType], HLO_FpTensor>, BASE_HLO_RoundOp; @@ -194,7 +216,8 @@ def HLO_RsqrtOp: HLO_UnaryElementwiseOp<"rsqrt", BASE_HLO_RsqrtOp; def HLO_SignOp: HLO_UnaryElementwiseOp<"sign", - [NoSideEffect, SameOperandsAndResultType], HLO_IntFpOrComplexTensor>, + [NoSideEffect, SameOperandsAndResultType], + TensorOf<[HLO_SInt, AnyFloat, HLO_Complex]>>, BASE_HLO_SignOp; def HLO_SinOp: HLO_UnaryElementwiseOp<"sine", @@ -206,67 +229,25 @@ def HLO_SqrtOp: HLO_UnaryElementwiseOp<"sqrt", BASE_HLO_SqrtOp; def HLO_TanhOp: HLO_UnaryElementwiseOp<"tanh", - [ResultsAreFloatLike, NoSideEffect, SameOperandsAndResultType], + [NoSideEffect, SameOperandsAndResultType], HLO_FpOrComplexTensor>, BASE_HLO_TanhOp; -//===----------------------------------------------------------------------===// -// XLA complex unary elementwise op definitions. 
-//===----------------------------------------------------------------------===// -// See https://www.tensorflow.org/xla/operation_semantics#element-wise_unary_functions - -def HLO_ComplexOp: HLO_Op<"complex", - [NoSideEffect, SameOperandsElementType, SameOperandsAndResultShape]>, - BASE_HLO_ComplexOp { - let builders = [OpBuilder< - "Builder *, OperationState &tblgen_state, Value lhs, Value rhs">]; - - let arguments = (ins HLO_FpTensor:$lhs, HLO_FpTensor:$rhs); - let results = (outs HLO_ComplexTensor); - let hasFolder = 1; -} - -def HLO_ImagOp: HLO_Op< - "imag", [NoSideEffect, SameOperandsAndResultShape]>, BASE_HLO_ImagOp { - let builders = [OpBuilder< - "Builder *, OperationState &tblgen_state, Value val">]; - - let arguments = (ins HLO_ComplexTensor); - let results = (outs HLO_FpTensor); - let hasFolder = 1; -} - -def HLO_RealOp: HLO_Op< - "real", [NoSideEffect, SameOperandsAndResultShape]>, BASE_HLO_RealOp { - let builders = [OpBuilder< - "Builder *, OperationState &tblgen_state, Value val">]; - - let arguments = (ins HLO_ComplexTensor); - let results = (outs HLO_FpTensor); - let hasFolder = 1; -} - //===----------------------------------------------------------------------===// // XLA binary elementwise op definitions. //===----------------------------------------------------------------------===// - // See https://www.tensorflow.org/xla/operation_semantics#element-wise_binary_arithmetic_operations + class HLO_BinaryElementwiseOp traits> : HLO_Op { let arguments = (ins HLO_Tensor:$lhs, - HLO_Tensor:$rhs, - OptionalAttr:$broadcast_dimensions + HLO_Tensor:$rhs ); - let builders = [OpBuilder< - "Builder *builder, OperationState &result, Value left, Value right, " - "DenseIntElementsAttr broadcast_dimensions" - >]; - let extraClassDeclaration = [{ static LogicalResult inferReturnTypeComponents( MLIRContext* context, Optional location, ValueRange operands, - ArrayRef attributes, RegionRange regions, + DictionaryAttr attributes, RegionRange regions, SmallVectorImpl& inferedReturnShapes) { return failure(); } @@ -283,40 +264,60 @@ class HLO_BinaryElementwiseOp traits> : } def HLO_AddOp : HLO_BinaryElementwiseOp<"add", - [Commutative, NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_AddOp; + [Commutative, NoSideEffect, SameOperandsAndResultType]>, BASE_HLO_AddOp { + let hasFolder = 1; +} def HLO_Atan2Op : HLO_BinaryElementwiseOp<"atan2", - [NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_Atan2Op; + [NoSideEffect, SameOperandsAndResultType]>, BASE_HLO_Atan2Op; + +def HLO_ComplexOp: HLO_Op<"complex", + [NoSideEffect, SameOperandsAndResultShape]>, + BASE_HLO_ComplexOp { + let builders = [OpBuilder< + "OpBuilder &, OperationState &tblgen_state, Value lhs, Value rhs">]; + + let arguments = (ins HLO_FpTensor:$lhs, HLO_FpTensor:$rhs); + let results = (outs HLO_ComplexTensor); + let hasFolder = 1; +} def HLO_DivOp : HLO_BinaryElementwiseOp<"divide", - [NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_DivOp; + [NoSideEffect, SameOperandsAndResultType]>, BASE_HLO_DivOp { +} def HLO_MaxOp : HLO_BinaryElementwiseOp<"maximum", - [Commutative, NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_MaxOp; + [Commutative, NoSideEffect, SameOperandsAndResultType]>, BASE_HLO_MaxOp { +} def HLO_MinOp : HLO_BinaryElementwiseOp<"minimum", - [Commutative, NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_MinOp; + [Commutative, NoSideEffect, SameOperandsAndResultType]>, BASE_HLO_MinOp { +} def HLO_MulOp : HLO_BinaryElementwiseOp<"multiply", - [Commutative, NoSideEffect, 
SameOperandsAndResultElementType]>, BASE_HLO_MulOp; + [Commutative, NoSideEffect, SameOperandsAndResultType]>, BASE_HLO_MulOp { + let hasFolder = 1; +} def HLO_PowOp : HLO_BinaryElementwiseOp<"power", - [NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_PowOp; + [NoSideEffect, SameOperandsAndResultType]>, BASE_HLO_PowOp; def HLO_RemOp : HLO_BinaryElementwiseOp<"remainder", - [NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_RemOp; + [NoSideEffect, SameOperandsAndResultType]>, BASE_HLO_RemOp; def HLO_ShiftLeftOp : HLO_BinaryElementwiseOp<"shift_left", - [NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_ShiftLeftOp; + [NoSideEffect, SameOperandsAndResultType]>, BASE_HLO_ShiftLeftOp; def HLO_ShiftRightArithmeticOp : HLO_BinaryElementwiseOp<"shift_right_arithmetic", - [NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_ShiftRightArithmeticOp; + [NoSideEffect, SameOperandsAndResultType]>, BASE_HLO_ShiftRightArithmeticOp; def HLO_ShiftRightLogicalOp : HLO_BinaryElementwiseOp<"shift_right_logical", - [NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_ShiftRightLogicalOp; + [NoSideEffect, SameOperandsAndResultType]>, BASE_HLO_ShiftRightLogicalOp; def HLO_SubOp : HLO_BinaryElementwiseOp<"subtract", - [NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_SubOp; + [NoSideEffect, SameOperandsAndResultType]>, BASE_HLO_SubOp { + let hasFolder = 1; +} //===----------------------------------------------------------------------===// // XLA binary elementwise op definitions. @@ -324,11 +325,11 @@ def HLO_SubOp : HLO_BinaryElementwiseOp<"subtract", // See https://www.tensorflow.org/xla/operation_semantics#element-wise_binary_arithmetic_operations class HLO_BinaryLogicalElementwiseOp : - HLO_BinaryElementwiseOp { + HLO_BinaryElementwiseOp< + mnemonic, [Commutative, NoSideEffect, SameOperandsAndResultType]> { let arguments = (ins HLO_PredOrIntTensor:$lhs, - HLO_PredOrIntTensor:$rhs, - OptionalAttr:$broadcast_dimensions + HLO_PredOrIntTensor:$rhs ); } @@ -477,8 +478,11 @@ def HLO_AfterAllOp : HLO_Op<"after_all", []> { let results = (outs HLO_Token); } -def HLO_ConditionalOp: HLO_Op<"conditional", [NoSideEffect]> { - string summary = "Conditional operator"; +// Xla Client API has two separate calls for indexed and predicated conditional, +// although both eventually map to kConditional HLO. IfOp maps to predicated +// conditional use of kConditional HLO. 
+def HLO_IfOp: HLO_Op<"if", []> { + string summary = "If operator"; string description = [{ Returns the result of executing either a true or false function depending on @@ -501,7 +505,7 @@ def HLO_ConditionalOp: HLO_Op<"conditional", [NoSideEffect]> { let hasCustomHLOConverter = 1; } -def HLO_WhileOp: HLO_Op<"while", [NoSideEffect, SameOperandsAndResultType]> { +def HLO_WhileOp: HLO_Op<"while", [SameOperandsAndResultType]> { string summary = "While operator"; string description = [{ @@ -562,7 +566,7 @@ def HLO_ReduceOp: HLO_Op<"reduce", [ let results = (outs Variadic); let builders = [OpBuilder< - "Builder *, OperationState &state, ValueRange operands, " + "OpBuilder &, OperationState &state, ValueRange operands, " "ValueRange init_values, DenseIntElementsAttr dimensions" >]; @@ -592,7 +596,7 @@ def HLO_GetTupleElementOp: HLO_Op<"get_tuple_element", [NoSideEffect]>, BASE_HLO let hasFolder = 1; let builders = [OpBuilder< - "Builder *builder, OperationState &results, " + "OpBuilder &builder, OperationState &results, " "Value value, int32_t index">]; } @@ -601,29 +605,24 @@ def HLO_TupleOp : HLO_Op<"tuple", [NoSideEffect]>, BASE_HLO_TupleOp { let results = (outs HLO_Tuple); let builders = [OpBuilder< - "Builder *builder, OperationState &results, " + "OpBuilder &builder, OperationState &results, " "ValueRange values">]; } def HLO_CompareOp: HLO_Op<"compare", - [NoSideEffect, SameOperandsElementType]>, BASE_HLO_CompareOp { + [NoSideEffect, SameTypeOperands, SameOperandsAndResultShape]>, + BASE_HLO_CompareOp { let arguments = (ins HLO_Tensor:$lhs, HLO_Tensor:$rhs, - OptionalAttr:$broadcast_dimensions, HLO_ComparisonDirectionAttr:$comparison_direction ); - let builders = [OpBuilder< - "Builder *builder, OperationState &result, Value left, Value right, " - "DenseIntElementsAttr broadcast_dimensions, " - "StringAttr comparison_direction" - >]; let results = (outs HLO_PredTensor); let builders = [OpBuilder< - "Builder *builder, OperationState &result, Value lhs, Value rhs, " - "DenseIntElementsAttr broadcast_dimensions, StringAttr comparison_direction" + "OpBuilder &builder, OperationState &result, Value lhs, Value rhs, " + "StringAttr comparison_direction" >]; } @@ -644,8 +643,10 @@ def HLO_SliceOp: HLO_Op< let results = (outs HLO_Tensor); + let hasFolder = 1; + let builders = [OpBuilder< - "Builder *builder, OperationState &result, Value operand, " + "OpBuilder &builder, OperationState &result, Value operand, " "DenseIntElementsAttr start_indices, DenseIntElementsAttr limit_indices, " "DenseIntElementsAttr strides" >]; @@ -661,11 +662,10 @@ def HLO_SliceOp: HLO_Op< } def HLO_DynamicSliceOp: HLO_Op<"dynamic-slice", - [NoSideEffect, AllElementTypesMatch<["operand", "result"]>, - AllShapesMatch<["start_indices", "slice_sizes"]>]> { + [NoSideEffect, AllElementTypesMatch<["operand", "result"]>]> { let arguments = (ins HLO_Tensor:$operand, - HLO_Tensor:$start_indices, + Variadic:$start_indices, I64ElementsAttr:$slice_sizes ); @@ -679,7 +679,7 @@ def HLO_DynamicUpdateSliceOp: HLO_Op<"dynamic-update-slice", let arguments = (ins HLO_Tensor:$operand, HLO_Tensor:$update, - Variadic:$start_indices + Variadic:$start_indices ); let results = (outs HLO_Tensor:$result); @@ -763,6 +763,7 @@ def HLO_BroadcastInDimOp : HLO_Op<"broadcast_in_dim", let results = (outs HLO_StaticShapeTensor); + let hasFolder = 1; // Only handles a static subset of the legacy format. 
let hasCustomHLOConverter = 1; } @@ -776,7 +777,7 @@ def HLO_ScalarsToDimensionTensorOp : HLO_Op<"scalars_to_dimension_tensor", compute shape arguments to dynamic operations. }]; - let arguments = (ins Variadic:$scalars); + let arguments = (ins Variadic:$scalars); let results = (outs HLO_DimensionTensor); // Cannot be exported to legacy formats. @@ -842,6 +843,7 @@ def HLO_ConcatenateOp : HLO_Op<"concatenate", let results = (outs HLO_Tensor); + let hasCanonicalizer = 1; let hasFolder = 1; } @@ -1048,12 +1050,32 @@ def HLO_ReshapeOp: HLO_Op<"reshape", [NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_ReshapeOp { let arguments = (ins HLO_Tensor:$operand); - let results = (outs HLO_Tensor); + let results = (outs HLO_StaticShapeTensor); let hasFolder = 1; let hasCustomHLOConverter = 1; } +def HLO_DynamicReshapeOp: HLO_Op<"dynamic_reshape", []> { + let summary = "Reshape a tensor to a given, possibly dynamic, shape."; + let description = [{ + Reshapes `operand` to `output_shape`. + + Requires: + - The length of `output_shape` is equal to the rank of `result`. + - The number of elements in `operand` (that is, the product of extents of + its shape) is equal to the number of elements in `output_shape` (that is, + the product of values in `output_shape`). + }]; + + let arguments = (ins HLO_Tensor:$operand, HLO_DimensionTensor:$output_shape); + let results = (outs HLO_Tensor:$result); + + let hasCanonicalizer = 1; + // Cannot be exported to legacy formats. + let hasCustomHLOConverter = 1; +} + def ScatterDimensionNumbers : StructAttr<"ScatterDimensionNumbers", HLO_Dialect, [StructFieldAttr<"update_window_dims", I64ElementsAttr>, StructFieldAttr<"inserted_window_dims", I64ElementsAttr>, @@ -1130,7 +1152,7 @@ def HLO_SortOp : HLO_Op<"sort", [NoSideEffect]>, BASE_HLO_SortOp { let regions = (region SizedRegion<1>:$comparator); let builders = [OpBuilder< - "Builder *builder, OperationState &state, ValueRange operands, " + "OpBuilder &builder, OperationState &state, ValueRange operands, " "int64_t dimension = -1, bool is_stable = false" >]; diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_ops_base.td b/tensorflow/compiler/mlir/xla/ir/hlo_ops_base.td index 7994026ac3b..b5130eafd0e 100644 --- a/tensorflow/compiler/mlir/xla/ir/hlo_ops_base.td +++ b/tensorflow/compiler/mlir/xla/ir/hlo_ops_base.td @@ -18,9 +18,16 @@ limitations under the License. include "mlir/IR/OpBase.td" -def HLO_Int : SignlessIntOfWidths<[8, 16, 32, 64]>; def HLO_Pred : TypeAlias; +// TODO(hinsu): Use signed integers instead of signless integer which is being +// used for legacy reasons. +def HLO_SInt : SignlessIntOfWidths<[8, 16, 32, 64]>; +def HLO_UInt : UnsignedIntOfWidths<[8, 16, 32, 64]>; +def HLO_Int : AnyTypeOf<[HLO_SInt, HLO_UInt]>; + +def HLO_Complex : Complex>; + // The broadcasting dimensions correspond to a tuple that describes how a // smaller rank shape is broadcast into a larger rank shape. 
For example, // given a 2x3x4 cuboid and a 3x4 matrix, a broadcasting tuple (1,2) means @@ -47,24 +54,26 @@ def HLO_FpTensor : TensorOf<[AnyFloat]>; def HLO_PredTensor : TensorOf<[HLO_Pred]>; -def HLO_Tensor : TensorOf<[AnyFloat, AnySignlessInteger, AnyComplex]>; +def HLO_Tensor : TensorOf<[AnyFloat, HLO_Pred, HLO_Int, HLO_Complex]>; -def HLO_ComplexTensor : TensorOf<[AnyComplex]>; +def HLO_ComplexTensor : TensorOf<[HLO_Complex]>; def HLO_Tuple : NestedTupleOf<[HLO_Tensor, HLO_Token]>; def HLO_TensorOrTuple : AnyTypeOf<[HLO_Tensor, HLO_Tuple]>; +def HLO_DimensionValue : AnyTypeOf<[Index, HLO_Pred, HLO_Int]>; + // Dynamic representation of a shape vector as a tensor. def HLO_DimensionTensor : ShapedContainerType< - [Index, AnySignlessInteger], + [HLO_DimensionValue], And<[IsTensorTypePred, HasAnyRankOfPred<[1]>]>, "a 1D tensor of dimensions">; // In general, static shaped tensor constraints should be avoided unless // it is for a legacy op which is only correct with static shapes. def HLO_StaticShapeTensor : StaticShapeTensorOf<[ - AnyFloat, AnySignlessInteger, AnyComplex]>; + AnyFloat, HLO_Pred, HLO_Int, HLO_Complex]>; //===----------------------------------------------------------------------===// // XLA on tensors combined type definitions. @@ -77,10 +86,10 @@ def HLO_IntOrFpTensor : TensorOf<[HLO_Int, AnyFloat]>; def HLO_PredOrIntTensor : TensorOf<[HLO_Pred, HLO_Int]>; // Any floating-point or complex tensor types -def HLO_FpOrComplexTensor : TensorOf<[AnyFloat, AnyComplex]>; +def HLO_FpOrComplexTensor : TensorOf<[AnyFloat, HLO_Complex]>; // Any int, floating-point or complex tensor types -def HLO_IntFpOrComplexTensor : TensorOf<[HLO_Int, AnyFloat, AnyComplex]>; +def HLO_IntFpOrComplexTensor : TensorOf<[HLO_Int, AnyFloat, HLO_Complex]>; // Any pred, int or floating-point tensor types def HLO_PredIntOrFpTensor : TensorOf<[HLO_Pred, HLO_Int, AnyFloat]>; @@ -143,15 +152,6 @@ class BASE_HLO_ClzOp { }]; } -class BASE_HLO_ComplexOp { - string summary = "Complex operator"; - - string description = [{ - Performs element-wise conversion of a pair of real and imaginary values to - a complex value. - }]; -} - class BASE_HLO_ConvertOp { string summary = "Convert operator"; @@ -393,6 +393,15 @@ class BASE_HLO_AddOp { }]; } +class BASE_HLO_ComplexOp { + string summary = "Complex operator"; + + string description = [{ + Performs element-wise conversion of a pair of real and imaginary values to + a complex value. + }]; +} + class BASE_HLO_DivOp { string summary = "Division operator"; @@ -517,7 +526,7 @@ class BASE_HLO_AndOp { string summary = "Logical and"; string description = [{ - Returns `lhs /\ rhs` element-wise. + Returns `logical_and(lhs, rhs)` element-wise. See https://www.tensorflow.org/xla/operation_semantics#element-wise_binary_arithmetic_operations. @@ -528,7 +537,7 @@ class BASE_HLO_OrOp { string summary = "Logical or"; string description = [{ - Returns `lhs \/ rhs` element-wise. + Returns `logical_or(lhs, rhs)` element-wise. See https://www.tensorflow.org/xla/operation_semantics#element-wise_binary_arithmetic_operations. @@ -539,7 +548,7 @@ class BASE_HLO_XorOp { string summary = "Logical xor"; string description = [{ - Returns `lhs xor rhs` element-wise. + Returns `logical_xor(lhs, rhs)` element-wise. See https://www.tensorflow.org/xla/operation_semantics#element-wise_binary_arithmetic_operations. 
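Aside on the broadcasting-dimensions comment in hlo_ops_base.td above (the 2x3x4 cuboid matched against a 3x4 matrix with tuple (1, 2)): the tuple simply maps each dimension of the smaller-rank shape onto a dimension of the larger-rank shape, and the mapped extents have to agree. The following stand-alone C++ sketch of that check uses hypothetical names and ignores degenerate size-1 broadcasting for brevity; it is an illustration, not code from this patch.

#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

// Returns true if `small_shape` can be matched into `large_shape` using the
// given broadcasting tuple: dimension i of the small shape is mapped to
// dimension broadcast_dims[i] of the large shape and must have the same
// extent.
static bool BroadcastDimsCompatible(
    const std::vector<int64_t>& large_shape,
    const std::vector<int64_t>& small_shape,
    const std::vector<int64_t>& broadcast_dims) {
  assert(small_shape.size() == broadcast_dims.size());
  for (size_t i = 0; i < small_shape.size(); ++i) {
    if (small_shape[i] != large_shape[broadcast_dims[i]]) return false;
  }
  return true;
}

int main() {
  // The example from the comment: a 2x3x4 cuboid, a 3x4 matrix, tuple (1, 2).
  std::cout << BroadcastDimsCompatible({2, 3, 4}, {3, 4}, {1, 2}) << "\n";  // 1
  // Mapping the matrix to dimensions 0 and 1 instead does not line up.
  std::cout << BroadcastDimsCompatible({2, 3, 4}, {3, 4}, {0, 1}) << "\n";  // 0
}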
diff --git a/tensorflow/compiler/mlir/xla/ir/lhlo_ops.cc b/tensorflow/compiler/mlir/xla/ir/lhlo_ops.cc index 7fb0e1c0831..680a73e49c5 100644 --- a/tensorflow/compiler/mlir/xla/ir/lhlo_ops.cc +++ b/tensorflow/compiler/mlir/xla/ir/lhlo_ops.cc @@ -60,11 +60,11 @@ XlaLhloDialect::XlaLhloDialect(MLIRContext *context) // TODO(cheshire): Support folding, reuse code from hlo_ops.cc. -void FusionOp::build(Builder *builder, OperationState &result, +void FusionOp::build(OpBuilder &builder, OperationState &result, ArrayRef attributes) { result.addAttributes(attributes); Region *bodyRegion = result.addRegion(); - FusionOp::ensureTerminator(*bodyRegion, *builder, result.location); + FusionOp::ensureTerminator(*bodyRegion, builder, result.location); } } // namespace xla_lhlo diff --git a/tensorflow/compiler/mlir/xla/ir/lhlo_ops.h b/tensorflow/compiler/mlir/xla/ir/lhlo_ops.h index 8a3f833c7f4..1c4ccaae214 100644 --- a/tensorflow/compiler/mlir/xla/ir/lhlo_ops.h +++ b/tensorflow/compiler/mlir/xla/ir/lhlo_ops.h @@ -27,8 +27,7 @@ limitations under the License. #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/IR/Types.h" // from @llvm-project -#include "mlir/Interfaces/SideEffects.h" // from @llvm-project -#include "mlir/Support/Functional.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project namespace mlir { class OpBuilder; diff --git a/tensorflow/compiler/mlir/xla/ir/lhlo_ops.td b/tensorflow/compiler/mlir/xla/ir/lhlo_ops.td index 7613f1e0ffc..db75bbd1f67 100644 --- a/tensorflow/compiler/mlir/xla/ir/lhlo_ops.td +++ b/tensorflow/compiler/mlir/xla/ir/lhlo_ops.td @@ -19,7 +19,7 @@ limitations under the License. #define LHLO_OPS include "mlir/IR/OpBase.td" -include "mlir/Interfaces/SideEffects.td" +include "mlir/Interfaces/SideEffectInterfaces.td" include "tensorflow/compiler/mlir/xla/ir/hlo_ops_base.td" def LHLO_Dialect : Dialect { @@ -37,13 +37,12 @@ def LHLO_IntBuffer : MemRefOf<[HLO_Int]>; // Any floating-point tensor types def LHLO_FpBuffer : MemRefOf<[AnyFloat]>; - def LHLO_PredBuffer : MemRefOf<[HLO_Pred]>; // Any integer or floating-point tensor types def LHLO_IntOrFpBuffer : MemRefOf<[HLO_Int, AnyFloat]>; -def LHLO_Buffer : MemRefOf<[AnyFloat, AnySignlessInteger]>; +def LHLO_Buffer : MemRefOf<[AnyFloat, AnySignlessInteger, AnyComplex]>; def LHLO_TupleBuffer : NestedTupleOf<[LHLO_Buffer]>; @@ -93,21 +92,34 @@ def LHLO_CosOp: LHLO_UnaryElementwiseOp<"cosine">, BASE_HLO_CosOp; def LHLO_ExpOp: LHLO_UnaryElementwiseOp<"exponential">, BASE_HLO_ExpOp; +def LHLO_ImagOp: LHLO_Op<"imag", [SameOperandsShape]>, BASE_HLO_ImagOp { + let arguments = (ins Arg:$input, + Arg:$output); +} + def LHLO_LogOp: LHLO_UnaryElementwiseOp<"log">, BASE_HLO_LogOp; def LHLO_NegOp: LHLO_UnaryElementwiseOp<"negate">, BASE_HLO_NegOp; +def LHLO_RealOp: LHLO_Op<"real", [SameOperandsShape]>, BASE_HLO_RealOp { + let arguments = (ins Arg:$input, + Arg:$output); +} + def LHLO_RsqrtOp: LHLO_UnaryElementwiseOp<"rsqrt">, BASE_HLO_RsqrtOp; def LHLO_SqrtOp: LHLO_UnaryElementwiseOp<"sqrt">, BASE_HLO_SqrtOp; def LHLO_SignOp: LHLO_UnaryElementwiseOp<"sign">, BASE_HLO_SignOp; +def LHLO_SinOp: LHLO_UnaryElementwiseOp<"sine">, BASE_HLO_SinOp; + def LHLO_TanhOp: LHLO_UnaryElementwiseOp<"tanh">, BASE_HLO_TanhOp; //===----------------------------------------------------------------------===// // XLA binary elementwise op definitions. 
//===----------------------------------------------------------------------===// +// See https://www.tensorflow.org/xla/operation_semantics#element-wise_binary_arithmetic_operations class LHLO_BinaryElementwiseOp traits> : LHLO_Op { @@ -121,6 +133,12 @@ class LHLO_BinaryElementwiseOp traits> : def LHLO_AddOp : LHLO_BinaryElementwiseOp<"add", []>, BASE_HLO_AddOp; +def LHLO_ComplexOp: LHLO_Op<"complex", [SameOperandsShape]>, BASE_HLO_ComplexOp { + let arguments = (ins Arg:$lhs, + Arg:$rhs, + Arg:$output); +} + def LHLO_DivOp : LHLO_BinaryElementwiseOp<"divide", []>, BASE_HLO_DivOp; def LHLO_MaxOp : LHLO_BinaryElementwiseOp<"maximum", []>, BASE_HLO_MaxOp; @@ -402,7 +420,7 @@ def FusionOp : LHLO_Op<"fusion", [SingleBlockImplicitTerminator<"TerminatorOp">] let skipDefaultBuilders = 1; let builders = [ - OpBuilder<"Builder *builder, OperationState &result, " + OpBuilder<"OpBuilder &builder, OperationState &result, " "ArrayRef attributes"> ]; } diff --git a/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.cc b/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.cc index 8bf036224ba..774caab77fb 100644 --- a/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.cc +++ b/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.cc @@ -18,11 +18,13 @@ limitations under the License. #include "llvm/Support/raw_ostream.h" #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "tensorflow/compiler/mlir/xla/attribute_importer.h" #include "tensorflow/compiler/mlir/xla/hlo_utils.h" #include "tensorflow/compiler/mlir/xla/ir/hlo_ops.h" #include "tensorflow/compiler/mlir/xla/type_to_shape.h" #include "tensorflow/compiler/xla/comparison_util.h" #include "tensorflow/compiler/xla/service/shape_inference.h" +#include "tensorflow/compiler/xla/util.h" namespace xla { @@ -54,6 +56,20 @@ static mlir::DenseIntElementsAttr GetI64ElementsAttr( return mlir::DenseIntElementsAttr::get(ty, mlir_values); } +static mlir::DenseIntElementsAttr ConvertPadding( + absl::Span> padding, + mlir::Builder* builder) { + llvm::SmallVector elements; + elements.reserve(padding.size() * 2); + for (const auto& vals : padding) { + elements.push_back(vals.first); + elements.push_back(vals.second); + } + auto ty = mlir::RankedTensorType::get( + {static_cast(padding.size()), 2}, builder->getIntegerType(64)); + return mlir::DenseIntElementsAttr::get(ty, elements); +} + MlirHloBuilder::~MlirHloBuilder() = default; StatusOr MlirHloBuilder::MakeXlaOp(mlir::Value val) { @@ -77,6 +93,76 @@ XlaOp MlirHloBuilder::ConstantLiteral(const LiteralSlice& literal) { }); } +StatusOr MlirHloBuilder::ConvGeneralDilatedInternal( + const Shape& shape, XlaOp lhs, XlaOp rhs, const Window& window, + absl::Span window_strides, + absl::Span> padding, + absl::Span lhs_dilation, absl::Span rhs_dilation, + const ConvolutionDimensionNumbers& dimension_numbers, + int64 feature_group_count, int64 batch_group_count, + const PrecisionConfig* precision_config) { + TF_ASSIGN_OR_RETURN(mlir::Type ty, ConvertShapeToType( + shape, builder_)); + mlir::ArrayAttr config_attr; + if (precision_config) + config_attr = ConvertPrecisionConfig(precision_config, &builder_); + auto op = builder_.create( + loc_, ty, GetValue(lhs), GetValue(rhs), + GetI64ElementsAttr(window_strides, &builder_), + ConvertPadding(padding, &builder_), + GetI64ElementsAttr(lhs_dilation, &builder_), + GetI64ElementsAttr(rhs_dilation, &builder_), + ConvertConvDimensionNumbers(dimension_numbers, &builder_), + builder_.getI64IntegerAttr(feature_group_count), + 
builder_.getI64IntegerAttr(batch_group_count), config_attr); + return MakeXlaOp(op); +} + +StatusOr MlirHloBuilder::TransposeInternal( + const Shape& shape, XlaOp operand, absl::Span permutation) { + TF_ASSIGN_OR_RETURN(mlir::Type ty, ConvertShapeToType( + shape, builder_)); + auto op = builder_.create( + loc_, ty, GetValue(operand), GetI64ElementsAttr(permutation, &builder_)); + return MakeXlaOp(op); +} + +StatusOr MlirHloBuilder::GatherInternal( + const Shape& shape, XlaOp input, XlaOp start_indices, + const GatherDimensionNumbers& dimension_numbers, + absl::Span slice_sizes, bool indices_are_sorted) { + TF_ASSIGN_OR_RETURN(mlir::Type ty, ConvertShapeToType( + shape, builder_)); + auto op = builder_.create( + loc_, ty, GetValue(input), GetValue(start_indices), + ConvertGatherDimensionNumbers(dimension_numbers, &builder_), + GetI64ElementsAttr(slice_sizes, &builder_)); + return MakeXlaOp(op); +} + +StatusOr MlirHloBuilder::RngOpInternal( + RandomDistribution distribution, absl::Span parameters, + const Shape& shape) { + // TODO(hinsu): Introduce RngOp in the HLO dialect in MLIR and then RngUniform + // and RngNormal can be mapped to the new op. + std::string op_name; + if (distribution == xla::RandomDistribution::RNG_UNIFORM) { + op_name = "xla_hlo.rng_uniform"; + } else { + TF_RET_CHECK(distribution == xla::RandomDistribution::RNG_NORMAL) + << "Unexpected distribution: " << distribution; + op_name = "xla_hlo.rng_normal"; + } + + if (shape.is_dynamic()) + return Unimplemented("RngOp with dynamic dims not supported"); + llvm::SmallVector operands; + operands.append(parameters.begin(), parameters.end()); + operands.push_back( + ConstantLiteral(LiteralUtil::CreateR1(shape.dimensions()))); + return CreateOp(op_name, shape, operands); +} + StatusOr MlirHloBuilder::ReshapeInternal(const Shape& shape, XlaOp operand, int64 inferred_dimension) { @@ -91,6 +177,19 @@ StatusOr MlirHloBuilder::ReshapeInternal(const Shape& shape, return MakeXlaOp(op.getResult()); } +StatusOr MlirHloBuilder::DotGeneralInternal( + const Shape& shape, XlaOp lhs, XlaOp rhs, + const DotDimensionNumbers& dimension_number, + const PrecisionConfig* precision_config) { + TF_ASSIGN_OR_RETURN(mlir::Type ty, ConvertShapeToType( + shape, builder_)); + auto op = builder_.create( + loc_, ty, GetValue(lhs), GetValue(rhs), + ConvertDotDimensionNumbers(dimension_number, &builder_), + ConvertPrecisionConfig(precision_config, &builder_)); + return MakeXlaOp(op.getResult()); +} + StatusOr MlirHloBuilder::InDimBroadcast( const Shape& shape, XlaOp operand, absl::Span broadcast_dimensions) { @@ -110,7 +209,6 @@ StatusOr MlirHloBuilder::Compare(const Shape& shape, XlaOp lhs, shape, builder_)); auto op = builder_.create( loc_, ty, GetValue(lhs), GetValue(rhs), - /*broadcast_dimensions=*/mlir::DenseIntElementsAttr(), builder_.getStringAttr(ComparisonDirectionToString(direction))); return MakeXlaOp(op.getResult()); } @@ -118,15 +216,120 @@ StatusOr MlirHloBuilder::Compare(const Shape& shape, XlaOp lhs, XlaOp MlirHloBuilder::BinaryOpNoBroadcast(HloOpcode binop, const Shape& shape, XlaOp lhs, XlaOp rhs) { return ReportErrorOrReturn([&]() -> StatusOr { - return CreateOp(GetMlirOpName(binop), shape, {lhs, rhs}, /*attributes=*/{}); + return CreateOp(GetMlirOpName(binop), shape, {lhs, rhs}); }); } StatusOr MlirHloBuilder::AddOpWithShape( HloOpcode opcode, const Shape& shape, absl::Span operands) { return CreateOp(GetMlirOpName(opcode), shape, - llvm::makeArrayRef(operands.data(), operands.size()), - /*attributes=*/{}); + 
llvm::makeArrayRef(operands.data(), operands.size())); +} + +XlaOp MlirHloBuilder::CreateToken() { + return ReportErrorOrReturn([&]() -> StatusOr { + return MakeXlaOp(builder_.create( + loc_, mlir::xla_hlo::TokenType::get(builder_.getContext()))); + }); +} + +StatusOr MlirHloBuilder::InfeedWithTokenInternal( + const Shape& infeed_instruction_shape, XlaOp token, const string& config) { + TF_ASSIGN_OR_RETURN(mlir::Type result_type, + ConvertShapeToType( + infeed_instruction_shape, builder_)); + return MakeXlaOp(builder_.create( + loc_, result_type, GetValue(token), + /*infeed_config=*/config)); +} + +StatusOr MlirHloBuilder::OutfeedWithTokenInternal( + XlaOp operand, XlaOp token, const Shape& shape_with_layout, + const string& outfeed_config) { + auto token_type = mlir::xla_hlo::TokenType::get(builder_.getContext()); + return MakeXlaOp(builder_.create( + loc_, token_type, GetValue(operand), GetValue(token), outfeed_config)); +} + +StatusOr MlirHloBuilder::ConcatInDimInternal( + const Shape& shape, absl::Span operands, int64 dimension) { + TF_ASSIGN_OR_RETURN( + mlir::Type result_type, + ConvertShapeToType(shape, builder_)); + auto mlir_operands = GetValues(operands); + return MakeXlaOp(builder_.create( + loc_, result_type, mlir_operands, builder_.getI64IntegerAttr(dimension))); +} + +StatusOr MlirHloBuilder::GetTupleElementInternal(const Shape& shape, + XlaOp tuple_data, + int64 index) { + TF_ASSIGN_OR_RETURN( + mlir::Type result_type, + ConvertShapeToType(shape, builder_)); + return MakeXlaOp(builder_.create( + loc_, result_type, GetValue(tuple_data), + builder_.getI32IntegerAttr(index))); +} + +StatusOr MlirHloBuilder::SliceInternal( + const Shape& shape, XlaOp operand, absl::Span start_indices, + absl::Span limit_indices, absl::Span strides) { + return MakeXlaOp(builder_.create( + loc_, GetValue(operand), GetI64ElementsAttr(start_indices, &builder_), + GetI64ElementsAttr(limit_indices, &builder_), + GetI64ElementsAttr(strides, &builder_))); +} + +StatusOr MlirHloBuilder::DynamicSliceInternal( + const Shape& shape, XlaOp operand, absl::Span start_indices, + absl::Span slice_sizes) { + TF_ASSIGN_OR_RETURN( + mlir::Type result_ty, + ConvertShapeToType(shape, builder_)); + return MakeXlaOp(builder_.create( + loc_, result_ty, GetValue(operand), GetValues(start_indices), + GetI64ElementsAttr(slice_sizes, &builder_))); +} + +StatusOr MlirHloBuilder::DynamicUpdateSliceInternal( + const Shape& shape, XlaOp operand, XlaOp update, + absl::Span start_indices) { + TF_ASSIGN_OR_RETURN( + mlir::Type result_ty, + ConvertShapeToType(shape, builder_)); + return MakeXlaOp(builder_.create( + loc_, result_ty, GetValue(operand), GetValue(update), + GetValues(start_indices))); +} + +StatusOr MlirHloBuilder::PadInternal( + const Shape& shape, XlaOp operand, XlaOp padding_value, + const PaddingConfig& padding_config) { + TF_ASSIGN_OR_RETURN( + mlir::Type result_type, + ConvertShapeToType(shape, builder_)); + std::vector low; + std::vector high; + std::vector internal; + for (auto& dimension : padding_config.dimensions()) { + low.push_back(dimension.edge_padding_low()); + high.push_back(dimension.edge_padding_high()); + internal.push_back(dimension.interior_padding()); + } + return MakeXlaOp(builder_.create( + loc_, result_type, GetValue(operand), GetValue(padding_value), + GetI64ElementsAttr(low, &builder_), GetI64ElementsAttr(high, &builder_), + GetI64ElementsAttr(internal, &builder_))); +} + +StatusOr MlirHloBuilder::TupleInternal( + const Shape& shape, absl::Span elements) { + mlir::SmallVector operands; + for 
(auto& element : elements) { + operands.push_back(GetValue(element)); + } + return MakeXlaOp(builder_.create(loc_, operands)); } StatusOr MlirHloBuilder::CreateOp( diff --git a/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.h b/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.h index 85345621677..fc5baaee44d 100644 --- a/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.h +++ b/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.h @@ -54,6 +54,9 @@ class MlirHloBuilder : public XlaBuilder { // TODO(hinsu): Add a constructor to build a new MLIR function from scratch // and override Build methods. + MlirHloBuilder(std::string name, mlir::OpBuilder builder, mlir::Location loc) + : XlaBuilder(name), builder_(builder), loc_(loc) {} + MlirHloBuilder(const MlirHloBuilder&) = delete; MlirHloBuilder& operator=(const MlirHloBuilder&) = delete; @@ -75,6 +78,17 @@ class MlirHloBuilder : public XlaBuilder { return mlir::Value::getFromOpaquePointer(ptr); } + // Returns MLIR values corresponding to the given XLA ops. + // + // Requires that the ops were created by this builder. + std::vector GetValues(absl::Span ops) { + std::vector values; + for (auto xla_op : ops) { + values.push_back(GetValue(xla_op)); + } + return values; + } + // Sets location for newly built ops, until reset. void SetLocation(mlir::Location loc) { loc_ = loc; } @@ -87,12 +101,46 @@ class MlirHloBuilder : public XlaBuilder { // Returns the shape of the given op. StatusOr GetShapePtr(XlaOp op) const override; + // Creates the given op at the current location. + template + OpTy create(Args&&... args) { + return builder_.create(loc_, std::forward(args)...); + } + private: XlaOp ConstantLiteral(const LiteralSlice& literal) override; + StatusOr ConvGeneralDilatedInternal( + const Shape& shape, XlaOp lhs, XlaOp rhs, const Window& window, + absl::Span window_strides, + absl::Span> padding, + absl::Span lhs_dilation, + absl::Span rhs_dilation, + const ConvolutionDimensionNumbers& dimension_numbers, + int64 feature_group_count, int64 batch_group_count, + const PrecisionConfig* precision_config) override; + + StatusOr TransposeInternal( + const Shape& shape, XlaOp operand, + absl::Span permutation) override; + + StatusOr GatherInternal( + const Shape& shape, XlaOp input, XlaOp start_indices, + const GatherDimensionNumbers& dimension_numbers, + absl::Span slice_sizes, bool indices_are_sorted) override; + + StatusOr RngOpInternal(RandomDistribution distribution, + absl::Span parameters, + const Shape& shape) override; + StatusOr ReshapeInternal(const Shape& shape, XlaOp operand, int64 inferred_dimension) override; + StatusOr DotGeneralInternal( + const Shape& shape, XlaOp lhs, XlaOp rhs, + const DotDimensionNumbers& dimension_number, + const PrecisionConfig* precision_config) override; + StatusOr InDimBroadcast( const Shape& shape, XlaOp operand, absl::Span broadcast_dimensions) override; @@ -106,10 +154,47 @@ class MlirHloBuilder : public XlaBuilder { StatusOr AddOpWithShape(HloOpcode opcode, const Shape& shape, absl::Span operands) override; + XlaOp CreateToken() override; + + StatusOr InfeedWithTokenInternal(const Shape& infeed_instruction_shape, + XlaOp token, + const string& config) override; + StatusOr OutfeedWithTokenInternal( + XlaOp operand, XlaOp token, const Shape& shape_with_layout, + const string& outfeed_config) override; + + StatusOr ConcatInDimInternal(const Shape& shape, + absl::Span operands, + int64 dimension) override; + + StatusOr GetTupleElementInternal(const Shape& shape, XlaOp tuple_data, + int64 index) override; + + 
StatusOr SliceInternal(const Shape& shape, XlaOp operand, + absl::Span start_indices, + absl::Span limit_indices, + absl::Span strides) override; + + StatusOr DynamicSliceInternal( + const Shape& shape, XlaOp operand, absl::Span start_indices, + absl::Span slice_sizes) override; + + StatusOr DynamicUpdateSliceInternal( + const Shape& shape, XlaOp operand, XlaOp update, + absl::Span start_indices) override; + + StatusOr PadInternal(const Shape& shape, XlaOp operand, + XlaOp padding_value, + const PaddingConfig& padding_config) override; + + StatusOr TupleInternal(const Shape& shape, + absl::Span elements) override; + // Creates HLO dialect op and returns the result as an XlaOp. - StatusOr CreateOp(const std::string& op_name, const Shape& shape, - llvm::ArrayRef operands, - llvm::ArrayRef attributes); + StatusOr CreateOp( + const std::string& op_name, const Shape& shape, + llvm::ArrayRef operands, + llvm::ArrayRef attributes = {}); mlir::OpBuilder builder_; mlir::Location loc_; diff --git a/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc b/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc index 6d87dc8e603..9e30d830602 100644 --- a/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc +++ b/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc @@ -56,6 +56,7 @@ limitations under the License. #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/platform/errors.h" #include "tensorflow/stream_executor/lib/statusor.h" using ::stream_executor::port::StatusOr; @@ -612,7 +613,12 @@ LogicalResult ExportXlaOp(DynamicBroadcastInDimOp op, OpLoweringContext ctx) { return failure(); } -LogicalResult ExportXlaOp(ConditionalOp op, OpLoweringContext ctx) { +LogicalResult ExportXlaOp(DynamicReshapeOp op, OpLoweringContext ctx) { + // This op has no expression in the legacy export format. + return failure(); +} + +LogicalResult ExportXlaOp(IfOp op, OpLoweringContext ctx) { xla::XlaComputation true_branch; xla::XlaComputation false_branch; auto& value_map = *ctx.values; @@ -901,8 +907,12 @@ LogicalResult ExportXlaOp(WhileOp op, OpLoweringContext ctx) { namespace mlir { namespace { -StatusOr CreateLiteralFromAttr(Type type, ElementsAttr attr) { - xla::Shape shape = xla::TypeToShape(type); +StatusOr CreateLiteralFromAttr(ElementsAttr attr) { + if (attr.isa()) + return tensorflow::errors::Unimplemented( + "Opaque elements attr not supported"); + + xla::Shape shape = xla::TypeToShape(attr.getType()); #define ELEMENTS_ATTR_TO_LITERAL(xla_type, cpp_type) \ case xla_type: { \ @@ -919,11 +929,27 @@ StatusOr CreateLiteralFromAttr(Type type, ElementsAttr attr) { ELEMENTS_ATTR_TO_LITERAL(xla::PrimitiveType::S16, int16) ELEMENTS_ATTR_TO_LITERAL(xla::PrimitiveType::S32, int32) ELEMENTS_ATTR_TO_LITERAL(xla::PrimitiveType::S64, int64) - // TODO(b/130356985): Update once MLIR supports unsigned integers. 
ELEMENTS_ATTR_TO_LITERAL(xla::PrimitiveType::U8, uint8) ELEMENTS_ATTR_TO_LITERAL(xla::PrimitiveType::U16, uint16) ELEMENTS_ATTR_TO_LITERAL(xla::PrimitiveType::U32, uint32) ELEMENTS_ATTR_TO_LITERAL(xla::PrimitiveType::U64, uint64) + ELEMENTS_ATTR_TO_LITERAL(xla::PrimitiveType::C64, std::complex) + ELEMENTS_ATTR_TO_LITERAL(xla::PrimitiveType::C128, std::complex) + case xla::PrimitiveType::F16: { + llvm::SmallVector values; + values.reserve(attr.getNumElements()); + for (APFloat val : attr.getValues()) { + bool loses_info = false; + CHECK_EQ(val.convert(llvm::APFloat::IEEEsingle(), + llvm::APFloat::rmTowardZero, &loses_info), + llvm::APFloat::opOK); + CHECK(!loses_info); + values.push_back(xla::half(val.convertToFloat())); + } + xla::Array source_data(shape.dimensions()); + source_data.SetValues(values); + return xla::LiteralUtil::CreateFromArray(source_data); + } case xla::PrimitiveType::BF16: { xla::Array source_data(shape.dimensions()); auto attr_values = attr.getValues(); @@ -960,11 +986,26 @@ LogicalResult ConvertToHloModule::Lower( return LowerFunctionCall(&call_op, builder, &value_map); } + if (auto op = dyn_cast(inst)) { + Value operand = op.getOperand(); + auto ty = operand.getType().dyn_cast(); + // If this was a cast from a static shaped tensors, then it is a noop for + // export to HLO and we can use the operand. + if (!ty || !ty.hasStaticShape()) { + inst->emitOpError() + << "requires static shaped operand for HLO translation"; + return failure(); + } + + value_map[op.getResult()] = value_map[operand]; + return success(); + } + // TODO(jpienaar): This doesn't support layouts yet. if (matchPattern(inst, m_Constant(&const_attr))) { - auto literal_or = - CreateLiteralFromAttr(*inst->result_type_begin(), const_attr); - if (!literal_or.ok()) return inst->emitError("unsupported elemental type"); + auto literal_or = CreateLiteralFromAttr(const_attr); + if (!literal_or.ok()) + return inst->emitError(literal_or.status().ToString()); value_map[inst->getResult(0)] = xla::ConstantLiteral(builder, literal_or.ValueOrDie()); return success(); @@ -1022,8 +1063,7 @@ LogicalResult ConvertToHloModule::Lower( return success(); } - inst->emitError("unable to lower operation of type '" + - inst->getName().getStringRef().str() + '\''); + inst->emitOpError() << "can't be translated to XLA HLO"; return failure(); } diff --git a/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.h b/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.h index 1a341b00d0c..8bfe4c76b04 100644 --- a/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.h +++ b/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.h @@ -31,16 +31,16 @@ namespace mlir { // are converted to a tuple even when there is only a single return value. // Multiple return values are always converted to a tuple and returned as a // single value. -Status ConvertMlirHloToHlo(mlir::ModuleOp module, xla::HloProto* hlo_proto, +Status ConvertMlirHloToHlo(mlir::ModuleOp module, ::xla::HloProto* hlo_proto, bool use_tuple_args, bool return_tuple, const tensorflow::XlaCompiler::ShapeRepresentationFn shape_representation_fn = nullptr); // Creates XlaOp equivalent of a given MLIR operation using the operand info // from `value_lowering` map. 
-llvm::Optional CreateXlaOperator( +llvm::Optional<::xla::XlaOp> CreateXlaOperator( mlir::Operation* op, - llvm::DenseMap* value_lowering); + llvm::DenseMap* value_lowering); } // namespace mlir diff --git a/tensorflow/compiler/mlir/xla/operator_writer_gen.cc b/tensorflow/compiler/mlir/xla/operator_writer_gen.cc index acb7af50996..44af7ca75bb 100644 --- a/tensorflow/compiler/mlir/xla/operator_writer_gen.cc +++ b/tensorflow/compiler/mlir/xla/operator_writer_gen.cc @@ -15,6 +15,7 @@ limitations under the License. #include +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Sequence.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringMap.h" @@ -26,13 +27,12 @@ limitations under the License. #include "llvm/TableGen/Main.h" #include "llvm/TableGen/Record.h" #include "llvm/TableGen/TableGenBackend.h" -#include "mlir/Support/STLExtras.h" // from @llvm-project #include "mlir/TableGen/Operator.h" // from @llvm-project +using llvm::interleaveComma; using llvm::raw_ostream; using llvm::RecordKeeper; using llvm::StringRef; -using mlir::interleaveComma; using mlir::tblgen::Attribute; using mlir::tblgen::NamedAttribute; using mlir::tblgen::NamedTypeConstraint; diff --git a/tensorflow/compiler/mlir/xla/tests/BUILD b/tensorflow/compiler/mlir/xla/tests/BUILD index 989b846f561..ad69383bd98 100644 --- a/tensorflow/compiler/mlir/xla/tests/BUILD +++ b/tensorflow/compiler/mlir/xla/tests/BUILD @@ -1,4 +1,5 @@ load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests") +load("//tensorflow:tensorflow.bzl", "tf_cc_test") package(licenses = ["notice"]) @@ -18,3 +19,18 @@ filegroup( "@llvm-project//llvm:FileCheck", ], ) + +tf_cc_test( + name = "mlir_hlo_builder_test", + srcs = ["mlir_hlo_builder_test.cc"], + deps = [ + "//tensorflow/compiler/mlir/xla:hlo", + "//tensorflow/compiler/mlir/xla:mlir_hlo_builder", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core:testlib", + "@llvm-project//llvm:support", + "@llvm-project//mlir:IR", + ], +) diff --git a/tensorflow/compiler/mlir/xla/tests/buffer-assignment.mlir b/tensorflow/compiler/mlir/xla/tests/buffer-assignment.mlir index 866e7218de0..ad007d0eb50 100644 --- a/tensorflow/compiler/mlir/xla/tests/buffer-assignment.mlir +++ b/tensorflow/compiler/mlir/xla/tests/buffer-assignment.mlir @@ -1,89 +1,156 @@ -// RUN: xla-opt -test-buffer-assignment -split-input-file %s | FileCheck %s -dump-input-on-failure +// RUN: tf-opt -test-buffer-assignment -allow-unregistered-dialect -split-input-file %s | FileCheck %s -dump-input-on-failure -// CHECK-LABEL: Testing : condBranch -func @condBranch(%cond : i1, %arg0 : tensor<2xf32>) -> tensor<2xf32>{ - // CHECK: Alloc: cond_br +// CHECK-LABEL: func @func_signature_conversion +func @func_signature_conversion(%arg0: tensor<4x8xf32>) { + return +} +// CHECK: ({{.*}}: memref<4x8xf32>) { + +// ----- + +// CHECK-LABEL: func @non_void_to_void_return_op_converter +func @non_void_to_void_return_op_converter(%arg0: tensor<4x8xf32>) -> tensor<4x8xf32> { + return %arg0 : tensor<4x8xf32> +} +// CHECK: (%[[ARG0:.*]]: [[TYPE:.*]]<[[RANK:.*]]>, %[[RESULT:.*]]: [[TYPE]]<[[RANK]]>) { +// CHECK-NEXT: "buffer_assignment_test.copy"(%[[ARG0]], %[[RESULT]]) +// CHECK-NEXT: return + +// ----- + +// CHECK-LABEL: func @func_and_block_signature_conversion +func @func_and_block_signature_conversion(%arg0 : tensor<2xf32>, %cond : i1, %arg1: tensor<4x4xf32>) -> tensor<4x4xf32>{ cond_br %cond, ^bb1, ^bb2 ^bb1: br ^exit(%arg0 : tensor<2xf32>) ^bb2: - %1 = 
"xla_hlo.exponential"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> + br ^exit(%arg0 : tensor<2xf32>) + ^exit(%arg2: tensor<2xf32>): + return %arg1 : tensor<4x4xf32> +} +// CHECK: (%[[ARG0:.*]]: [[ARG0_TYPE:.*]], %[[COND:.*]]: i1, %[[ARG1:.*]]: [[ARG1_TYPE:.*]], %[[RESULT:.*]]: [[RESULT_TYPE:.*]]) { +// CHECK: br ^[[EXIT_BLOCK:.*]](%[[ARG0]] : [[ARG0_TYPE]]) +// CHECK: br ^[[EXIT_BLOCK]](%[[ARG0]] : [[ARG0_TYPE]]) +// CHECK: ^[[EXIT_BLOCK]](%{{.*}}: [[ARG0_TYPE]]) +// CHECK-NEXT: "buffer_assignment_test.copy"(%[[ARG1]], %[[RESULT]]) +// CHECK-NEXT: return + +// ----- + +// CHECK-LABEL: func @condBranch +func @condBranch(%cond : i1, %arg0 : tensor<2xf32>) -> tensor<2xf32>{ + cond_br %cond, ^bb1, ^bb2 + ^bb1: + br ^exit(%arg0 : tensor<2xf32>) + ^bb2: + %1 = "buffer_assignment_test.unary"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> br ^exit(%1 : tensor<2xf32>) ^exit(%arg1: tensor<2xf32>): return %arg1 : tensor<2xf32> - // CHECK-NEXT: Dealloc: return + } +// CHECK-NEXT: %[[ALLOC:.*]] = alloc() +// CHECK-NEXT: cond_br +// CHECK: "buffer_assignment_test.copy +// CHECK-NEXT: dealloc %[[ALLOC]] +// CHECK-NEXT: return // ----- -// CHECK-LABEL: Testing : criticalEdge +// CHECK-LABEL: func @emptyUsesValue +func @emptyUsesValue(%arg0: memref<4xf32>) { + %0 = alloc() : memref<4xf32> + return +} +// CHECK-NEXT: %[[ALLOC:.*]] = alloc() +// CHECK-NEXT: dealloc %[[ALLOC]] +// CHECK-NEXT: return + +// ----- + +// CHECK-LABEL: func @criticalEdge func @criticalEdge(%cond : i1, %arg0 : tensor<2xf32>) -> tensor<2xf32>{ - // CHECK: Alloc: cond_br cond_br %cond, ^bb1, ^exit(%arg0 : tensor<2xf32>) ^bb1: - %0 = "xla_hlo.exponential"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> + %0 = "buffer_assignment_test.unary"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> br ^exit(%0 : tensor<2xf32>) ^exit(%arg1: tensor<2xf32>): return %arg1 : tensor<2xf32> - // CHECK-NEXT: Dealloc: return } +// CHECK-NEXT: %[[ALLOC:.*]] = alloc() +// CHECK-NEXT: cond_br +// CHECK: "buffer_assignment_test.copy +// CHECK-NEXT: dealloc %[[ALLOC]] +// CHECK-NEXT: return // ----- -// CHECK-LABEL: Testing : invCriticalEdge +// CHECK-LABEL: func @invCriticalEdge func @invCriticalEdge(%cond : i1, %arg0 : tensor<2xf32>) -> tensor<2xf32>{ - // CHECK: Alloc: %0 = "xla_hlo.exponential" - %0 = "xla_hlo.exponential"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> + %0 = "buffer_assignment_test.unary"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> cond_br %cond, ^bb1, ^exit(%arg0 : tensor<2xf32>) ^bb1: br ^exit(%0 : tensor<2xf32>) ^exit(%arg1: tensor<2xf32>): return %arg1 : tensor<2xf32> - // CHECK-NEXT: Dealloc: return } +// CHECK-NEXT: %[[ALLOC:.*]] = alloc() +// CHECK-NEXT: "buffer_assignment_test.unary_lowered" +// CHECK: "buffer_assignment_test.copy +// CHECK-NEXT: dealloc %[[ALLOC]] +// CHECK-NEXT: return // ----- -// CHECK-LABEL: Testing : ifElse +// CHECK-LABEL: func @ifElse func @ifElse(%cond : i1, %arg0 : tensor<2xf32>) -> tensor<2xf32>{ - // CHECK: Alloc: %0 = "xla_hlo.exponential"(%arg1) - %0 = "xla_hlo.exponential"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> - cond_br %cond, ^bb1(%arg0, %0: tensor<2xf32>, tensor<2xf32>), ^bb2(%0, %arg0: tensor<2xf32>, tensor<2xf32>) + %0 = "buffer_assignment_test.unary"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> + cond_br %cond, ^bb1(%arg0, %0: tensor<2xf32>, tensor<2xf32>), + ^bb2(%0, %arg0: tensor<2xf32>, tensor<2xf32>) ^bb1(%arg1 : tensor<2xf32>, %arg2 : tensor<2xf32>): br ^exit(%arg1, %arg2 : tensor<2xf32>, tensor<2xf32>) ^bb2(%arg3 : tensor<2xf32>, %arg4 : tensor<2xf32>): br ^exit(%arg3, %arg4 : tensor<2xf32>, tensor<2xf32>) 
^exit(%arg5 : tensor<2xf32>, %arg6 : tensor<2xf32>): - // CHECK-NEXT: Dealloc: %7 = "xla_hlo.exponential"(%5) - // CHECK: Alloc: %7 = "xla_hlo.exponential"(%5) - // CHECK-NEXT: Dealloc: return - %1 = "xla_hlo.exponential"(%arg5) : (tensor<2xf32>) -> tensor<2xf32> + %1 = "buffer_assignment_test.unary"(%arg5) : (tensor<2xf32>) -> tensor<2xf32> return %1 : tensor<2xf32> } +// CHECK-NEXT: %[[FIRST_ALLOC:.*]] = alloc() +// CHECK-NEXT: "buffer_assignment_test.unary_lowered" +// CHECK: %[[SECOND_ALLOC:.*]] = alloc() +// CHECK-NEXT: "buffer_assignment_test.unary_lowered" +// CHECK-NEXT: dealloc %[[FIRST_ALLOC]] +// CHECK-NEXT: "buffer_assignment_test.copy +// CHECK-NEXT: dealloc %[[SECOND_ALLOC]] +// CHECK-NEXT: return // ----- -// CHECK-LABEL: Testing : ifElseNoUsers +// CHECK-LABEL: func @ifElseNoUsers func @ifElseNoUsers(%cond : i1, %arg0 : tensor<2xf32>) -> tensor<2xf32>{ - // CHECK: Alloc: %0 = "xla_hlo.exponential"(%arg1) - %0 = "xla_hlo.exponential"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> - cond_br %cond, ^bb1(%arg0, %0: tensor<2xf32>, tensor<2xf32>), ^bb2(%0, %arg0: tensor<2xf32>, tensor<2xf32>) + %0 = "buffer_assignment_test.unary"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> + cond_br %cond, ^bb1(%arg0, %0: tensor<2xf32>, tensor<2xf32>), + ^bb2(%0, %arg0: tensor<2xf32>, tensor<2xf32>) ^bb1(%arg1 : tensor<2xf32>, %arg2 : tensor<2xf32>): br ^exit(%arg1, %arg2 : tensor<2xf32>, tensor<2xf32>) ^bb2(%arg3 : tensor<2xf32>, %arg4 : tensor<2xf32>): br ^exit(%arg3, %arg4 : tensor<2xf32>, tensor<2xf32>) ^exit(%arg5 : tensor<2xf32>, %arg6 : tensor<2xf32>): - // CHECK-NEXT: return return %arg0 : tensor<2xf32> } +// CHECK-NEXT: %[[ALLOC:.*]] = alloc() +// CHECK-NEXT: "buffer_assignment_test.unary_lowered" +// CHECK: "buffer_assignment_test.copy +// CHECK-NEXT: dealloc %[[ALLOC]] +// CHECK-NEXT: return // ----- -// CHECK-LABEL: Testing : ifElseNested +// CHECK-LABEL: func @ifElseNested func @ifElseNested(%cond : i1, %arg0 : tensor<2xf32>) -> tensor<2xf32>{ - // CHECK: Alloc: %0 = "xla_hlo.exponential"(%arg1) - %0 = "xla_hlo.exponential"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> - cond_br %cond, ^bb1(%arg0, %0: tensor<2xf32>, tensor<2xf32>), ^bb2(%0, %arg0: tensor<2xf32>, tensor<2xf32>) + %0 = "buffer_assignment_test.unary"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> + cond_br %cond, ^bb1(%arg0, %0: tensor<2xf32>, tensor<2xf32>), + ^bb2(%0, %arg0: tensor<2xf32>, tensor<2xf32>) ^bb1(%arg1 : tensor<2xf32>, %arg2 : tensor<2xf32>): br ^exit(%arg1, %arg2 : tensor<2xf32>, tensor<2xf32>) ^bb2(%arg3 : tensor<2xf32>, %arg4 : tensor<2xf32>): @@ -93,39 +160,101 @@ func @ifElseNested(%cond : i1, %arg0 : tensor<2xf32>) -> tensor<2xf32>{ ^bb4(%arg8 : tensor<2xf32>): br ^exit(%arg3, %arg8 : tensor<2xf32>, tensor<2xf32>) ^exit(%arg5 : tensor<2xf32>, %arg6 : tensor<2xf32>): - // CHECK-NEXT: Dealloc: %9 = "xla_hlo.exponential"(%7) - // CHECK: Alloc: %9 = "xla_hlo.exponential"(%7) - // CHECK-NEXT: Dealloc: return - %1 = "xla_hlo.exponential"(%arg5) : (tensor<2xf32>) -> tensor<2xf32> + %1 = "buffer_assignment_test.unary"(%arg5) : (tensor<2xf32>) -> tensor<2xf32> return %1 : tensor<2xf32> } +// CHECK-NEXT: %[[FIRST_ALLOC:.*]] = alloc() +// CHECK-NEXT: "buffer_assignment_test.unary_lowered" +// CHECK: %[[SECOND_ALLOC:.*]] = alloc() +// CHECK-NEXT: "buffer_assignment_test.unary_lowered" +// CHECK-NEXT: dealloc %[[FIRST_ALLOC]] +// CHECK-NEXT: "buffer_assignment_test.copy +// CHECK-NEXT: dealloc %[[SECOND_ALLOC]] +// CHECK-NEXT: return // ----- -// CHECK-LABEL: Testing : redundantOperations -func @redundantOperations(%arg0: 
tensor<4xf32>, %arg1: tensor<4xf32>) { - // CHECK: Alloc: %0 = xla_hlo.maximum - // CHECK-NEXT: Dealloc: %1 = xla_hlo.add - %1 = "xla_hlo.maximum"(%arg0, %arg1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> - // CHECK: Alloc: %1 = xla_hlo.add - // CHECK-NEXT: Dealloc: %1 = xla_hlo.add - %2 = "xla_hlo.add"(%arg0, %1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> +// CHECK-LABEL: func @redundantOperations +func @redundantOperations(%arg0: tensor<4xf32>) { + %1 = "buffer_assignment_test.unary"(%arg0) : (tensor<4xf32>) -> tensor<4xf32> + %2 = "buffer_assignment_test.unary"(%1) : (tensor<4xf32>) -> tensor<4xf32> return } +// CHECK-NEXT: %[[FIRST_ALLOC:.*]] = alloc() +// CHECK-NEXT: "buffer_assignment_test.unary_lowered" +// CHECK-NEXT: %[[SECOND_ALLOC:.*]] = alloc() +// CHECK-NEXT: "buffer_assignment_test.unary_lowered" +// CHECK-NEXT: dealloc +// CHECK-NEXT: dealloc +// CHECK-NEXT: return // ----- -// CHECK-LABEL: Testing : reduce -func @reduce(%arg0: tensor<4x8xf32>) -> tensor<4x8xf32> { - // CHECK: Alloc: %0 = xla_hlo.constant - // CHECK-NEXT: Dealloc: %1 = "xla_hlo.reduce"(%arg0, %0) - %0 = xla_hlo.constant dense<0.000000e+00> : tensor - // CHECK: Alloc: %1 = "xla_hlo.reduce"(%arg0, %0) - // CHECK: Dealloc: return - %2 = "xla_hlo.reduce"(%arg0, %0) ( { - ^bb0(%arg1: tensor, %arg2: tensor): - %4 = xla_hlo.add %arg1, %arg2 : tensor - "xla_hlo.return"(%4) : (tensor) -> () - }) {dimensions = dense<1> : tensor<1xi64>} : (tensor<4x8xf32>, tensor) -> tensor<4x8xf32> - return %2 : tensor<4x8xf32> +// CHECK-LABEL: func @moving_alloc_and_inserting_missing_dealloc +func @moving_alloc_and_inserting_missing_dealloc(%cond : i1, %arg0 : memref<2xf32>, %arg1: memref<2xf32>){ + cond_br %cond, ^bb1, ^bb2 + ^bb1: + %0 = alloc() : memref<2xf32> + "buffer_assignment_test.unary_lowered"(%arg0, %0) : (memref<2xf32>, memref<2xf32>) -> () + br ^exit(%0 : memref<2xf32>) + ^bb2: + + %1 = alloc() : memref<2xf32> + "buffer_assignment_test.unary_lowered"(%arg0, %1) : (memref<2xf32>, memref<2xf32>) -> () + br ^exit(%1 : memref<2xf32>) + ^exit(%arg2: memref<2xf32>): + "bufer_assignment_test.copy"(%arg2, %arg1) : (memref<2xf32>, memref<2xf32>) -> () + return } +// CHECK-NEXT: %[[FIRST_ALLOC:.*]] = alloc() +// CHECK-NEXT: %[[SECOND_ALLOC:.*]] = alloc() +// CHECK: "bufer_assignment_test.copy" +// CHECK-NEXT: dealloc +// CHECK-NEXT: dealloc +// CHECK-NEXT: return + +// ----- + +// CHECK-LABEL: func @moving_invalid_dealloc_op_complex +func @moving_invalid_dealloc_op_complex(%cond : i1, %arg0 : memref<2xf32>, %arg1: memref<2xf32>){ + cond_br %cond, ^bb1, ^bb2 + ^bb1: + br ^exit(%arg0 : memref<2xf32>) + ^bb2: + %1 = alloc() : memref<2xf32> + "buffer_assignment_test.unary_lowered"(%arg0, %1) : (memref<2xf32>, memref<2xf32>) -> () + dealloc %1 : memref<2xf32> + br ^exit(%1 : memref<2xf32>) + ^exit(%arg2: memref<2xf32>): + "bufer_assignment_test.copy"(%arg2, %arg1) : (memref<2xf32>, memref<2xf32>) -> () + return +} +// CHECK-NEXT: %[[ALLOC:.*]] = alloc() +// CHECK: bufer_assignment_test.copy +// CHECK-NEXT: dealloc +// CHECK-NEXT: return + +// ----- + +// CHECK-LABEL: func @inserting_missing_dealloc_simple +func @inserting_missing_dealloc_simple(%arg0 : memref<2xf32>, %arg1: memref<2xf32>){ + %0 = alloc() : memref<2xf32> + "buffer_assignment_test.unary_lowered"(%arg0, %0) : (memref<2xf32>, memref<2xf32>) -> () + "bufer_assignment_test.copy"(%0, %arg1) : (memref<2xf32>, memref<2xf32>) -> () + return +} +// CHECK: bufer_assignment_test.copy +// CHECK-NEXT: dealloc + +// ----- + +// CHECK-LABEL: func @moving_invalid_dealloc_op 
+func @moving_invalid_dealloc_op(%arg0 : memref<2xf32>, %arg1: memref<2xf32>){ + %0 = alloc() : memref<2xf32> + "buffer_assignment_test.unary_lowered"(%arg0, %0) : (memref<2xf32>, memref<2xf32>) -> () + dealloc %0 : memref<2xf32> + "bufer_assignment_test.copy"(%0, %arg1) : (memref<2xf32>, memref<2xf32>) -> () + return +} +// CHECK: bufer_assignment_test.copy +// CHECK-NEXT: dealloc \ No newline at end of file diff --git a/tensorflow/compiler/mlir/xla/tests/canonicalize.mlir b/tensorflow/compiler/mlir/xla/tests/canonicalize.mlir index 1b60745b20c..30255586002 100644 --- a/tensorflow/compiler/mlir/xla/tests/canonicalize.mlir +++ b/tensorflow/compiler/mlir/xla/tests/canonicalize.mlir @@ -1,9 +1,146 @@ // RUN: xla-opt %s -pass-pipeline='func(canonicalize)' | FileCheck %s --dump-input-on-failure -func @dynamic_slice_variable_start(%arg0: tensor<3x4xi32>, %arg1: tensor<2xi64>) -> tensor<1x4xi32> { +// CHECK-LABEL: add_fold +func @add_fold() -> tensor<4xi64> { + %0 = xla_hlo.constant dense<[1, 2, 3, 4]> : tensor<4xi64> + %1 = xla_hlo.constant dense<[5, 6, 7, 8]> : tensor<4xi64> + // CHECK: xla_hlo.constant dense<[6, 8, 10, 12]> + %2 = "xla_hlo.add"(%0, %1) : (tensor<4xi64>, tensor<4xi64>) -> (tensor<4xi64>) + return %2 : tensor<4xi64> +} + +// CHECK-LABEL: add_scalar_fold +func @add_scalar_fold() -> tensor<4xi64> { + %0 = xla_hlo.constant dense<1> : tensor<4xi64> + %1 = xla_hlo.constant dense<5> : tensor<4xi64> + // CHECK: xla_hlo.constant dense<6> + %2 = "xla_hlo.add"(%0, %1) : (tensor<4xi64>, tensor<4xi64>) -> (tensor<4xi64>) + return %2 : tensor<4xi64> +} + +// CHECK-LABEL: add_fold_float +func @add_fold_float() -> tensor<4xf64> { + %0 = xla_hlo.constant dense<[1.0, 2.0, 3.0, 4.0]> : tensor<4xf64> + %1 = xla_hlo.constant dense<[5.0, 6.0, 7.0, 8.0]> : tensor<4xf64> + // CHECK: xla_hlo.constant dense<[6.000000e+00, 8.000000e+00, 1.000000e+01, 1.200000e+01]> + %2 = "xla_hlo.add"(%0, %1) : (tensor<4xf64>, tensor<4xf64>) -> (tensor<4xf64>) + return %2 : tensor<4xf64> +} + +// CHECK-LABEL: sub_scalar_fold +func @sub_scalar_fold() -> tensor<4xi64> { + %0 = xla_hlo.constant dense<5> : tensor<4xi64> + %1 = xla_hlo.constant dense<1> : tensor<4xi64> + // CHECK: xla_hlo.constant dense<4> + %2 = "xla_hlo.subtract"(%0, %1) : (tensor<4xi64>, tensor<4xi64>) -> (tensor<4xi64>) + return %2 : tensor<4xi64> +} + +// CHECK-LABEL: multiply_scalar_fold +func @multiply_scalar_fold() -> tensor<4xi64> { + %0 = xla_hlo.constant dense<5> : tensor<4xi64> + %1 = xla_hlo.constant dense<3> : tensor<4xi64> + // CHECK: xla_hlo.constant dense<15> + %2 = "xla_hlo.multiply"(%0, %1) : (tensor<4xi64>, tensor<4xi64>) -> (tensor<4xi64>) + return %2 : tensor<4xi64> +} + +// CHECK-LABEL: concatenate_noop +func @concatenate_noop(%arg0: tensor<4xi32>) -> tensor<4xi32> { + // CHECK-SAME: [[ARG:%.+]]: tensor<4xi32> + %0 = "xla_hlo.concatenate"(%arg0) { dimension = 0 : i64 } : (tensor<4xi32>) -> tensor<4xi32> + + // CHECK: return [[ARG]] + return %0 : tensor<4xi32> +} + +// CHECK-LABEL: concatenate_remove_operand +func @concatenate_remove_operand(%arg0: tensor<4xi32>, %arg1: tensor<0xi32>) -> tensor<4xi32> { + // CHECK-SAME: [[ARG0:%.+]]: tensor<4xi32> + // CHECK-SAME: [[ARG1:%.+]]: tensor<0xi32> + %0 = "xla_hlo.concatenate"(%arg0, %arg1) { dimension = 0 : i64 } : (tensor<4xi32>, tensor<0xi32>) -> tensor<4xi32> + + // CHECK: return [[ARG0]] + return %0 : tensor<4xi32> +} + +// CHECK-LABEL: concatenate_empty_bool +func @concatenate_empty_bool(%arg0: tensor<0xi1>, %arg1: tensor<0xi1>) -> tensor<0xi1> { + // CHECK: xla_hlo.constant + %0 = 
"xla_hlo.concatenate"(%arg0, %arg1) { dimension = 0 : i64 } : (tensor<0xi1>, tensor<0xi1>) -> tensor<0xi1> + + return %0 : tensor<0xi1> +} + +// CHECK-LABEL: concatenate_empty_int +func @concatenate_empty_int(%arg0: tensor<0xi32>, %arg1: tensor<0xi32>) -> tensor<0xi32> { + // CHECK: xla_hlo.constant + %0 = "xla_hlo.concatenate"(%arg0, %arg1) { dimension = 0 : i64 } : (tensor<0xi32>, tensor<0xi32>) -> tensor<0xi32> + + return %0 : tensor<0xi32> +} + +// CHECK-LABEL: concatenate_empty_float +func @concatenate_empty_float(%arg0: tensor<0xf32>, %arg1: tensor<0xf32>) -> tensor<0xf32> { + // CHECK: xla_hlo.constant + %0 = "xla_hlo.concatenate"(%arg0, %arg1) { dimension = 0 : i64 } : (tensor<0xf32>, tensor<0xf32>) -> tensor<0xf32> + + return %0 : tensor<0xf32> +} + +// CHECK-LABEL: concatenate_const_1D +func @concatenate_const_1D() -> tensor<4xi32> { + // CHECK: [[VAL:%.+]]= xla_hlo.constant dense<[0, 1, 2, 3]> + %0 = xla_hlo.constant dense<[0, 1]> : tensor<2xi32> + %1 = xla_hlo.constant dense<[2, 3]> : tensor<2xi32> + %2 = "xla_hlo.concatenate"(%0, %1) { dimension = 0 : i64 } : (tensor<2xi32>, tensor<2xi32>) -> tensor<4xi32> + + // CHECK: return [[VAL]] + return %2 : tensor<4xi32> +} + +// CHECK-LABEL: concatenate_const_1D_float +func @concatenate_const_1D_float() -> tensor<4xf32> { + // CHECK: [[VAL:%.+]] = xla_hlo.constant dense<[0.000000e+00, 1.000000e+00, 2.000000e+00, 3.000000e+00]> + + %0 = xla_hlo.constant dense<[0.0, 1.0]> : tensor<2xf32> + %1 = xla_hlo.constant dense<[2.0, 3.0]> : tensor<2xf32> + %2 = "xla_hlo.concatenate"(%0, %1) { dimension = 0 : i64 } : (tensor<2xf32>, tensor<2xf32>) -> tensor<4xf32> + + // CHECK: return [[VAL]] + return %2 : tensor<4xf32> +} + +// CHECK-LABEL: concatenate_const_2D_vertical +func @concatenate_const_2D_vertical() -> tensor<2x2xi32> { + // CHECK: [[VAL:%.+]]= xla_hlo.constant dense<[ + // CHECK-SAME: [0, 1], [2, 3] + // CHECK-SAME: ]> + %0 = xla_hlo.constant dense<[[0, 1]]> : tensor<1x2xi32> + %1 = xla_hlo.constant dense<[[2, 3]]> : tensor<1x2xi32> + %2 = "xla_hlo.concatenate"(%0, %1) { dimension = 0 : i64 } : (tensor<1x2xi32>, tensor<1x2xi32>) -> tensor<2x2xi32> + + // CHECK: return [[VAL]] + return %2 : tensor<2x2xi32> +} + +// CHECK-LABEL: concatenate_const_2D_horizontal +func @concatenate_const_2D_horizontal() -> tensor<2x2xi32> { + // CHECK: [[VAL:%.+]]= xla_hlo.constant dense<[ + // CHECK-SAME: [0, 2], [1, 3] + // CHECK-SAME: ]> + %0 = xla_hlo.constant dense<[[0], [1]]> : tensor<2x1xi32> + %1 = xla_hlo.constant dense<[[2], [3]]> : tensor<2x1xi32> + %2 = "xla_hlo.concatenate"(%0, %1) { dimension = 1 : i64 } : (tensor<2x1xi32>, tensor<2x1xi32>) -> tensor<2x2xi32> + + // CHECK: return [[VAL]] + return %2 : tensor<2x2xi32> +} + +// CHECK-LABEL: dynamic_slice_variable_start +func @dynamic_slice_variable_start(%arg0: tensor<3x4xi32>, %arg1: tensor, %arg2: tensor) -> tensor<1x4xi32> { // CHECK: "xla_hlo.dynamic-slice" - %0 = xla_hlo.constant dense<[1, 4]> : tensor<2xi64> - %1 = "xla_hlo.dynamic-slice"(%arg0, %arg1) {slice_sizes = dense<[1, 4]> : tensor<2xi64>} : (tensor<3x4xi32>, tensor<2xi64>) -> tensor<1x4xi32> + %1 = "xla_hlo.dynamic-slice"(%arg0, %arg1, %arg2) {slice_sizes = dense<[1, 4]> : tensor<2xi64>} : (tensor<3x4xi32>, tensor, tensor) -> tensor<1x4xi32> return %1 : tensor<1x4xi32> } @@ -14,23 +151,110 @@ func @dynamic_slice_constant_start(%arg0: tensor<4xi32>) -> tensor<2xi32> { // CHECK-DAG-SAME: start_indices = dense<1> : tensor<1xi64> // CHECK-DAG-SAME: strides = dense<1> : tensor<1xi64>} // CHECK: return %[[RESULT]] : tensor<2xi32> - %0 = 
xla_hlo.constant dense<1> : tensor<1xi64> - %2 = "xla_hlo.dynamic-slice"(%arg0, %0) {slice_sizes = dense<2> : tensor<1xi64>} : (tensor<4xi32>, tensor<1xi64>) -> tensor<2xi32> - return %2 : tensor<2xi32> + %0 = xla_hlo.constant dense<1> : tensor + %1 = "xla_hlo.dynamic-slice"(%arg0, %0) {slice_sizes = dense<2> : tensor<1xi64>} : (tensor<4xi32>, tensor) -> tensor<2xi32> + return %1 : tensor<2xi32> } // CHECK-LABEL: dynamic_slice_constant_start_dynamic_shape -func @dynamic_slice_constant_start_dynamic_shape(%arg0: tensor, %arg1: tensor<2xi64>) -> tensor<1x4xi32> { +func @dynamic_slice_constant_start_dynamic_shape(%arg0: tensor, %arg1: tensor<2xi64>) -> tensor { // CHECK: %[[RESULT:.*]] = "xla_hlo.slice"(%arg0) // CHECK-DAG-SAME: limit_indices = dense<[2, 4]> : tensor<2xi64> // CHECK-DAG-SAME: start_indices = dense<[1, 0]> : tensor<2xi64> // CHECK-DAG-SAME: strides = dense<1> : tensor<2xi64> - // CHECK: return %[[RESULT]] : tensor<1x4xi32> - %0 = xla_hlo.constant dense<[1, 0]> : tensor<2xi64> - %1 = "xla_hlo.dynamic-slice"(%arg0, %0) {slice_sizes = dense<[1, 4]> : tensor<2xi64>} : (tensor, tensor<2xi64>) -> tensor<1x4xi32> - return %1 : tensor<1x4xi32> + // CHECK: return %[[RESULT]] : tensor + %0 = xla_hlo.constant dense<1> : tensor + %1 = xla_hlo.constant dense<0> : tensor + %2 = "xla_hlo.dynamic-slice"(%arg0, %0, %1) {slice_sizes = dense<[1, 4]> : tensor<2xi64>} : (tensor, tensor, tensor) -> tensor + return %2 : tensor } +// CHECK-LABEL: slice_2D_noop +// CHECK-SAME: [[ARG:%.+]]: tensor<2x2xi64> +func @slice_2D_noop(%arg0: tensor<2x2xi64>) -> tensor<2x2xi64> { + %0 = "xla_hlo.slice"(%arg0) { limit_indices = dense<[2, 2]> : tensor<2xi64>, start_indices = dense<[0, 0]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<2x2xi64>) -> (tensor<2x2xi64>) + + // CHECK-NEXT: return [[ARG]] + return %0 : tensor<2x2xi64> +} + +// CHECK-LABEL: slice_1D_fold +func @slice_1D_fold() -> tensor<2xi64> { + %0 = xla_hlo.constant dense<[5, 7, 9, 10]> : tensor<4xi64> + // CHECK: xla_hlo.constant dense<[7, 9]> + %1 = "xla_hlo.slice"(%0) { limit_indices = dense<[3]> : tensor<1xi64>, start_indices = dense<[1]> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>} : (tensor<4xi64>) -> (tensor<2xi64>) + return %1 : tensor<2xi64> +} + +// CHECK-LABEL: slice_1D_fp +func @slice_1D_fp() -> tensor<2xf32> { + %0 = xla_hlo.constant dense<[5.0, 7.0, 9.0, 10.0]> : tensor<4xf32> + // CHECK: xla_hlo.constant dense<[7.000000e+00, 9.000000e+00]> + %1 = "xla_hlo.slice"(%0) { limit_indices = dense<[3]> : tensor<1xi64>, start_indices = dense<[1]> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>} : (tensor<4xf32>) -> (tensor<2xf32>) + return %1 : tensor<2xf32> +} + +// CHECK-LABEL: slice_1D_strided_fold +func @slice_1D_strided_fold() -> tensor<2xi64> { + %0 = xla_hlo.constant dense<[5, 7, 9, 10]> : tensor<4xi64> + // CHECK: xla_hlo.constant dense<[7, 10]> + %1 = "xla_hlo.slice"(%0) { limit_indices = dense<[4]> : tensor<1xi64>, start_indices = dense<[1]> : tensor<1xi64>, strides = dense<2> : tensor<1xi64>} : (tensor<4xi64>) -> (tensor<2xi64>) + return %1 : tensor<2xi64> +} + +// CHECK-LABEL: slice_2D_fold +func @slice_2D_fold() -> tensor<2x2xi64> { + %0 = xla_hlo.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi64> + // CHECK-NEXT: xla_hlo.constant dense<[ + // CHECK-SAME: [6, 7], + // CHECK-SAME: [10, 11] + // CHECK-SAME: ]> + %1 = "xla_hlo.slice"(%0) { limit_indices = dense<[3, 4]> : tensor<2xi64>, start_indices = dense<[1, 2]> : tensor<2xi64>, strides = dense<1> : 
tensor<2xi64>} : (tensor<4x4xi64>) -> (tensor<2x2xi64>) + return %1 : tensor<2x2xi64> +} + +// CHECK-LABEL: slice_2D_fold_horizontal +func @slice_2D_fold_horizontal() -> tensor<1x4xi64> { + %0 = xla_hlo.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi64> + // CHECK-NEXT: xla_hlo.constant dense<[ + // CHECK-SAME: [0, 1, 2, 3] + // CHECK-SAME: ]> + %1 = "xla_hlo.slice"(%0) { limit_indices = dense<[1, 4]> : tensor<2xi64>, start_indices = dense<[0, 0]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<4x4xi64>) -> (tensor<1x4xi64>) + return %1 : tensor<1x4xi64> +} + +// CHECK-LABEL: slice_2D_fold_vertical +func @slice_2D_fold_vertical() -> tensor<4x1xi64> { + %0 = xla_hlo.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi64> + // CHECK-NEXT: xla_hlo.constant dense<[ + // CHECK-SAME: [2], [6], [10], [14] + // CHECK-SAME: ]> + %1 = "xla_hlo.slice"(%0) { limit_indices = dense<[4, 3]> : tensor<2xi64>, start_indices = dense<[0, 2]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<4x4xi64>) -> (tensor<4x1xi64>) + return %1 : tensor<4x1xi64> +} + +// CHECK-LABEL: func @broadcast_in_dim_identity +func @broadcast_in_dim_identity(%arg0: tensor<2x3x4xf32>) -> tensor<2x3x4xf32> { + // CHECK: return %arg0 + %0 = "xla_hlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<2x3x4xf32>) -> tensor<2x3x4xf32> + return %0 : tensor<2x3x4xf32> +} + +// CHECK-LABEL: func @broadcast_in_dim_not_identity_because_it_actually_broadcasts +func @broadcast_in_dim_not_identity_because_it_actually_broadcasts(%arg0: tensor<1x2xf32>) -> tensor<2x2xf32> { + // CHECK: xla_hlo.broadcast_in_dim + %0 = "xla_hlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x2xf32>) -> tensor<2x2xf32> + return %0 : tensor<2x2xf32> +} + +// CHECK-LABEL: func @broadcast_in_dim_not_identity_permutation +func @broadcast_in_dim_not_identity_permutation(%arg0: tensor<2x2xf32>) -> tensor<2x2xf32> { + // CHECK: xla_hlo.broadcast_in_dim + %0 = "xla_hlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[1, 0]> : tensor<2xi64>} : (tensor<2x2xf32>) -> tensor<2x2xf32> + return %0 : tensor<2x2xf32> +} + + // CHECK-LABEL: func @dynamic_broadcast_in_dim_op_not_actually_dynamic func @dynamic_broadcast_in_dim_op_not_actually_dynamic(%arg0: tensor<4xf32>, %arg1: tensor<2xi64>) -> tensor<5x4xf32> { // CHECK: %[[RESULT:.+]] = "xla_hlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<4xf32>) -> tensor<5x4xf32> @@ -155,3 +379,28 @@ func @fold_pad_into_conv_i32(%arg0 : tensor<1x32x32x3xi32>, } : (tensor<1x38x38x3xi32>, tensor<7x7x3x64xi32>) -> tensor<1x16x16x64xi32> return %2 : tensor<1x16x16x64xi32> } + +// CHECK-LABEL: func @dynamic_reshape_not_actually_dynamic +func @dynamic_reshape_not_actually_dynamic(%arg0: tensor<4xf32>, %shape: tensor<2xindex>) -> tensor<4x1xf32> { + // CHECK: xla_hlo.reshape + %0 = "xla_hlo.dynamic_reshape"(%arg0, %shape) : (tensor<4xf32>, tensor<2xindex>) -> tensor<4x1xf32> + return %0 : tensor<4x1xf32> +} + +// CHECK-LABEL: do_not_dce_while +func @do_not_dce_while(%arg0: tensor) -> tensor { + // CHECK: xla_hlo.while + %0 = "xla_hlo.while"(%arg0) ( { + ^bb0(%arg1: tensor): + %1 = "xla_hlo.compare"(%arg1, %arg1) {comparison_direction = "LT"} : (tensor, tensor) -> tensor + "xla_hlo.return"(%1) : (tensor) -> () + }, { + ^bb0(%arg1: tensor): + %1 = "xla_hlo.create_token"() : () -> !xla_hlo.token + // 
Side-effecting op outfeed present inside while. + %2 = "xla_hlo.outfeed"(%arg1, %1) {outfeed_config = ""} : (tensor, !xla_hlo.token) -> !xla_hlo.token + "xla_hlo.return"(%arg1) : (tensor) -> () + }) : (tensor) -> tensor + + return %arg0 : tensor +} diff --git a/tensorflow/compiler/mlir/xla/tests/chlo_infer_shape_type_methods.mlir b/tensorflow/compiler/mlir/xla/tests/chlo_infer_shape_type_methods.mlir new file mode 100644 index 00000000000..d67a7d09f7c --- /dev/null +++ b/tensorflow/compiler/mlir/xla/tests/chlo_infer_shape_type_methods.mlir @@ -0,0 +1,56 @@ +// RUN: xla-opt -test-xla-infer-shaped-type-methods -allow-unregistered-dialect -split-input-file -verify-diagnostics %s -o - | FileCheck --dump-input=fail %s + +// CHECK-LABEL: @broadcast_add +// Note that all broadcast_ops are expanded from the same template, so +// only test reification on an examplar op. +// CHECK-SAME: %[[ARG0:.+]]: tensor, +// CHECK-SAME: %[[ARG1:.+]]: tensor +func @broadcast_add(%arg0: tensor, %arg1: tensor) -> tensor<1xindex> { + // CHECK-DAG: %[[ARG0_S:.+]] = shape.shape_of %[[ARG0]] + // CHECK-DAG: %[[ARG1_S:.+]] = shape.shape_of %[[ARG1]] + // CHECK-DAG: %[[BCAST_S:.+]] = "shape.broadcast"(%[[ARG0_S]], %[[ARG1_S]]) + // CHECK: %[[EXTENTS:.+]] = "shape.to_extent_tensor"(%[[BCAST_S]]) + // CHECK: return %[[EXTENTS]] + %0 = xla_chlo.broadcast_add %arg0, %arg1 : (tensor, tensor) -> tensor + %1 = "xla_test.reify_return_type_shapes"(%0) : (tensor) -> tensor<1xindex> + return %1 : tensor<1xindex> +} + +// ----- +// CHECK-LABEL: @complex_ranked_components +func @complex_ranked_components(%arg0: tensor, %arg1: tensor) -> tensor> { + %0 = xla_chlo.broadcast_complex %arg0, %arg1 : (tensor, tensor) -> tensor> + // CHECK: "xla_test.return_type_components"(%0) {dims0 = [-1, -1], element_type0 = complex} + %1 = "xla_test.get_return_type_components"(%0) : (tensor>) -> tensor> + return %1 : tensor> +} + +// ----- +// CHECK-LABEL: @compare_ranked_components +func @compare_ranked_components(%arg0: tensor, %arg1: tensor) -> tensor { + %0 = xla_chlo.broadcast_compare %arg0, %arg1 {comparison_direction = "EQ"} : (tensor, tensor) -> tensor + // CHECK: "xla_test.return_type_components"(%0) {dims0 = [-1, -1], element_type0 = i1} + %1 = "xla_test.get_return_type_components"(%0) : (tensor) -> tensor + return %0 : tensor +} + +// ----- +// CHECK-LABEL: @broadcast_add_ranked_components_r1 +func @broadcast_add_ranked_components_r1(%arg0: tensor, %arg1: tensor) -> tensor { + %0 = xla_chlo.broadcast_add %arg0, %arg1 : (tensor, tensor) -> tensor + // CHECK: "xla_test.return_type_components"(%0) {dims0 = [-1], element_type0 = f32} + %1 = "xla_test.get_return_type_components"(%0) : (tensor) -> tensor + return %1 : tensor +} + +// ----- +// CHECK-LABEL: @broadcast_add_ranked_components_r1x2 +func @broadcast_add_ranked_components_r1x2(%arg0: tensor, %arg1: tensor) -> tensor { + %0 = xla_chlo.broadcast_add %arg0, %arg1 : (tensor, tensor) -> tensor + // TODO: Overly broad shapes are being returned. Tighten the calculation + // and update/extend these tests. 
+ // CHECK: "xla_test.return_type_components"(%0) {dims0 = [-1, -1], element_type0 = f32} + %1 = "xla_test.get_return_type_components"(%0) : (tensor) -> tensor + return %1 : tensor +} + diff --git a/tensorflow/compiler/mlir/xla/tests/chlo_legalize_to_hlo_broadcasts.mlir b/tensorflow/compiler/mlir/xla/tests/chlo_legalize_to_hlo_broadcasts.mlir new file mode 100644 index 00000000000..7194f7034b5 --- /dev/null +++ b/tensorflow/compiler/mlir/xla/tests/chlo_legalize_to_hlo_broadcasts.mlir @@ -0,0 +1,227 @@ +// RUN: xla-opt -test-xla-chlo-legalize-to-hlo -split-input-file -verify-diagnostics %s -o - | FileCheck --dump-input=fail %s + +// Check the non-broadcast case for each registered op, then just check a +// representative op for detailed broadcast semantics. +// CHECK-LABEL: @addWithoutBroadcast +func @addWithoutBroadcast(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { + // CHECK: xla_hlo.add %arg0, %arg1 + %0 = xla_chlo.broadcast_add %arg0, %arg1 : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> + return %0 : tensor<4xf32> +} + +// ----- +// CHECK-LABEL: @dynamicBroadcast +// CHECK-SAME: %[[ARG0:.+]]: tensor +// CHECK-SAME: %[[ARG1:.+]]: tensor +func @dynamicBroadcast(%arg0: tensor, %arg1: tensor) -> tensor { + // CHECK-DAG: %[[ARG0_S:.+]] = shape.shape_of %[[ARG0]] + // CHECK-DAG: %[[ARG1_S:.+]] = shape.shape_of %[[ARG1]] + // CHECK-DAG: %[[RESULT_S:.+]] = "shape.broadcast"(%[[ARG0_S]], %[[ARG1_S]]) + // CHECK: %[[RESULT_EXTENTS:.+]] = "shape.to_extent_tensor"(%[[RESULT_S]]) + // CHECK-DAG: %[[ARG0_B:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[ARG0]], %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-DAG: %[[ARG1_B:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[ARG1]], %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} + // CHECK-DAG: %[[RESULT:.+]] = xla_hlo.add %[[ARG0_B]], %[[ARG1_B]] + // CHECK: return %[[RESULT]] : tensor + %0 = xla_chlo.broadcast_add %arg0, %arg1 : (tensor, tensor) -> tensor + return %0 : tensor +} + +// ----- +// CHECK-LABEL: @dynamicBroadcastComplex +// CHECK-SAME: %[[ARG0:.+]]: tensor +// CHECK-SAME: %[[ARG1:.+]]: tensor +func @dynamicBroadcastComplex(%arg0: tensor, %arg1: tensor) -> tensor> { + // CHECK-DAG: %[[ARG0_S:.+]] = shape.shape_of %[[ARG0]] + // CHECK-DAG: %[[ARG1_S:.+]] = shape.shape_of %[[ARG1]] + // CHECK-DAG: %[[RESULT_S:.+]] = "shape.broadcast"(%[[ARG0_S]], %[[ARG1_S]]) + // CHECK: %[[RESULT_EXTENTS:.+]] = "shape.to_extent_tensor"(%[[RESULT_S]]) + // CHECK-DAG: %[[ARG0_B:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[ARG0]], %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<2xindex>) -> tensor + // CHECK-DAG: %[[ARG1_B:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[ARG1]], %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor, tensor<2xindex>) -> tensor + // CHECK-DAG: %[[RESULT:.+]] = "xla_hlo.complex"(%[[ARG0_B]], %[[ARG1_B]]) : (tensor, tensor) -> tensor> + // CHECK: return %[[RESULT]] : tensor> + %0 = xla_chlo.broadcast_complex %arg0, %arg1 : (tensor, tensor) -> tensor> + return %0 : tensor> +} + +// ----- +// CHECK-LABEL: @dynamicBroadcastCompare +// CHECK-SAME: %[[ARG0:.+]]: tensor +// CHECK-SAME: %[[ARG1:.+]]: tensor +func @dynamicBroadcastCompare(%arg0: tensor, %arg1: tensor) -> tensor { + // CHECK-DAG: %[[ARG0_S:.+]] = shape.shape_of %[[ARG0]] + // CHECK-DAG: %[[ARG1_S:.+]] = shape.shape_of %[[ARG1]] + // CHECK-DAG: %[[RESULT_S:.+]] = "shape.broadcast"(%[[ARG0_S]], %[[ARG1_S]]) + // CHECK: 
%[[RESULT_EXTENTS:.+]] = "shape.to_extent_tensor"(%[[RESULT_S]]) + // CHECK-DAG: %[[ARG0_B:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[ARG0]], %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<2xindex>) -> tensor + // CHECK-DAG: %[[ARG1_B:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[ARG1]], %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor, tensor<2xindex>) -> tensor + // CHECK-DAG: %[[RESULT:.+]] = "xla_hlo.compare"(%[[ARG0_B]], %[[ARG1_B]]) {comparison_direction = "EQ"} : (tensor, tensor) -> tensor + // CHECK: return %[[RESULT]] : tensor + %0 = xla_chlo.broadcast_compare %arg0, %arg1 {comparison_direction = "EQ"} : (tensor, tensor) -> tensor + return %0 : tensor +} + +// ----- +// Verifies that broadcast_dimensions validity checks are valid. +// CHECK-LABEL: @dynamicNonScalarBroadcastDimensions +func @dynamicNonScalarBroadcastDimensions(%arg0: tensor<1x4xf32>, %arg1: tensor<4xf32>) -> tensor<1x4xf32> { + // CHECK: xla_hlo.add + %0 = xla_chlo.broadcast_add %arg0, %arg1 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1x4xf32>, tensor<4xf32>) -> tensor<1x4xf32> + return %0 : tensor<1x4xf32> +} + +// ----- +// Verifies that broadcast_dimensions validity checks are valid. +// CHECK-LABEL: @dynamicNonScalarByScalarBroadcastDimensions +func @dynamicNonScalarByScalarBroadcastDimensions(%arg0: tensor<1x4xf32>, %arg1: tensor) -> tensor<1x4xf32> { + // CHECK: xla_hlo.add + %0 = xla_chlo.broadcast_add %arg0, %arg1 {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor<1x4xf32>, tensor) -> tensor<1x4xf32> + return %0 : tensor<1x4xf32> +} + +// ----- +// Verifies that invalid broadcast dimensions are rejected. +func @dynamicNonScalarBroadcastDimensionsSizeMismatch(%arg0: tensor<1x4xf32>, %arg1: tensor<4xf32>) -> tensor<1x4xf32> { + // expected-warning @+2 {{unsupported non prefix-padded dynamic rank broadcast_dimensions}} + // expected-error @+1 {{failed to legalize operation}} + %0 = xla_chlo.broadcast_add %arg0, %arg1 {broadcast_dimensions = dense<[1, 2]> : tensor<2xi64>} : (tensor<1x4xf32>, tensor<4xf32>) -> tensor<1x4xf32> + return %0 : tensor<1x4xf32> +} + +// ----- +// Verifies that invalid broadcast dimensions are rejected. +func @dynamicNonScalarBroadcastDimensionsMismatch(%arg0: tensor<1x4xf32>, %arg1: tensor<4xf32>) -> tensor<1x4xf32> { + // expected-warning @+2 {{unsupported non prefix-padded dynamic rank broadcast_dimensions}} + // expected-error @+1 {{failed to legalize operation}} + %0 = xla_chlo.broadcast_add %arg0, %arg1 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x4xf32>, tensor<4xf32>) -> tensor<1x4xf32> + return %0 : tensor<1x4xf32> +} + +// ----- +// Note that broadcast_add is used as a proxy for all of the template +// expansions. Tests below merely verify that the op has an expansion. 
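
The note above is the only description of the expansion scheme in this test file; as orientation, the sketch below shows how a single rewrite pattern templated over the (CHLO, HLO) op pair can serve every broadcast_* op, which is why one representative op is enough for the detailed broadcast tests. This is illustrative only, not the pattern registered by the pass in this patch; `ChloOpTy`/`HloOpTy`, the `lhs()`/`rhs()` accessors, and the `(result type, lhs, rhs)` builder are assumptions about the generated op classes.

#include "mlir/IR/PatternMatch.h"
#include "mlir/IR/StandardTypes.h"
#include "mlir/Support/LogicalResult.h"

// Illustrative sketch: one pattern class, instantiated once per op pair.
template <typename ChloOpTy, typename HloOpTy>
struct TrivialBroadcastExpansionSketch
    : public mlir::OpRewritePattern<ChloOpTy> {
  using mlir::OpRewritePattern<ChloOpTy>::OpRewritePattern;

  mlir::LogicalResult matchAndRewrite(
      ChloOpTy op, mlir::PatternRewriter& rewriter) const override {
    mlir::Value lhs = op.lhs();
    mlir::Value rhs = op.rhs();
    // Only the trivial case: identical static shapes need no broadcast, so
    // the op maps 1:1 onto its non-broadcasting xla_hlo counterpart.
    auto lhs_ty = lhs.getType().dyn_cast<mlir::RankedTensorType>();
    auto rhs_ty = rhs.getType().dyn_cast<mlir::RankedTensorType>();
    if (!lhs_ty || !rhs_ty || lhs_ty != rhs_ty || !lhs_ty.hasStaticShape())
      return mlir::failure();
    rewriter.replaceOpWithNewOp<HloOpTy>(op, op.getType(), lhs, rhs);
    return mlir::success();
  }
};

Because every broadcast_* op goes through the same template, the per-op tests that follow only need to confirm that an expansion exists at all.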
+// CHECK-LABEL: @andWithoutBroadcast +func @andWithoutBroadcast(%arg0: tensor<4xi1>, %arg1: tensor<4xi1>) -> tensor<4xi1> { + // CHECK: xla_hlo.and %arg0, %arg1 + %0 = xla_chlo.broadcast_and %arg0, %arg1 : (tensor<4xi1>, tensor<4xi1>) -> tensor<4xi1> + return %0 : tensor<4xi1> +} + +// ----- +// CHECK-LABEL: @atan2WithoutBroadcast +func @atan2WithoutBroadcast(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { + // CHECK: xla_hlo.atan2 %arg0, %arg1 + %0 = xla_chlo.broadcast_atan2 %arg0, %arg1 : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> + return %0 : tensor<4xf32> +} + +// ----- +// CHECK-LABEL: @compareWithoutBroadcast +func @compareWithoutBroadcast(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xi1> { + // CHECK: "xla_hlo.compare"(%arg0, %arg1) {comparison_direction = "EQ"} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xi1> + %0 = xla_chlo.broadcast_compare %arg0, %arg1 {comparison_direction = "EQ"} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xi1> + return %0 : tensor<4xi1> +} + +// ----- +// CHECK-LABEL: @complexWithoutBroadcast +func @complexWithoutBroadcast(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xcomplex> { + // CHECK: "xla_hlo.complex"(%arg0, %arg1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xcomplex> + %0 = xla_chlo.broadcast_complex %arg0, %arg1 : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xcomplex> + return %0 : tensor<4xcomplex> +} + +// ----- +// CHECK-LABEL: @divideWithoutBroadcast +func @divideWithoutBroadcast(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { + // CHECK: xla_hlo.divide %arg0, %arg1 + %0 = xla_chlo.broadcast_divide %arg0, %arg1 : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> + return %0 : tensor<4xf32> +} + +// ----- +// CHECK-LABEL: @maximumWithoutBroadcast +func @maximumWithoutBroadcast(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { + // CHECK: xla_hlo.maximum %arg0, %arg1 + %0 = xla_chlo.broadcast_maximum %arg0, %arg1 : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> + return %0 : tensor<4xf32> +} + +// ----- +// CHECK-LABEL: @minimumWithoutBroadcast +func @minimumWithoutBroadcast(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { + // CHECK: xla_hlo.minimum %arg0, %arg1 + %0 = xla_chlo.broadcast_minimum %arg0, %arg1 : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> + return %0 : tensor<4xf32> +} + +// ----- +// CHECK-LABEL: @multiplyWithoutBroadcast +func @multiplyWithoutBroadcast(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { + // CHECK: xla_hlo.multiply %arg0, %arg1 + %0 = xla_chlo.broadcast_multiply %arg0, %arg1 : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> + return %0 : tensor<4xf32> +} + +// ----- +// CHECK-LABEL: @orWithoutBroadcast +func @orWithoutBroadcast(%arg0: tensor<4xi1>, %arg1: tensor<4xi1>) -> tensor<4xi1> { + // CHECK: xla_hlo.or %arg0, %arg1 + %0 = xla_chlo.broadcast_or %arg0, %arg1 : (tensor<4xi1>, tensor<4xi1>) -> tensor<4xi1> + return %0 : tensor<4xi1> +} + +// ----- +// CHECK-LABEL: @powerWithoutBroadcast +func @powerWithoutBroadcast(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { + // CHECK: xla_hlo.power %arg0, %arg1 + %0 = xla_chlo.broadcast_power %arg0, %arg1 : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> + return %0 : tensor<4xf32> +} + +// ----- +// CHECK-LABEL: @remainderWithoutBroadcast +func @remainderWithoutBroadcast(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { + // CHECK: xla_hlo.remainder %arg0, %arg1 + %0 = xla_chlo.broadcast_remainder %arg0, %arg1 : (tensor<4xf32>, tensor<4xf32>) -> 
tensor<4xf32> + return %0 : tensor<4xf32> +} + +// ----- +// CHECK-LABEL: @shift_leftWithoutBroadcast +func @shift_leftWithoutBroadcast(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { + // CHECK: xla_hlo.shift_left %arg0, %arg1 + %0 = xla_chlo.broadcast_shift_left %arg0, %arg1 : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> + return %0 : tensor<4xf32> +} + +// ----- +// CHECK-LABEL: @shift_right_arithmeticWithoutBroadcast +func @shift_right_arithmeticWithoutBroadcast(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { + // CHECK: xla_hlo.shift_right_arithmetic %arg0, %arg1 + %0 = xla_chlo.broadcast_shift_right_arithmetic %arg0, %arg1 : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> + return %0 : tensor<4xf32> +} + +// ----- +// CHECK-LABEL: @shift_right_logicalWithoutBroadcast +func @shift_right_logicalWithoutBroadcast(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { + // CHECK: xla_hlo.shift_right_logical %arg0, %arg1 + %0 = xla_chlo.broadcast_shift_right_logical %arg0, %arg1 : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> + return %0 : tensor<4xf32> +} + +// ----- +// CHECK-LABEL: @subWithoutBroadcast +func @subWithoutBroadcast(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { + // CHECK: xla_hlo.subtract %arg0, %arg1 + %0 = xla_chlo.broadcast_subtract %arg0, %arg1 : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> + return %0 : tensor<4xf32> +} + +// ----- +// CHECK-LABEL: @xorWithoutBroadcast +func @xorWithoutBroadcast(%arg0: tensor<4xi1>, %arg1: tensor<4xi1>) -> tensor<4xi1> { + // CHECK: xla_hlo.xor %arg0, %arg1 + %0 = xla_chlo.broadcast_xor %arg0, %arg1 : (tensor<4xi1>, tensor<4xi1>) -> tensor<4xi1> + return %0 : tensor<4xi1> +} diff --git a/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-lhlo.mlir b/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-lhlo.mlir index c457f3d5506..68f6d172afc 100644 --- a/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-lhlo.mlir +++ b/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-lhlo.mlir @@ -1,4 +1,4 @@ -// RUN: xla-opt -hlo-legalize-to-lhlo %s -o - | FileCheck %s --dump-input-on-failure +// RUN: xla-opt -hlo-legalize-to-lhlo -buffer-placement %s -o - | FileCheck %s --dump-input-on-failure // CHECK-LABEL: func @attrs func @attrs_copy(%operand: memref<2x2xf32>, %result: memref<2x2xf32>) { @@ -13,33 +13,42 @@ func @attrs_copy(%operand: memref<2x2xf32>, %result: memref<2x2xf32>) { // ----- +func @return_func(%arg0: tensor<4xf32>) -> tensor<4xf32> { + return %arg0 : tensor<4xf32> +} +// CHECK: (%[[ARG0:.*]]: [[TYPE:.*]], %[[RESULT:.*]]: [[TYPE]]) +// CHECK-NEXT: "xla_lhlo.copy"(%[[ARG0]], %[[RESULT]]) : ([[TYPE]], [[TYPE]]) -> () +// CHECK-NEXT: "xla_lhlo.terminator"() : () -> () + +// ----- + // CHECK-LABEL: func @func_op_long func @func_op_long(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { - // CHECK: (%[[NEW_ARG0:.*]]: memref<4xf32>, %[[NEW_ARG1:.*]]: memref<4xf32>, %[[RESULT:.*]]: memref<4xf32>) - // CHECK-NEXT: %[[MUL_RESULT:.*]] = alloc() {temp = true} : memref<4xf32> - // CHECK-NEXT: %[[SUB_RESULT:.*]] = alloc() {temp = true} : memref<4xf32> - // CHECK-NEXT: %[[MIN_RESULT:.*]] = alloc() {temp = true} : memref<4xf32> - // CHECK-NEXT: %[[ADD_RESULT:.*]] = alloc() {temp = true} : memref<4xf32> - // CHECK-NEXT: %[[MAX_RESULT:.*]] = alloc() {temp = true} : memref<4xf32> %1 = xla_hlo.maximum %arg0, %arg1 : tensor<4xf32> - // CHECK-NEXT: "xla_lhlo.maximum"(%[[NEW_ARG0]], %[[NEW_ARG1]], %[[MAX_RESULT]]) %2 = xla_hlo.add %arg0, %1 : tensor<4xf32> - // CHECK-NEXT: 
"xla_lhlo.add"(%[[NEW_ARG0]], %[[MAX_RESULT]], %[[ADD_RESULT]]) %3 = xla_hlo.minimum %arg0, %arg1 : tensor<4xf32> - // CHECK-NEXT: "xla_lhlo.minimum"(%[[NEW_ARG0]], %[[NEW_ARG1]], %[[MIN_RESULT]]) %4 = xla_hlo.subtract %arg1, %3 : tensor<4xf32> - // CHECK-NEXT: "xla_lhlo.subtract"(%[[NEW_ARG1]], %[[MIN_RESULT]], %[[SUB_RESULT]]) %5 = xla_hlo.multiply %2, %4 : tensor<4xf32> - // CHECK-NEXT: "xla_lhlo.multiply"(%[[ADD_RESULT]], %[[SUB_RESULT]], %[[MUL_RESULT]]) - // CHECK-NEXT: dealloc %[[MAX_RESULT]] : memref<4xf32> - // CHECK-NEXT: dealloc %[[ADD_RESULT]] : memref<4xf32> - // CHECK-NEXT: dealloc %[[MIN_RESULT]] : memref<4xf32> - // CHECK-NEXT: dealloc %[[SUB_RESULT]] : memref<4xf32> - // CHECK-NEXT: "xla_lhlo.copy"(%[[MUL_RESULT]], %[[RESULT]]) : (memref<4xf32>, memref<4xf32>) -> () - // CHECK-NEXT: dealloc %[[MUL_RESULT]] : memref<4xf32> return %5 : tensor<4xf32> - // CHECK-NEXT: "xla_lhlo.terminator"() : () -> () } +// CHECK: (%[[NEW_ARG0:.*]]: memref<4xf32>, %[[NEW_ARG1:.*]]: memref<4xf32>, %[[RESULT:.*]]: memref<4xf32>) +// CHECK-NEXT: %[[MAX_RESULT:.*]] = alloc() : memref<4xf32> +// CHECK-NEXT: "xla_lhlo.maximum"(%[[NEW_ARG0]], %[[NEW_ARG1]], %[[MAX_RESULT]]) +// CHECK-NEXT: %[[ADD_RESULT:.*]] = alloc() : memref<4xf32> +// CHECK-NEXT: "xla_lhlo.add"(%[[NEW_ARG0]], %[[MAX_RESULT]], %[[ADD_RESULT]]) +// CHECK-NEXT: dealloc %[[MAX_RESULT]] : memref<4xf32> +// CHECK-NEXT: %[[MIN_RESULT:.*]] = alloc() : memref<4xf32> +// CHECK-NEXT: "xla_lhlo.minimum"(%[[NEW_ARG0]], %[[NEW_ARG1]], %[[MIN_RESULT]]) +// CHECK-NEXT: %[[SUB_RESULT:.*]] = alloc() : memref<4xf32> +// CHECK-NEXT: "xla_lhlo.subtract"(%[[NEW_ARG1]], %[[MIN_RESULT]], %[[SUB_RESULT]]) +// CHECK-NEXT: dealloc %[[MIN_RESULT]] : memref<4xf32> +// CHECK-NEXT: %[[MUL_RESULT:.*]] = alloc() : memref<4xf32> +// CHECK-NEXT: "xla_lhlo.multiply"(%[[ADD_RESULT]], %[[SUB_RESULT]], %[[MUL_RESULT]]) +// CHECK-NEXT: dealloc %[[SUB_RESULT]] : memref<4xf32> +// CHECK-NEXT: dealloc %[[ADD_RESULT]] : memref<4xf32> +// CHECK-NEXT: "xla_lhlo.copy"(%[[MUL_RESULT]], %[[RESULT]]) : (memref<4xf32>, memref<4xf32>) -> () +// CHECK-NEXT: dealloc %[[MUL_RESULT]] : memref<4xf32> +// CHECK-NEXT: "xla_lhlo.terminator"() : () -> () // ----- @@ -47,20 +56,20 @@ func @func_op_long(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> func @fusion(%multiplier: memref<2x2xf32>, %summand_1: memref<2x2xf32>, %summand_2: memref<2x2xf32>, %result: memref<2x2xf32>) { // CHECK: (%{{.*}}: {{.*}}, {{.*}}: {{.*}}, {{.*}}: {{.*}}, %[[RESULT:.*]]: {{.*}}) - // CHECK-NEXT: %[[MUL_RESULT:.*]] = alloc() {temp = true} : memref<2x2xf32> - // CHECK-NEXT: %[[ADD_RESULT:.*]] = alloc() {temp = true} : memref<2x2xf32> + // CHECK-NEXT: %[[ADD_RESULT:.*]] = alloc() : memref<2x2xf32> %tensor_summand_1 = tensor_load %summand_1 : memref<2x2xf32> %tensor_summand_2 = tensor_load %summand_2 : memref<2x2xf32> %sum = "xla_hlo.add"(%tensor_summand_1, %tensor_summand_2) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> // CHECK-NEXT: "xla_lhlo.add"(%{{.*}}, %{{.*}}, %[[ADD_RESULT]]) + // CHECK-NEXT: %[[MUL_RESULT:.*]] = alloc() : memref<2x2xf32> %tensor_multiplier = tensor_load %multiplier : memref<2x2xf32> %tensor_result = "xla_hlo.multiply"(%sum, %tensor_multiplier) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> // CHECK-NEXT: "xla_lhlo.multiply"(%[[ADD_RESULT]], %{{.*}}, %[[MUL_RESULT]]) + // CHECK-NEXT: dealloc %[[ADD_RESULT]] : memref<2x2xf32> // CHECK-NEXT: "xla_lhlo.copy"(%[[MUL_RESULT]], %[[RESULT]]) tensor_store %tensor_result, %result : memref<2x2xf32> - // CHECK-NEXT: 
dealloc %[[ADD_RESULT]] : memref<2x2xf32> // CHECK-NEXT: dealloc %[[MUL_RESULT]] : memref<2x2xf32> // CHECK-NEXT: "xla_lhlo.terminator"() : () -> () "xla_lhlo.terminator"() : () -> () @@ -174,6 +183,45 @@ func @dyn_broadcast(%operand: memref) { // ----- +// CHECK-LABEL: func @complex +func @complex(%real: memref<2x2xf32>, + %imag: memref<2x2xf32>, + %result: memref<2x2xcomplex>) { + %tensor_real = tensor_load %real : memref<2x2xf32> + %tensor_imag = tensor_load %imag : memref<2x2xf32> + %tensor_result = "xla_hlo.complex"(%tensor_real, %tensor_imag) + : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xcomplex> + // CHECK: "xla_lhlo.complex"(%{{.*}}, %{{.*}}) + tensor_store %tensor_result, %result : memref<2x2xcomplex> + return +} + +// ----- + +// CHECK-LABEL: func @real +func @real(%operand: memref<2x2xcomplex>, %result: memref<2x2xf32>) { + %tensor_operand = tensor_load %operand : memref<2x2xcomplex> + %tensor_result = "xla_hlo.real"(%tensor_operand) + : (tensor<2x2xcomplex>) -> tensor<2x2xf32> + // CHECK: "xla_lhlo.real"(%{{.*}}, %{{.*}}) + tensor_store %tensor_result, %result : memref<2x2xf32> + return +} + +// ----- + +// CHECK-LABEL: func @imag +func @imag(%operand: memref<2x2xcomplex>, %result: memref<2x2xf32>) { + %tensor_operand = tensor_load %operand : memref<2x2xcomplex> + %tensor_result = "xla_hlo.imag"(%tensor_operand) + : (tensor<2x2xcomplex>) -> tensor<2x2xf32> + // CHECK: "xla_lhlo.imag"(%{{.*}}, %{{.*}}) + tensor_store %tensor_result, %result : memref<2x2xf32> + return +} + +// ----- + // CHECK-LABEL: func @iota func @iota(%result: memref<10xi32>) { %tensor_result = "xla_hlo.iota"() @@ -347,3 +395,15 @@ func @tanh_dyn(%arg0: tensor) { // CHECK: "xla_lhlo.tanh"(%arg0, %[[RESULT]]) : (memref, memref) -> () return } + +// ----- + +// CHECK-LABEL: func @dot +func @dot(%arg0: tensor<1024x1024xf32>) -> tensor<1024x1024xf32> { +// CHECK-SAME: (%[[ARG0:.*]]: [[TYPE:.*]], +// CHECK-SAME: %[[RESULT:.*]]: [[TYPE]]) +// CHECK: "xla_lhlo.dot"(%[[ARG0]], %[[ARG0]], %{{.*}}) : ([[TYPE]], [[TYPE]], [[TYPE]]) -> () + %dot = "xla_hlo.dot"(%arg0, %arg0) + : (tensor<1024x1024xf32>, tensor<1024x1024xf32>) -> tensor<1024x1024xf32> + return %dot : tensor<1024x1024xf32> + } diff --git a/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-linalg.mlir b/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-linalg.mlir index ecee1d681d6..a27bf2cff79 100644 --- a/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-linalg.mlir +++ b/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-linalg.mlir @@ -222,6 +222,16 @@ func @float_cos(%arg0: tensor<2x2xf32>) -> tensor<2x2xf32> { // ----- +// CHECK-LABEL: func @float_sin +func @float_sin(%arg0: tensor<2x2xf32>) -> tensor<2x2xf32> { + // CHECK: linalg.generic + // CHECK: sin + %0 = "xla_hlo.sine"(%arg0) : (tensor<2x2xf32>) -> tensor<2x2xf32> + return %0 : tensor<2x2xf32> +} + +// ----- + // CHECK-LABEL: func @copy // CHECK-SAME: [[ARG:%[a-zA-Z0-9]+]] func @copy(%input: tensor<2x4x8xf32>) -> tensor<2x4x8xf32> { @@ -246,10 +256,36 @@ func @select(%pred: tensor<2x2xi1>, %lhs: tensor<2x2xf32>, // ----- +// CHECK-DAG: #[[OPERAND_MAP:.+]] = affine_map<(d0, d1, d2) -> ()> +// CHECK-DAG: #[[RESULT_MAP:.+]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)> +// CHECK-LABEL: func @broadcast_scalar +func @broadcast_scalar(%arg: tensor) -> tensor<4x2x1xf32> { + %0 = "xla_hlo.broadcast"(%arg) {broadcast_sizes = dense<[4, 2, 1]> : tensor<3xi64>} : (tensor) -> tensor<4x2x1xf32> + return %0: tensor<4x2x1xf32> +} +// CHECK: linalg.generic {{{.*}}indexing_maps = [#[[OPERAND_MAP]], 
#[[RESULT_MAP]]] +// CHECK-NEXT: ^bb0(%[[OPERAND:.*]]: f32): +// CHECK-NEXT: linalg.yield %[[OPERAND]] : f32 + +// ----- + +// CHECK-DAG: #[[OPERAND_MAP:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d3, d4, d5)> +// CHECK-DAG: #[[RESULT_MAP:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)> +// CHECK-LABEL: func @broadcast +func @broadcast(%arg: tensor<4x?x16xf32>) -> tensor<4x2x1x4x?x16xf32> { + %0 = "xla_hlo.broadcast"(%arg) {broadcast_sizes = dense<[4, 2, 1]> : tensor<3xi64>} : (tensor<4x?x16xf32>) -> tensor<4x2x1x4x?x16xf32> + return %0: tensor<4x2x1x4x?x16xf32> +} +// CHECK: linalg.generic {{{.*}}indexing_maps = [#[[OPERAND_MAP]], #[[RESULT_MAP]]] +// CHECK-NEXT: ^bb0(%[[OPERAND:.*]]: f32): +// CHECK-NEXT: linalg.yield %[[OPERAND]] : f32 + +// ----- + // CHECK-DAG: #[[OPERAND_MAP:.*]] = affine_map<(d0, d1, d2, d3, d4) -> (d4, d0, 0)> // CHECK-DAG: #[[RESULT_MAP:.*]] = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)> -// CHECK-LABEL: func @broadcast -func @broadcast(%operand: tensor<5x7x1xf32>) -> tensor<7x10x6x4x5xf32> { +// CHECK-LABEL: func @broadcast_in_dim +func @broadcast_in_dim(%operand: tensor<5x7x1xf32>) -> tensor<7x10x6x4x5xf32> { %0 = "xla_hlo.broadcast_in_dim"(%operand) {broadcast_dimensions = dense<[4,0,2]> : tensor<3xi64>} : (tensor<5x7x1xf32>) -> tensor<7x10x6x4x5xf32> @@ -261,6 +297,22 @@ func @broadcast(%operand: tensor<5x7x1xf32>) -> tensor<7x10x6x4x5xf32> { // ----- +// CHECK-DAG: #[[OPERAND_MAP:.+]] = affine_map<(d0, d1) -> (d0)> +// CHECK-DAG: #[[RESULT_MAP:.+]] = affine_map<(d0, d1) -> (d0, d1)> +// CHECK-LABEL: func @broadcast_in_dim_with_one_to_one +func @broadcast_in_dim_with_one_to_one( + %operand: tensor<1xf32>) -> tensor<1x5xf32> { + %0 = "xla_hlo.broadcast_in_dim"(%operand) + {broadcast_dimensions = dense<[0]> : tensor<1xi64>} + : (tensor<1xf32>) -> tensor<1x5xf32> + return %0 : tensor<1x5xf32> +} +// CHECK: linalg.generic {{{.*}}indexing_maps = [#[[OPERAND_MAP]], #[[RESULT_MAP]]] +// CHECK-NEXT: ^bb0(%[[OPERAND:.*]]: f32): +// CHECK-NEXT: linalg.yield %[[OPERAND]] : f32 + +// ----- + // CHECK-DAG: #[[OPERAND_MAP:.*]] = affine_map<(d0, d1, d2) -> ()> // CHECK-DAG: #[[RESULT_MAP:.*]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)> // CHECK-LABEL: func @broadcast_scalar @@ -359,3 +411,147 @@ func @add_scalar(%lhs: tensor, %rhs: tensor) -> tensor { // CHECK-NEXT: ^bb0(%[[LHS:.*]]: f32, %[[RHS:.*]]: f32): // CHECK: %[[RESULT:.*]] = addf %[[LHS]], %[[RHS]] // CHECK-NEXT: linalg.yield %[[RESULT]] : f32 + +// ----- + +func @reshape_collapse_single_dim + (%arg0: tensor<1x28x28x1xf32>) -> tensor<1x784xf32> { + %0 = "xla_hlo.reshape"(%arg0) : (tensor<1x28x28x1xf32>) -> tensor<1x784xf32> + return %0 : tensor<1x784xf32> +} +// CHECK-DAG: #[[MAP0:.*]] = affine_map<(d0, d1, d2, d3) -> (d0)> +// CHECK-DAG: #[[MAP1:.*]] = affine_map<(d0, d1, d2, d3) -> (d1, d2, d3)> +// CHECK-LABEL: func @reshape_collapse_single_dim +// CHECK: linalg.tensor_reshape %{{.*}} [#[[MAP0]], #[[MAP1]]] + +// ----- + +func @reshape_collapse(%arg0: tensor<2x2x2x3xf32>) -> tensor<2x4x3xf32> { + %0 = "xla_hlo.reshape"(%arg0) : (tensor<2x2x2x3xf32>) -> tensor<2x4x3xf32> + return %0 : tensor<2x4x3xf32> +} +// CHECK-DAG: #[[MAP0:.*]] = affine_map<(d0, d1, d2, d3) -> (d0)> +// CHECK-DAG: #[[MAP1:.*]] = affine_map<(d0, d1, d2, d3) -> (d1, d2)> +// CHECK-DAG: #[[MAP2:.*]] = affine_map<(d0, d1, d2, d3) -> (d3)> +// CHECK-LABEL: func @reshape_collapse +// CHECK: linalg.tensor_reshape %{{.*}} [#[[MAP0]], #[[MAP1]], #[[MAP2]]] + +// ----- + +func @reshape_expand(%arg0: tensor<2x8xf32>) 
-> tensor<2x4x2xf32> { + %0 = "xla_hlo.reshape"(%arg0) : (tensor<2x8xf32>) -> tensor<2x4x2xf32> + return %0 : tensor<2x4x2xf32> +} +// CHECK-DAG: #[[MAP0:.*]] = affine_map<(d0, d1, d2) -> (d0)> +// CHECK-DAG: #[[MAP1:.*]] = affine_map<(d0, d1, d2) -> (d1, d2)> +// CHECK-LABEL: func @reshape_expand +// CHECK: linalg.tensor_reshape %{{.*}} [#[[MAP0]], #[[MAP1]]] + +// ----- + +func @reshape_single_expand(%arg0 : tensor<8xf32>) -> tensor<1x4x2xf32> { + %0 = "xla_hlo.reshape"(%arg0) : (tensor<8xf32>) -> tensor<1x4x2xf32> + return %0 : tensor<1x4x2xf32> +} +// CHECK: #[[MAP0:.*]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)> +// CHECK-LABEL: func @reshape_single_expand +// CHECK: linalg.tensor_reshape %{{.*}} [#[[MAP0]]] + +// ----- + +func @reshape_multiple_collapse + (%arg0 : tensor<1x2x2x5x3x2xf32>) -> tensor<1x4x5x6xf32> { + %0 = "xla_hlo.reshape"(%arg0) : (tensor<1x2x2x5x3x2xf32>) -> tensor<1x4x5x6xf32> + return %0 : tensor<1x4x5x6xf32> +} +// CHECK-DAG: #[[MAP0:.*]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0)> +// CHECK-DAG: #[[MAP1:.*]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2)> +// CHECK-DAG: #[[MAP2:.*]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d3)> +// CHECK-DAG: #[[MAP3:.*]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d4, d5)> +// CHECK-LABEL: func @reshape_multiple_collapse +// CHECK: linalg.tensor_reshape %{{.*}} [#[[MAP0]], #[[MAP1]], #[[MAP2]], #[[MAP3]]] + +// ----- + +// CHECK-LABEL: func @convert_i32_to_f32 +func @convert_i32_to_f32(%input: tensor<2x2xi32>) -> tensor<2x2xf32> { + %result = "xla_hlo.convert"(%input) : (tensor<2x2xi32>) -> tensor<2x2xf32> + return %result : tensor<2x2xf32> +} +// CHECK: linalg.generic +// CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: i32): +// CHECK-NEXT: %[[RESULT:.*]] = sitofp %[[OPERAND_IN]] : i32 to f32 +// CHECK-NEXT: linalg.yield %[[RESULT]] : f32 + +// ----- + +// CHECK-LABEL: func @convert_i16_to_i32 +func @convert_i16_to_i32(%input: tensor<2x2xi16>) -> tensor<2x2xi32> { + %result = "xla_hlo.convert"(%input) : (tensor<2x2xi16>) -> tensor<2x2xi32> + return %result : tensor<2x2xi32> +} +// CHECK: linalg.generic +// CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: i16): +// CHECK-NEXT: %[[RESULT:.*]] = zexti %[[OPERAND_IN]] : i16 to i32 +// CHECK-NEXT: linalg.yield %[[RESULT]] : i32 + +// ----- + +// CHECK-LABEL: func @convert_i32_to_i16 +func @convert_i32_to_i16(%input: tensor<2x2xi32>) -> tensor<2x2xi16> { + %result = "xla_hlo.convert"(%input) : (tensor<2x2xi32>) -> tensor<2x2xi16> + return %result : tensor<2x2xi16> +} +// CHECK: linalg.generic +// CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: i32): +// CHECK-NEXT: %[[RESULT:.*]] = trunci %[[OPERAND_IN]] : i32 to i16 +// CHECK-NEXT: linalg.yield %[[RESULT]] : i16 + +// ----- + +// CHECK-LABEL: func @convert_f32_to_f64 +func @convert_f32_to_f64(%input: tensor<2x2xf32>) -> tensor<2x2xf64> { + %result = "xla_hlo.convert"(%input) : (tensor<2x2xf32>) -> tensor<2x2xf64> + return %result : tensor<2x2xf64> +} +// CHECK: linalg.generic +// CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: f32): +// CHECK-NEXT: %[[RESULT:.*]] = fpext %[[OPERAND_IN]] : f32 to f64 +// CHECK-NEXT: linalg.yield %[[RESULT]] : f64 + +// ----- + +// CHECK-LABEL: func @convert_f64_to_f32 +func @convert_f64_to_f32(%input: tensor<2x2xf64>) -> tensor<2x2xf32> { + %result = "xla_hlo.convert"(%input) : (tensor<2x2xf64>) -> tensor<2x2xf32> + return %result : tensor<2x2xf32> +} +// CHECK: linalg.generic +// CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: f64): +// CHECK-NEXT: %[[RESULT:.*]] = fptrunc %[[OPERAND_IN]] : f64 to f32 +// CHECK-NEXT: linalg.yield %[[RESULT]] : f32 + 
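
Each convert test above pins one source/target pair to one standard scalar op inside the linalg.generic body; the choice reduces to a float/integer and bit-width dispatch. The following self-contained sketch mirrors only the cases covered by the convert tests in this file and is not the pass's actual code; the function name and parameters are made up for illustration.

#include <iostream>
#include <string>

// Which standard-dialect scalar op the tests expect for an xla_hlo.convert
// between the given element types. Note the i16->i32 test expects zexti
// (zero extension); cases not exercised by the tests are omitted.
std::string ScalarConvertOpFor(bool src_float, unsigned src_bits,
                               bool dst_float, unsigned dst_bits) {
  if (src_float && dst_float)
    return dst_bits > src_bits ? "fpext" : "fptrunc";  // f32->f64, f64->f32
  if (!src_float && !dst_float)
    return dst_bits > src_bits ? "zexti" : "trunci";   // i16->i32, i32->i16
  if (!src_float && dst_float) return "sitofp";        // i32->f32
  return "fptosi";                                     // f32->i32
}

int main() {
  std::cout << ScalarConvertOpFor(false, 16, false, 32) << "\n";  // zexti
  std::cout << ScalarConvertOpFor(true, 64, true, 32) << "\n";    // fptrunc
  return 0;
}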
+// ----- + +// CHECK-LABEL: func @convert_f32_to_i32 +func @convert_f32_to_i32(%input: tensor<2x2xf32>) -> tensor<2x2xi32> { + %result = "xla_hlo.convert"(%input) : (tensor<2x2xf32>) -> tensor<2x2xi32> + return %result : tensor<2x2xi32> +} +// CHECK: linalg.generic +// CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: f32): +// CHECK-NEXT: %[[RESULT:.*]] = fptosi %[[OPERAND_IN]] : f32 to i32 +// CHECK-NEXT: linalg.yield %[[RESULT]] : i32 + +// ----- + +// CHECK-DAG: #[[OPERAND_MAP:.*]] = affine_map<(d0, d1) -> (d0, -d1 + 2)> +// CHECK-DAG: #[[RESULT_MAP:.*]] = affine_map<(d0, d1) -> (d0, d1)> +// CHECK-LABEL: func @reverse +func @reverse(%input: tensor<2x3xf32>) -> tensor<2x3xf32> { + %result = "xla_hlo.reverse"(%input) { + dimensions = dense<1> : tensor<1xi64> + } : (tensor<2x3xf32>) -> tensor<2x3xf32> + return %result : tensor<2x3xf32> +} +// CHECK: linalg.generic {{{.*}}indexing_maps = [#[[OPERAND_MAP]], #[[RESULT_MAP]]] diff --git a/tensorflow/compiler/mlir/xla/tests/hlo_to_lhlo_with_xla/ops.mlir b/tensorflow/compiler/mlir/xla/tests/hlo_to_lhlo_with_xla/ops.mlir new file mode 100644 index 00000000000..149c0c94663 --- /dev/null +++ b/tensorflow/compiler/mlir/xla/tests/hlo_to_lhlo_with_xla/ops.mlir @@ -0,0 +1,307 @@ +// RUN: xla-opt -split-input-file -xla-hlo-to-lhlo-with-xla %s | FileCheck --enable-var-scope --dump-input=fail %s + +// CHECK-LABEL: func @main +// CHECK-SAME: %[[ARG0:.*]]: memref<2x2xf32> {xla_lhlo.params = 0 +// CHECK-SAME: %[[ARG1:.*]]: memref<16xi8> +func @main(%value: tensor<2x2xf32>) -> tensor<2x2xf32> { +// CHECK: %[[VIEW:.*]] = {{.*}} memref<16xi8> to memref<2x2xf32> +// CHECK: lhlo.abs +// CHECK-SAME: %[[ARG0]], %[[VIEW]] + %abs = "xla_hlo.abs"(%value) : (tensor<2x2xf32>) -> tensor<2x2xf32> + return %abs : tensor<2x2xf32> +} + +// ----- + +// CHECK-LABEL: func @main +// CHECK-SAME: %[[ARG0:.*]]: memref<2x2xf32> {xla_lhlo.params = 0 +// CHECK-SAME: %[[ARG1:.*]]: memref<2x2xf32> {xla_lhlo.params = 1 +// CHECK-SAME: %[[ARG2:.*]]: memref<16xi8> +func @main(%value0: tensor<2x2xf32>, %value1: tensor<2x2xf32>) -> tensor<2x2xf32> { +// CHECK: %[[VIEW:.*]] = {{.*}} memref<16xi8> to memref<2x2xf32> +// CHECK: lhlo.add +// CHECK-SAME: %[[ARG0]], %[[ARG1]], %[[VIEW]] +// CHECK-NEXT: return + %res = "xla_hlo.add"(%value0, %value1) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> + return %res : tensor<2x2xf32> +} + +// ----- + +// CHECK-LABEL: func @main +// CHECK-SAME: %[[ARG0:.*]]: memref<2x2xi32> {xla_lhlo.params = 0 +// CHECK-SAME: %[[ARG1:.*]]: memref<2x2xi32> {xla_lhlo.params = 1 +// CHECK-SAME: %[[ARG2:.*]]: memref<16xi8> +func @main(%value0: tensor<2x2xi32>, %value1: tensor<2x2xi32>) -> tensor<2x2xi32> { +// CHECK: %[[VIEW:.*]] = {{.*}} memref<16xi8> to memref<2x2xi32> +// CHECK: lhlo.and +// CHECK-SAME: %[[ARG0]], %[[ARG1]], %[[VIEW]] +// CHECK-NEXT: return + %res = "xla_hlo.and"(%value0, %value1) : (tensor<2x2xi32>, tensor<2x2xi32>) -> tensor<2x2xi32> + return %res : tensor<2x2xi32> +} + +// ----- + +// CHECK-LABEL: func @main +// CHECK-SAME: %[[ARG0:.*]]: memref<2x2xf32> {xla_lhlo.params = 0 +// CHECK-SAME: %[[ARG1:.*]]: memref<16xi8> +func @main(%value0: tensor<2x2xf32>) -> tensor<2x2xf32> { +// CHECK: %[[VIEW:.*]] = {{.*}} memref<16xi8> to memref<2x2xf32> +// CHECK: lhlo.ceil +// CHECK-SAME: %[[ARG0]], %[[VIEW]] + %res = "xla_hlo.ceil"(%value0) : (tensor<2x2xf32>) -> tensor<2x2xf32> + return %res : tensor<2x2xf32> +} + +// ----- + +// CHECK-LABEL: func @main +// CHECK-SAME: %[[ARG0:.*]]: memref<1x2xf32> {xla_lhlo.params = 0 +// CHECK-SAME: %[[ARG1:.*]]: memref<1x2xf32> 
{xla_lhlo.params = 1 +// CHECK-SAME: %[[ARG2:.*]]: memref<16xi8> +func @main(%value0: tensor<1x2xf32>, %value1: tensor<1x2xf32>) -> tensor<1x2xcomplex> { +// CHECK: %[[VIEW:.*]] = {{.*}} memref<16xi8> to memref<1x2xcomplex> +// CHECK: lhlo.complex +// CHECK-SAME: %[[ARG0]], %[[ARG1]], %[[VIEW]] +// CHECK-NEXT: return + %res = "xla_hlo.complex"(%value0, %value1) : (tensor<1x2xf32>, tensor<1x2xf32>) -> (tensor<1x2xcomplex>) + return %res : tensor<1x2xcomplex> +} + +// ----- + +// CHECK-LABEL: func @main +// CHECK-SAME: %[[ARG0:.*]]: memref<1x2xcomplex> {xla_lhlo.params = 0 +// CHECK-SAME: %[[ARG1:.*]]: memref<16xi8> +func @main(%value0: tensor<1x2xcomplex>) -> tensor<1x2xcomplex> { +// CHECK: %[[VIEW:.*]] = {{.*}} memref<16xi8> to memref<1x2xcomplex> +// CHECK: lhlo.cosine +// CHECK-SAME: %[[ARG0]], %[[VIEW]] +// CHECK-NEXT: return + %res = "xla_hlo.cosine"(%value0) : (tensor<1x2xcomplex>) -> tensor<1x2xcomplex> + return %res : tensor<1x2xcomplex> +} + +// ----- + +// CHECK-LABEL: func @main +// CHECK-SAME: %[[ARG0:.*]]: memref<2x2xf32> {xla_lhlo.params = 0 +// CHECK-SAME: %[[ARG1:.*]]: memref<2x2xf32> {xla_lhlo.params = 1 +// CHECK-SAME: %[[ARG2:.*]]: memref<16xi8> +func @main(%value0: tensor<2x2xf32>, %value1: tensor<2x2xf32>) -> tensor<2x2xf32> { +// CHECK: %[[VIEW:.*]] = {{.*}} memref<16xi8> to memref<2x2xf32> +// CHECK: lhlo.divide +// CHECK-SAME: %[[ARG0]], %[[ARG1]], %[[VIEW]] +// CHECK-NEXT: return + %res = "xla_hlo.divide"(%value0, %value1) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> + return %res : tensor<2x2xf32> +} + +// ----- + +// CHECK-LABEL: func @main +// CHECK-SAME: %[[ARG0:.*]]: memref<2x2xf32> {xla_lhlo.params = 0 +// CHECK-SAME: %[[ARG1:.*]]: memref<16xi8> +func @main(%value0: tensor<2x2xf32>) -> tensor<2x2xf32> { +// CHECK: %[[VIEW:.*]] = {{.*}} memref<16xi8> to memref<2x2xf32> +// CHECK: lhlo.exponential +// CHECK-SAME: %[[ARG0]], %[[VIEW]] + %res = "xla_hlo.exponential"(%value0) : (tensor<2x2xf32>) -> tensor<2x2xf32> + return %res : tensor<2x2xf32> +} + +// ----- + +// CHECK-LABEL: func @main +// CHECK-SAME: %[[ARG0:.*]]: memref<2x2xf32> {xla_lhlo.params = 0 +// CHECK-SAME: %[[ARG1:.*]]: memref<16xi8> +func @main(%value0: tensor<2x2xf32>) -> tensor<2x2xf32> { +// CHECK: %[[VIEW:.*]] = {{.*}} memref<16xi8> to memref<2x2xf32> +// CHECK: lhlo.log +// CHECK-SAME: %[[ARG0]], %[[VIEW]] + %res = "xla_hlo.log"(%value0) : (tensor<2x2xf32>) -> tensor<2x2xf32> + return %res : tensor<2x2xf32> +} + +// ----- + +// CHECK-LABEL: func @main +// CHECK-SAME: %[[ARG0:.*]]: memref<2x2xf32> {xla_lhlo.params = 0 +// CHECK-SAME: %[[ARG1:.*]]: memref<2x2xf32> {xla_lhlo.params = 1 +// CHECK-SAME: %[[ARG2:.*]]: memref<16xi8> +func @main(%value0: tensor<2x2xf32>, %value1: tensor<2x2xf32>) -> tensor<2x2xf32> { +// CHECK: %[[VIEW:.*]] = {{.*}} memref<16xi8> to memref<2x2xf32> +// CHECK: lhlo.maximum +// CHECK-SAME: %[[ARG0]], %[[ARG1]], %[[VIEW]] +// CHECK-NEXT: return + %res = "xla_hlo.maximum"(%value0, %value1) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> + return %res : tensor<2x2xf32> +} + +// ----- + +// CHECK-LABEL: func @main +// CHECK-SAME: %[[ARG0:.*]]: memref<2x2xf32> {xla_lhlo.params = 0 +// CHECK-SAME: %[[ARG1:.*]]: memref<2x2xf32> {xla_lhlo.params = 1 +// CHECK-SAME: %[[ARG2:.*]]: memref<16xi8> +func @main(%value0: tensor<2x2xf32>, %value1: tensor<2x2xf32>) -> tensor<2x2xf32> { +// CHECK: %[[VIEW:.*]] = {{.*}} memref<16xi8> to memref<2x2xf32> +// CHECK: lhlo.minimum +// CHECK-SAME: %[[ARG0]], %[[ARG1]], %[[VIEW]] +// CHECK-NEXT: return + %res = 
"xla_hlo.minimum"(%value0, %value1) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> + return %res : tensor<2x2xf32> +} + +// ----- + +// CHECK-LABEL: func @main +// CHECK-SAME: %[[ARG0:.*]]: memref<2x2xf32> {xla_lhlo.params = 0 +// CHECK-SAME: %[[ARG1:.*]]: memref<2x2xf32> {xla_lhlo.params = 1 +// CHECK-SAME: %[[ARG2:.*]]: memref<16xi8> +func @main(%value0: tensor<2x2xf32>, %value1: tensor<2x2xf32>) -> tensor<2x2xf32> { +// CHECK: %[[VIEW:.*]] = {{.*}} memref<16xi8> to memref<2x2xf32> +// CHECK: lhlo.multiply +// CHECK-SAME: %[[ARG0]], %[[ARG1]], %[[VIEW]] +// CHECK-NEXT: return + %res = "xla_hlo.multiply"(%value0, %value1) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> + return %res : tensor<2x2xf32> +} + +// ----- + +// CHECK-LABEL: func @main +// CHECK-SAME: %[[ARG0:.*]]: memref<2x2xf32> {xla_lhlo.params = 0 +// CHECK-SAME: %[[ARG1:.*]]: memref<16xi8> +func @main(%value0: tensor<2x2xf32>) -> tensor<2x2xf32> { +// CHECK: %[[VIEW:.*]] = {{.*}} memref<16xi8> to memref<2x2xf32> +// CHECK: lhlo.negate +// CHECK-SAME: %[[ARG0]], %[[VIEW]] + %res = "xla_hlo.negate"(%value0) : (tensor<2x2xf32>) -> tensor<2x2xf32> + return %res : tensor<2x2xf32> +} + +// ----- + +// CHECK-LABEL: func @main +// CHECK-SAME: %[[ARG0:.*]]: memref<1x2xcomplex> {xla_lhlo.params = 0 +// CHECK-SAME: %[[ARG1:.*]]: memref<8xi8> +func @main(%value0: tensor<1x2xcomplex>) -> tensor<1x2xf32> { +// CHECK: %[[VIEW:.*]] = {{.*}} memref<8xi8> to memref<1x2xf32> +// CHECK: lhlo.real +// CHECK-SAME: %[[ARG0]], %[[VIEW]] + %res = "xla_hlo.real"(%value0) : (tensor<1x2xcomplex>) -> (tensor<1x2xf32>) + return %res : tensor<1x2xf32> +} + +// ----- + +// CHECK-LABEL: func @main +// CHECK-SAME: %[[ARG0:.*]]: memref<1x2xcomplex> {xla_lhlo.params = 0 +// CHECK-SAME: %[[ARG1:.*]]: memref<8xi8> +func @main(%value0: tensor<1x2xcomplex>) -> tensor<1x2xf32> { +// CHECK: %[[VIEW:.*]] = {{.*}} memref<8xi8> to memref<1x2xf32> +// CHECK: lhlo.imag +// CHECK-SAME: %[[ARG0]], %[[VIEW]] + %res = "xla_hlo.imag"(%value0) : (tensor<1x2xcomplex>) -> (tensor<1x2xf32>) + return %res : tensor<1x2xf32> +} + +// ----- + +// CHECK-LABEL: func @main +// CHECK-SAME: %[[ARG0:.*]]: memref<2x2xi32> {xla_lhlo.params = 0 +// CHECK-SAME: %[[ARG1:.*]]: memref<2x2xi32> {xla_lhlo.params = 1 +// CHECK-SAME: %[[ARG2:.*]]: memref<16xi8> +func @main(%value0: tensor<2x2xi32>, %value1: tensor<2x2xi32>) -> tensor<2x2xi32> { +// CHECK: %[[VIEW:.*]] = {{.*}} memref<16xi8> to memref<2x2xi32> +// CHECK: lhlo.remainder +// CHECK-SAME: %[[ARG0]], %[[ARG1]], %[[VIEW]] +// CHECK-NEXT: return + %res = "xla_hlo.remainder"(%value0, %value1) : (tensor<2x2xi32>, tensor<2x2xi32>) -> tensor<2x2xi32> + return %res : tensor<2x2xi32> +} + +// ----- + +// CHECK-LABEL: func @main +// CHECK-SAME: %[[ARG0:.*]]: memref<2x2xf32> {xla_lhlo.params = 0 +// CHECK-SAME: %[[ARG1:.*]]: memref<16xi8> +func @main(%value0: tensor<2x2xf32>) -> tensor<2x2xf32> { +// CHECK: %[[VIEW:.*]] = {{.*}} memref<16xi8> to memref<2x2xf32> +// CHECK: lhlo.rsqrt +// CHECK-SAME: %[[ARG0]], %[[VIEW]] + %res = "xla_hlo.rsqrt"(%value0) : (tensor<2x2xf32>) -> tensor<2x2xf32> + return %res : tensor<2x2xf32> +} + +// ----- + +// CHECK-LABEL: func @main +// CHECK-SAME: %[[ARG0:.*]]: memref<2x2xi1> {xla_lhlo.params = 0 +// CHECK-SAME: %[[ARG1:.*]]: memref<2x2xf32> {xla_lhlo.params = 1 +// CHECK-SAME: %[[ARG2:.*]]: memref<2x2xf32> {xla_lhlo.params = 2 +// CHECK-SAME: %[[ARG3:.*]]: memref<16xi8> +func @main(%pred: tensor<2x2xi1>, %lhs: tensor<2x2xf32>, %rhs: tensor<2x2xf32>) -> tensor<2x2xf32> { +// CHECK: %[[VIEW:.*]] = 
{{.*}} memref<16xi8> to memref<2x2xf32>
+// CHECK: lhlo.select
+// CHECK-SAME: %[[ARG0]], %[[ARG1]], %[[ARG2]], %[[VIEW]]
+// CHECK-NEXT: return
+  %0 = "xla_hlo.select"(%pred, %lhs, %rhs) : (tensor<2x2xi1>, tensor<2x2xf32>, tensor<2x2xf32>) -> (tensor<2x2xf32>)
+  return %0 : tensor<2x2xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @main
+// CHECK-SAME: %[[ARG0:.*]]: memref<2x2xf32> {xla_lhlo.params = 0
+// CHECK-SAME: %[[ARG1:.*]]: memref<16xi8>
+func @main(%value0: tensor<2x2xf32>) -> tensor<2x2xf32> {
+// CHECK: %[[VIEW:.*]] = {{.*}} memref<16xi8> to memref<2x2xf32>
+// CHECK: lhlo.sign
+// CHECK-SAME: %[[ARG0]], %[[VIEW]]
+  %res = "xla_hlo.sign"(%value0) : (tensor<2x2xf32>) -> tensor<2x2xf32>
+  return %res : tensor<2x2xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @main
+// CHECK-SAME: %[[ARG0:.*]]: memref<2x2xf32> {xla_lhlo.params = 0
+// CHECK-SAME: %[[ARG1:.*]]: memref<16xi8>
+func @main(%value0: tensor<2x2xf32>) -> tensor<2x2xf32> {
+// CHECK: %[[VIEW:.*]] = {{.*}} memref<16xi8> to memref<2x2xf32>
+// CHECK: lhlo.sqrt
+// CHECK-SAME: %[[ARG0]], %[[VIEW]]
+  %res = "xla_hlo.sqrt"(%value0) : (tensor<2x2xf32>) -> tensor<2x2xf32>
+  return %res : tensor<2x2xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @main
+// CHECK-SAME: %[[ARG0:.*]]: memref<2x2xi32> {xla_lhlo.params = 0
+// CHECK-SAME: %[[ARG1:.*]]: memref<2x2xi32> {xla_lhlo.params = 1
+// CHECK-SAME: %[[ARG2:.*]]: memref<16xi8>
+func @main(%value0: tensor<2x2xi32>, %value1: tensor<2x2xi32>) -> tensor<2x2xi32> {
+// CHECK: %[[VIEW:.*]] = {{.*}} memref<16xi8> to memref<2x2xi32>
+// CHECK: lhlo.subtract
+// CHECK-SAME: %[[ARG0]], %[[ARG1]], %[[VIEW]]
+// CHECK-NEXT: return
+  %res = "xla_hlo.subtract"(%value0, %value1) : (tensor<2x2xi32>, tensor<2x2xi32>) -> tensor<2x2xi32>
+  return %res : tensor<2x2xi32>
+}
+
+// -----
+
+// CHECK-LABEL: func @main
+// CHECK-SAME: %[[ARG0:.*]]: memref<2x2xf32> {xla_lhlo.params = 0
+// CHECK-SAME: %[[ARG1:.*]]: memref<16xi8>
+func @main(%value0: tensor<2x2xf32>) -> tensor<2x2xf32> {
+// CHECK: %[[VIEW:.*]] = {{.*}} memref<16xi8> to memref<2x2xf32>
+// CHECK: lhlo.tanh
+// CHECK-SAME: %[[ARG0]], %[[VIEW]]
+  %res = "xla_hlo.tanh"(%value0) : (tensor<2x2xf32>) -> tensor<2x2xf32>
+  return %res : tensor<2x2xf32>
+}
diff --git a/tensorflow/compiler/mlir/xla/tests/hlo_to_lhlo_with_xla/passthrough.mlir b/tensorflow/compiler/mlir/xla/tests/hlo_to_lhlo_with_xla/passthrough.mlir
new file mode 100644
index 00000000000..6a2b68adac3
--- /dev/null
+++ b/tensorflow/compiler/mlir/xla/tests/hlo_to_lhlo_with_xla/passthrough.mlir
@@ -0,0 +1,17 @@
+// RUN: xla-opt -xla-hlo-to-lhlo-with-xla %s | FileCheck --enable-var-scope --dump-input=fail %s
+
+// Current allocation will lead to one buffer argument for the "value" and
+// another one for the output, and no returned values.
+// CHECK-LABEL: func @main
+// CHECK-SAME: %[[ARG0:.*]]: memref<2x2xf32> {xla_lhlo.params = 0 : index},
+// CHECK-SAME: %[[ARG1:.*]]: memref<16xi8> {xla_lhlo.alloc = 0 : index, xla_lhlo.liveout = true}
+// CHECK-SAME: ) {
+func @main(%value: tensor<2x2xf32>) -> tensor<2x2xf32> {
+  // The only expected instruction is a copy from the input into the output.
+  // CHECK: %[[C0:.*]] = constant 0 : index
+  // CHECK: %[[C02:.*]] = constant 0 : index
+  // CHECK: %[[OUTPUT:.*]] = std.view %[[ARG1]][%[[C02]]][] : memref<16xi8> to memref<2x2xf32>
+  // CHECK: xla_lhlo.copy
+  // CHECK-SAME: %[[ARG0]], %[[OUTPUT]]
+  return %value : tensor<2x2xf32>
+}
diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-control-flow.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-control-flow.mlir
index 83c3f765dc3..83880bc8ce9 100644
--- a/tensorflow/compiler/mlir/xla/tests/legalize-control-flow.mlir
+++ b/tensorflow/compiler/mlir/xla/tests/legalize-control-flow.mlir
@@ -35,7 +35,7 @@ func @conditional(%arg0: tensor) -> tensor {
   // CHECK: [[VAL1:%.+]] = extract_element [[VAL0]][] : tensor
   // CHECK: cond_br [[VAL1]], ^bb1(%arg0 : tensor), ^bb2(%arg0 : tensor)
-  %1 = "xla_hlo.conditional"(%0, %arg0, %arg0) ( {
+  %1 = "xla_hlo.if"(%0, %arg0, %arg0) ( {
   ^bb0(%arg1: tensor):
     // CHECK: ^bb1([[VAL2:%.+]]: tensor):
@@ -131,7 +131,7 @@ func @conditional_with_multiple_blocks(%arg0: tensor, %arg1: tensor, %
   // CHECK: ^[[EXIT]](%6: tensor):
   // CHECK:   return %6 : tensor
   // CHECK: }
-  %1 = "xla_hlo.conditional"(%pred, %arg0, %arg1) ( {
+  %1 = "xla_hlo.if"(%pred, %arg0, %arg1) ( {
   ^then_entry(%arg2: tensor):
     br ^then_succ(%arg2: tensor)
   ^then_succ(%0: tensor):
diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf-BatchMatMulV2.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf-BatchMatMulV2.mlir
new file mode 100644
index 00000000000..3605e2a0d5c
--- /dev/null
+++ b/tensorflow/compiler/mlir/xla/tests/legalize-tf-BatchMatMulV2.mlir
@@ -0,0 +1,93 @@
+// RUN: tf-opt -xla-legalize-tf=allow-partial-conversion %s | FileCheck %s --dump-input-on-failure
+
+//===----------------------------------------------------------------------===//
+// tf.BatchMatMulV2 op legalizations.
+//===----------------------------------------------------------------------===// + +func @batchmatmulv2_basic(%arg0: tensor<1x4x2xf32>, %arg1: tensor<3x2x4xf32>) -> tensor<3x4x4xf32> { +// CHECK-LABEL: func @batchmatmulv2_basic +// CHECK-SAME: ([[LHS:%.*]]: tensor<1x4x2xf32>, [[RHS:%.*]]: tensor<3x2x4xf32>) -> tensor<3x4x4xf32> +// CHECK: [[LHSSHAPE:%.*]] = shape.shape_of [[LHS]] : tensor<1x4x2xf32> +// CHECK: [[RHSSHAPE:%.*]] = shape.shape_of [[RHS]] : tensor<3x2x4xf32> +// CHECK: [[CM2:%.*]] = constant -2 : i32 +// CHECK: [[LHSHEAD:%.*]], [[LHSTAIL:%.*]] = "shape.split_at"([[LHSSHAPE]], [[CM2]]) : (!shape.shape, i32) -> (!shape.shape, !shape.shape) +// CHECK: [[RHSHEAD:%.*]], [[RHSTAIL:%.*]] = "shape.split_at"([[RHSSHAPE]], [[CM2]]) : (!shape.shape, i32) -> (!shape.shape, !shape.shape) +// CHECK: [[BCASTHEAD:%.*]] = "shape.broadcast"([[LHSHEAD]], [[RHSHEAD]]) : (!shape.shape, !shape.shape) -> !shape.shape +// CHECK: [[LHSBCASTSHAPE:%.*]] = "shape.concat"([[BCASTHEAD]], [[LHSTAIL]]) : (!shape.shape, !shape.shape) -> !shape.shape +// CHECK: [[LHSSHAPEEXTENTS:%.*]] = "shape.to_extent_tensor"([[LHSBCASTSHAPE]]) : (!shape.shape) -> tensor<3xindex> +// CHECK: [[LHSBCAST:%.*]] = "xla_hlo.dynamic_broadcast_in_dim"([[LHS]], [[LHSSHAPEEXTENTS]]) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<1x4x2xf32>, tensor<3xindex>) -> tensor<3x4x2xf32> +// CHECK: [[RHSBCASTSHAPE:%.*]] = "shape.concat"([[BCASTHEAD]], [[RHSTAIL]]) : (!shape.shape, !shape.shape) -> !shape.shape +// CHECK: [[RHSSHAPEEXTENTS:%.*]] = "shape.to_extent_tensor"([[RHSBCASTSHAPE]]) : (!shape.shape) -> tensor<3xindex> +// CHECK: [[RHSBCAST:%.*]] = "xla_hlo.dynamic_broadcast_in_dim"([[RHS]], [[RHSSHAPEEXTENTS]]) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<3x2x4xf32>, tensor<3xindex>) -> tensor<3x2x4xf32> +// CHECK: [[RESULT:%.*]] = "xla_hlo.dot_general"([[LHSBCAST]], [[RHSBCAST]]) {dot_dimension_numbers = {lhs_batching_dimensions = dense<0> : tensor<1xi64>, lhs_contracting_dimensions = dense<2> : tensor<1xi64>, rhs_batching_dimensions = dense<0> : tensor<1xi64>, rhs_contracting_dimensions = dense<1> : tensor<1xi64>}} : (tensor<3x4x2xf32>, tensor<3x2x4xf32>) -> tensor<3x4x4xf32> +// CHECK: return [[RESULT]] : tensor<3x4x4xf32> +// CHECK: } + + %0 = "tf.BatchMatMulV2"(%arg0, %arg1) {T = f32, adj_x = false, adj_y = false, device = ""} : (tensor<1x4x2xf32>, tensor<3x2x4xf32>) -> tensor<3x4x4xf32> + return %0 : tensor<3x4x4xf32> +} + +func @batchmatmulv2_lhs_batch(%arg0: tensor<3x4x2xf32>, %arg1: tensor<2x4xf32>) -> tensor<3x4x4xf32> { +// CHECK-LABEL: func @batchmatmulv2_lhs_batch +// CHECK: "xla_hlo.dynamic_broadcast_in_dim"({{.*}}, {{.*}}) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} +// CHECK: "xla_hlo.dynamic_broadcast_in_dim"({{.*}}, {{.*}}) {broadcast_dimensions = dense<[1, 2]> : tensor<2xi64>} +// CHECK: "xla_hlo.dot_general"({{.*}}, {{.*}}) {dot_dimension_numbers = { +// CHECK-SAME: lhs_batching_dimensions = dense<0> : tensor<1xi64>, +// CHECK-SAME: lhs_contracting_dimensions = dense<2> : tensor<1xi64>, +// CHECK-SAME: rhs_batching_dimensions = dense<0> : tensor<1xi64>, +// CHECK-SAME: rhs_contracting_dimensions = dense<1> : tensor<1xi64>}} + %0 = "tf.BatchMatMulV2"(%arg0, %arg1) {T = f32, adj_x = false, adj_y = false, device = ""} : (tensor<3x4x2xf32>, tensor<2x4xf32>) -> tensor<3x4x4xf32> + return %0 : tensor<3x4x4xf32> +} + +func @batchmatmulv2_rhs_batch(%arg0: tensor<4x2xf32>, %arg1: tensor<3x2x4xf32>) -> tensor<3x4x4xf32> { +// CHECK-LABEL: func @batchmatmulv2_rhs_batch 
+// CHECK: "xla_hlo.dynamic_broadcast_in_dim"({{.*}}, {{.*}}) {broadcast_dimensions = dense<[1, 2]> : tensor<2xi64>} +// CHECK: "xla_hlo.dynamic_broadcast_in_dim"({{.*}}, {{.*}}) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} +// CHECK: "xla_hlo.dot_general"({{.*}}, {{.*}}) {dot_dimension_numbers = { +// CHECK-SAME: lhs_batching_dimensions = dense<0> : tensor<1xi64>, +// CHECK-SAME: lhs_contracting_dimensions = dense<2> : tensor<1xi64>, +// CHECK-SAME: rhs_batching_dimensions = dense<0> : tensor<1xi64>, +// CHECK-SAME: rhs_contracting_dimensions = dense<1> : tensor<1xi64>}} + %0 = "tf.BatchMatMulV2"(%arg0, %arg1) {T = f32, adj_x = false, adj_y = false, device = ""} : (tensor<4x2xf32>, tensor<3x2x4xf32>) -> tensor<3x4x4xf32> + return %0 : tensor<3x4x4xf32> +} + +func @batchmatmulv2_dynamic(%arg0: tensor, %arg1: tensor) -> tensor { +// CHECK-LABEL: func @batchmatmulv2_dynamic +// CHECK: "xla_hlo.dot_general"({{.*}}, {{.*}}) {dot_dimension_numbers = { +// CHECK-SAME: lhs_batching_dimensions = dense<0> : tensor<1xi64>, +// CHECK-SAME: lhs_contracting_dimensions = dense<2> : tensor<1xi64>, +// CHECK-SAME: rhs_batching_dimensions = dense<0> : tensor<1xi64>, +// CHECK-SAME: rhs_contracting_dimensions = dense<1> : tensor<1xi64>}} + %0 = "tf.BatchMatMulV2"(%arg0, %arg1) {T = f32, adj_x = false, adj_y = false, device = ""} : (tensor, tensor) -> tensor + return %0 : tensor +} + +func @batchmatmulv2_adj_real(%arg0: tensor<5x2xf32>, %arg1: tensor<2x4xf32>) -> tensor<5x4xf32> { +// CHECK-LABEL: func @batchmatmulv2_adj_real +// CHECK: "xla_hlo.dot_general"({{.*}}, {{.*}}) {dot_dimension_numbers = { +// CHECK-SAME: lhs_batching_dimensions = dense<[]> : tensor<0xi64>, +// CHECK-SAME: lhs_contracting_dimensions = dense<0> : tensor<1xi64>, +// CHECK-SAME: rhs_batching_dimensions = dense<[]> : tensor<0xi64>, +// CHECK-SAME: rhs_contracting_dimensions = dense<1> : tensor<1xi64>}} + %0 = "tf.BatchMatMulV2"(%arg0, %arg1) {adj_x = true, adj_y = true, device = ""} : (tensor<5x2xf32>, tensor<2x4xf32>) -> tensor<5x4xf32> + return %0 : tensor<5x4xf32> +} + +func @batchmatmulv2_adj_complex(%arg0: tensor<5x2xcomplex>, %arg1: tensor<2x4xcomplex>) -> tensor<5x4xcomplex> { +// CHECK-LABEL: func @batchmatmulv2_adj_complex( +// CHECK-SAME: [[LHS:%.*]]: tensor<5x2xcomplex>, [[RHS:%.*]]: tensor<2x4xcomplex>) -> tensor<5x4xcomplex> { +// CHECK: [[LHSRE:%.*]] = "xla_hlo.real"([[LHS]]) +// CHECK: [[LHSIM:%.*]] = "xla_hlo.imag"([[LHS]]) +// CHECK: [[LHSIMNEG:%.*]] = "xla_hlo.negate"([[LHSIM]]) +// CHECK: [[LHSCONJ:%.*]] = "xla_hlo.complex"([[LHSRE]], [[LHSIMNEG]]) +// CHECK: [[RHSRE:%.*]] = "xla_hlo.real"([[RHS]]) +// CHECK: [[RHSIM:%.*]] = "xla_hlo.imag"([[RHS]]) +// CHECK: [[RHSIMNEG:%.*]] = "xla_hlo.negate"([[RHSIM]]) +// CHECK: [[RHSCONJ:%.*]] = "xla_hlo.complex"([[RHSRE]], [[RHSIMNEG]]) +// CHECK: shape.shape_of [[LHSCONJ]] +// CHECK: shape.shape_of [[RHSCONJ]] + %0 = "tf.BatchMatMulV2"(%arg0, %arg1) {adj_x = true, adj_y = true, device = ""} : (tensor<5x2xcomplex>, tensor<2x4xcomplex>) -> tensor<5x4xcomplex> + return %0 : tensor<5x4xcomplex> +} diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf-binary-elementwise.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf-binary-elementwise.mlir new file mode 100644 index 00000000000..c114b8c50a5 --- /dev/null +++ b/tensorflow/compiler/mlir/xla/tests/legalize-tf-binary-elementwise.mlir @@ -0,0 +1,334 @@ +// Note that binary elementwise tests are run with chlo legalization enabled +// (unlike the rest), since this is the primary use case for such ops and +// 
verification of shapes and broadcasts is desired. +// RUN: tf-opt "-xla-legalize-tf=allow-partial-conversion legalize-chlo=true" %s | FileCheck %s --dump-input-on-failure + +//===----------------------------------------------------------------------===// +// Binary op legalizations. +// Most of these expand from the same pattern. Full semantics are +// verified for tf.Add and pattern application only for the rest. +//===----------------------------------------------------------------------===// + +// CHECK-LABEL: func @add +func @add(%arg0: tensor<2xi32>) -> tensor<2xi32> { + // CHECK-NEXT: %[[SUM0:.*]] = xla_hlo.add %arg0, %arg0 : tensor<2xi32> + // CHECK-NEXT: %[[SUM1:.*]] = xla_hlo.add %[[SUM0]], %arg0 : tensor<2xi32> + // CHECK-NEXT: return %[[SUM1]] : tensor<2xi32> + %0 = "tf.Add"(%arg0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> + %1 = "tf.AddV2"(%0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> + return %1: tensor<2xi32> +} + +// CHECK-LABEL: func @broadcast_add +// TODO(laurenzo): Change this to a (5 + 2x1) shaped add to make the check +// patterns unambiguous and more interesting (once broadcastable trait is +// fixed upstream). +func @broadcast_add(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi32> { + // CHECK: %[[UNUSED_LHS_SHAPE:.+]] = shape.const_shape [1] + // CHECK: %[[UNUSED_RHS_SHAPE:.+]] = shape.const_shape [1, 2] + // CHECK: %[[RESULT_SHAPE:.+]] = shape.const_shape [1, 2] + // CHECK-DAG: %[[RESULT_EXTENTS:.+]] = "shape.to_extent_tensor"(%[[RESULT_SHAPE]]) + // CHECK-DAG: %[[LHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-DAG: %[[RHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} + // CHECK: xla_hlo.add %[[LHS_BCAST]], %[[RHS_BCAST]] + %0 = "tf.Add"(%arg0, %arg1) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> + return %0: tensor<1x2xi32> +} + +// CHECK-LABEL: func @broadcast_multi_dim_add +// TODO(laurenzo): Change this to a (4x1x1 + 1x4x4x4) shaped add once upstream +// broadcastable bug is fixed (helps make the CHECK matching unambiguous) +func @broadcast_multi_dim_add(%arg0: tensor<4x1x1xi32>, %arg1: tensor<4x4x4x4xi32>) -> tensor<4x4x4x4xi32> { + // CHECK: %[[UNUSED_LHS_SHAPE:.+]] = shape.const_shape [4, 1, 1] + // CHECK: %[[UNUSED_RHS_SHAPE:.+]] = shape.const_shape [4, 4, 4, 4] + // CHECK: %[[RESULT_SHAPE:.+]] = shape.const_shape [4, 4, 4, 4] + // CHECK-DAG: %[[RESULT_EXTENTS:.+]] = "shape.to_extent_tensor"(%[[RESULT_SHAPE]]) + // CHECK-DAG: %[[LHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<[1, 2, 3]> : tensor<3xi64>} + // CHECK-DAG: %[[RHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<[0, 1, 2, 3]> : tensor<4xi64>} + // CHECK: xla_hlo.add %[[LHS_BCAST]], %[[RHS_BCAST]] + %0 = "tf.Add"(%arg0, %arg1) : (tensor<4x1x1xi32>, tensor<4x4x4x4xi32>) -> tensor<4x4x4x4xi32> + return %0: tensor<4x4x4x4xi32> +} + +// CHECK-LABEL: func @add_dynamic +func @add_dynamic(%arg0: tensor, %arg1: tensor) -> tensor { + // CHECK-DAG: %[[LHS_SHAPE:.+]] = shape.shape_of %arg0 + // CHECK-DAG: %[[RHS_SHAPE:.+]] = shape.shape_of %arg1 + // CHECK-DAG: %[[RESULT_SHAPE:.+]] = "shape.broadcast"(%[[LHS_SHAPE]], %[[RHS_SHAPE]]) + // CHECK-DAG: %[[RESULT_EXTENTS:.+]] = "shape.to_extent_tensor"(%[[RESULT_SHAPE]]) + // CHECK-DAG: %[[LHS_BCAST:.+]] = 
"xla_hlo.dynamic_broadcast_in_dim"(%arg0, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-DAG: %[[RHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} + // CHECK: xla_hlo.add %4, %5 : tensor + %0 = "tf.Add"(%arg0, %arg1) : (tensor, tensor) -> tensor + return %0: tensor +} + +// CHECK-LABEL: func @div +func @div(%arg0: tensor<2xi32>) -> tensor<2xi32> { + // CHECK-NEXT: %0 = xla_hlo.divide %arg0, %arg0 : tensor<2xi32> + // CHECK-NEXT: return %0 : tensor<2xi32> + %0 = "tf.Div"(%arg0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> + return %0: tensor<2xi32> +} + +// CHECK-LABEL: func @shift_left +func @shift_left(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi32> { + // CHECK: xla_hlo.shift_left %arg0, %arg1 : tensor<4xi32> + %0 = "tf.LeftShift"(%arg0, %arg1) : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> + return %0 : tensor<4xi32> +} + +// CHECK-LABEL: func @div_unranked +func @div_unranked(%arg0: tensor<*xi32>, %arg1: tensor) -> tensor { + // CHECK: tf.Div + %0 = "tf.Div"(%arg0, %arg1) : (tensor<*xi32>, tensor) -> tensor + return %0: tensor +} + +// CHECK-LABEL: func @maximum +func @maximum(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { + // CHECK: xla_hlo.maximum %arg0, %arg1 : tensor<4xf32> + %0 = "tf.Maximum"(%arg0, %arg1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> + return %0 : tensor<4xf32> +} + +// CHECK-LABEL: func @minimum +func @minimum(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { + // CHECK: xla_hlo.minimum %arg0, %arg1 : tensor<4xf32> + %0 = "tf.Minimum"(%arg0, %arg1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> + return %0 : tensor<4xf32> +} + +// CHECK-LABEL: func @mul +func @mul(%arg0: tensor<2xi32>) -> tensor<2xi32> { + // CHECK-NEXT: %0 = xla_hlo.multiply %arg0, %arg0 : tensor<2xi32> + // CHECK-NEXT: return %0 : tensor<2xi32> + %0 = "tf.Mul"(%arg0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> + return %0: tensor<2xi32> +} + +// CHECK-LABEL: func @real_div +func @real_div(%arg0: tensor<2xi32>) -> tensor<2xi32> { + // CHECK-NEXT: %0 = xla_hlo.divide %arg0, %arg0 : tensor<2xi32> + %0 = "tf.RealDiv"(%arg0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> + return %0: tensor<2xi32> +} + +// CHECK-LABEL: func @sub +func @sub(%arg0: tensor<2xi32>) -> tensor<2xi32> { + // CHECK-NEXT: %0 = xla_hlo.subtract %arg0, %arg0 : tensor<2xi32> + // CHECK-NEXT: return %0 : tensor<2xi32> + %0 = "tf.Sub"(%arg0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> + return %0: tensor<2xi32> +} + +// CHECK-LABEL: func @shift_right +func @shift_right(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi32> { + // CHECK: xla_hlo.shift_right_arithmetic %arg0, %arg1 : tensor<4xi32> + %0 = "tf.RightShift"(%arg0, %arg1) : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> + return %0 : tensor<4xi32> +} + +// CHECK-LABEL: func @shift_right_unsigned +func @shift_right_unsigned(%arg0: tensor<4xui8>, %arg1: tensor<4xui8>) -> tensor<4xui8> { + // CHECK: tf.RightShift + %0 = "tf.RightShift"(%arg0, %arg1) : (tensor<4xui8>, tensor<4xui8>) -> tensor<4xui8> + return %0 : tensor<4xui8> +} + +// CHECK-LABEL: func @broadcast_shift_right_unsigned +func @broadcast_shift_right_unsigned(%arg0: tensor<4xui8>, %arg1: tensor<2x4xui8>) -> tensor<2x4xui8> { + // CHECK: tf.RightShift + %0 = "tf.RightShift"(%arg0, %arg1) : (tensor<4xui8>, tensor<2x4xui8>) -> tensor<2x4xui8> + return %0 : tensor<2x4xui8> +} + +// 
CHECK-LABEL: func @and +func @and(%arg0: tensor<2xi1>) -> tensor<2xi1> { + // CHECK-NEXT: xla_hlo.and + %0 = "tf.LogicalAnd"(%arg0, %arg0) : (tensor<2xi1>, tensor<2xi1>) -> tensor<2xi1> + return %0: tensor<2xi1> +} + +// CHECK-LABEL: func @and_unranked +func @and_unranked(%arg0: tensor<*xi1>, %arg1: tensor<*xi1>) -> tensor<*xi1> { + // CHECK: tf.LogicalAnd + %0 = "tf.LogicalAnd"(%arg0, %arg1) : (tensor<*xi1>, tensor<*xi1>) -> tensor<*xi1> + return %0: tensor<*xi1> +} + +// CHECK-LABEL: func @or +func @or(%arg0: tensor<2xi1>) -> tensor<2xi1> { + // CHECK-NEXT: xla_hlo.or + %0 = "tf.LogicalOr"(%arg0, %arg0) : (tensor<2xi1>, tensor<2xi1>) -> tensor<2xi1> + return %0: tensor<2xi1> +} + +// CHECK-LABEL: func @bitwise_or +func @bitwise_or(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi32> { + // CHECK-NEXT: xla_hlo.or + %0 = "tf.BitwiseOr"(%arg0, %arg1) : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> + return %0: tensor<4xi32> +} + +// CHECK-LABEL: func @bitwise_and +func @bitwise_and(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi32> { + // CHECK-NEXT: xla_hlo.and + %0 = "tf.BitwiseAnd"(%arg0, %arg1) : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> + return %0: tensor<4xi32> +} + +// CHECK-LABEL: func @pow +func @pow(%arg0: tensor<2xf32>) -> tensor<2xf32> { + // CHECK-NEXT: xla_hlo.power + %0 = "tf.Pow"(%arg0, %arg0) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> + return %0: tensor<2xf32> +} + +//===----------------------------------------------------------------------===// +// Equality op legalizations. +// tf.Equal and tf.NotEqual expand from the same pattern. Full semantics are +// verified for tf.Equal and pattern application only for tf.NotEqual +//===----------------------------------------------------------------------===// + +// CHECK-LABEL: func @equal +func @equal(%arg0: tensor<2xi32>) -> tensor<2xi1> { + // CHECK-NEXT: "xla_hlo.compare"(%arg0, %arg0) {comparison_direction = "EQ"} + %0 = "tf.Equal"(%arg0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> + return %0: tensor<2xi1> +} + +// CHECK-LABEL: func @equal_dynamic +func @equal_dynamic(%arg0: tensor, %arg1: tensor<1xi32>) -> tensor { + // CHECK-DAG: %[[LHS_SHAPE:.+]] = shape.shape_of %arg0 + // CHECK-DAG: %[[RHS_SHAPE:.+]] = shape.const_shape [1] + // CHECK-DAG: %[[RESULT_SHAPE:.+]] = "shape.broadcast"(%[[LHS_SHAPE]], %[[RHS_SHAPE]]) + // CHECK-DAG: %[[RESULT_EXTENTS:.+]] = "shape.to_extent_tensor"(%[[RESULT_SHAPE]]) + // CHECK-DAG: %[[LHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} + // CHECK-DAG: %[[RHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} + // CHECK: "xla_hlo.compare"(%[[LHS_BCAST]], %[[RHS_BCAST]]) {comparison_direction = "EQ"} + %0 = "tf.Equal"(%arg0, %arg1) : (tensor, tensor<1xi32>) -> tensor + return %0: tensor +} + +// CHECK-LABEL: func @equal_broadcast +func @equal_broadcast(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { + // CHECK-DAG: %[[LHS_SHAPE:.+]] = shape.const_shape [1] + // CHECK-DAG: %[[RHS_SHAPE:.+]] = shape.const_shape [1, 2] + // CHECK-DAG: %[[RESULT_SHAPE:.+]] = shape.const_shape [1, 2] + // CHECK-DAG: %[[RESULT_EXTENTS:.+]] = "shape.to_extent_tensor"(%[[RESULT_SHAPE]]) + // CHECK-DAG: %[[LHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-DAG: %[[RHS_BCAST:.+]] = 
"xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} + // CHECK: "xla_hlo.compare"(%[[LHS_BCAST]], %[[RHS_BCAST]]) {comparison_direction = "EQ"} + %0 = "tf.Equal"(%arg0, %arg1) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> + return %0: tensor<1x2xi1> +} + +// CHECK-LABEL: func @equal_broadcast_no_incompatible_shapes_error +func @equal_broadcast_no_incompatible_shapes_error(%arg0: tensor<2xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { + // CHECK-NEXT: "tf.Equal"(%arg0, %arg1) {incompatible_shape_error = false} + %0 = "tf.Equal"(%arg0, %arg1) { incompatible_shape_error = false } : (tensor<2xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> + return %0: tensor<1x2xi1> +} + +// CHECK-LABEL: func @equal_incompatible_shape_broadcastable +func @equal_incompatible_shape_broadcastable(%arg0: tensor, %arg1: tensor<1xi32>) -> tensor { + // CHECK-NEXT: "tf.Equal"(%arg0, %arg1) {incompatible_shape_error = false} + %0 = "tf.Equal"(%arg0, %arg1) { incompatible_shape_error = false } : (tensor, tensor<1xi32>) -> tensor + return %0: tensor +} + +// CHECK-LABEL: func @equal_incompatible_shape_dynamic +func @equal_incompatible_shape_dynamic(%arg0: tensor<2xi32>, %arg1: tensor) -> tensor<*xi1> { + // CHECK-NEXT: "tf.Equal"(%arg0, %arg1) {incompatible_shape_error = false} + %0 = "tf.Equal"(%arg0, %arg1) { incompatible_shape_error = false } : (tensor<2xi32>, tensor) -> tensor<*xi1> + return %0: tensor<*xi1> +} + +// CHECK-LABEL: func @equal_incompatible_shape_both_dynamic +func @equal_incompatible_shape_both_dynamic(%arg0: tensor, %arg1: tensor) -> tensor<*xi1> { + // CHECK-NEXT: "tf.Equal"(%arg0, %arg1) {incompatible_shape_error = false} + %0 = "tf.Equal"(%arg0, %arg1) { incompatible_shape_error = false } : (tensor, tensor) -> tensor<*xi1> + return %0: tensor<*xi1> +} + +// CHECK-LABEL: func @equal_unranked +func @equal_unranked(%arg0: tensor<*xi32>, %arg1: tensor<*xi32>) -> tensor<*xi1> { + // CHECK: "tf.Equal" + %0 = "tf.Equal"(%arg0, %arg1) { incompatible_shape_error = false } : (tensor<*xi32>, tensor<*xi32>) -> tensor<*xi1> + return %0: tensor<*xi1> +} + +// CHECK-LABEL: func @notequal +func @notequal(%arg0: tensor<2xi32>) -> tensor<2xi1> { + // CHECK-NEXT: "xla_hlo.compare"(%arg0, %arg0) {comparison_direction = "NE"} + %0 = "tf.NotEqual"(%arg0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> + return %0: tensor<2xi1> +} + +//===----------------------------------------------------------------------===// +// Compare op legalizations. +// These expand from the same pattern. Full semantics are checked for +// tf.Greater. Others just check that the pattern applied. 
+//===----------------------------------------------------------------------===// + +// CHECK-LABEL: func @greater +func @greater(%arg0: tensor<2xi32>) -> tensor<2xi1> { + // CHECK: "xla_hlo.compare"(%arg0, %arg0) {comparison_direction = "GT"} + %0 = "tf.Greater"(%arg0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> + return %0: tensor<2xi1> +} + +// CHECK-LABEL: func @broadcast_greater +func @broadcast_greater(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { + // CHECK-DAG: %[[LHS_SHAPE:.+]] = shape.const_shape [1] + // CHECK-DAG: %[[RHS_SHAPE:.+]] = shape.const_shape [1, 2] + // CHECK-DAG: %[[RESULT_SHAPE:.+]] = shape.const_shape [1, 2] + // CHECK-DAG: %[[RESULT_EXTENTS:.+]] = "shape.to_extent_tensor"(%[[RESULT_SHAPE]]) + // CHECK-DAG: %[[LHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-DAG: %[[RHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} + // CHECK: "xla_hlo.compare"(%[[LHS_BCAST]], %[[RHS_BCAST]]) {comparison_direction = "GT"} + %0 = "tf.Greater"(%arg0, %arg1) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> + return %0: tensor<1x2xi1> +} + +// CHECK-LABEL: func @greater_dynamic +func @greater_dynamic(%arg0: tensor, %arg1: tensor) -> tensor { + // CHECK-DAG: %[[LHS_SHAPE:.+]] = shape.shape_of %arg0 + // CHECK-DAG: %[[RHS_SHAPE:.+]] = shape.shape_of %arg1 + // CHECK-DAG: %[[RESULT_SHAPE:.+]] = "shape.broadcast"(%[[LHS_SHAPE]], %[[RHS_SHAPE]]) + // CHECK-DAG: %[[RESULT_EXTENTS:.+]] = "shape.to_extent_tensor"(%[[RESULT_SHAPE]]) + // CHECK-DAG: %[[LHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} + // CHECK-DAG: %[[RHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} + // CHECK: "xla_hlo.compare"(%[[LHS_BCAST]], %[[RHS_BCAST]]) {comparison_direction = "GT"} + %0 = "tf.Greater"(%arg0, %arg1) : (tensor, tensor) -> tensor + return %0: tensor +} + +// CHECK-LABEL: func @greater_uranked +func @greater_uranked(%arg0: tensor<*xi32>) -> tensor<*xi1> { + // CHECK: "tf.Greater" + %0 = "tf.Greater"(%arg0, %arg0) : (tensor<*xi32>, tensor<*xi32>) -> tensor<*xi1> + return %0: tensor<*xi1> +} + +// CHECK-LABEL: func @greater_equal +func @greater_equal(%arg0: tensor<2xi32>) -> tensor<2xi1> { + // CHECK-NEXT: "xla_hlo.compare"(%arg0, %arg0) {comparison_direction = "GE"} + %0 = "tf.GreaterEqual"(%arg0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> + return %0: tensor<2xi1> +} + +// CHECK-LABEL: func @less +func @less(%arg0: tensor<2xi32>) -> tensor<2xi1> { + // CHECK-NEXT: "xla_hlo.compare"(%arg0, %arg0) {comparison_direction = "LT"} + %0 = "tf.Less"(%arg0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> + return %0: tensor<2xi1> +} + +// CHECK-LABEL: func @less_equal +func @less_equal(%arg0: tensor<2xi32>) -> tensor<2xi1> { + // CHECK-NEXT: "xla_hlo.compare"(%arg0, %arg0) {comparison_direction = "LE"} + %0 = "tf.LessEqual"(%arg0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> + return %0: tensor<2xi1> +} diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf-control-flow.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf-control-flow.mlir index 808d0053416..61f82fcad19 100644 --- a/tensorflow/compiler/mlir/xla/tests/legalize-tf-control-flow.mlir +++ 
b/tensorflow/compiler/mlir/xla/tests/legalize-tf-control-flow.mlir @@ -6,7 +6,7 @@ attributes {tf._input_shapes = ["tfshape$", "tfshape$"]} { // CHECK: [[VAL0:%.+]] = "xla_hlo.compare"(%arg0, %arg1) {comparison_direction = "GT"} : (tensor, tensor) -> tensor %0 = "xla_hlo.compare"(%arg0, %arg1) {comparison_direction = "GT"} : (tensor, tensor) -> tensor // CHECK: [[VAL1:%.+]] = "xla_hlo.tuple"(%arg0, %arg1) - // CHECK: [[VAL2:%.+]] = "xla_hlo.conditional"([[VAL0]], [[VAL1]], [[VAL1]]) ( { + // CHECK: [[VAL2:%.+]] = "xla_hlo.if"([[VAL0]], [[VAL1]], [[VAL1]]) ( { // CHECK: ^bb0(%arg2: tuple, tensor>): // CHECK: [[VAL4:%.+]] = "xla_hlo.get_tuple_element"(%arg2) {index = 0 : i32} // CHECK: [[VAL5:%.+]] = "xla_hlo.get_tuple_element"(%arg2) {index = 1 : i32} @@ -21,7 +21,7 @@ attributes {tf._input_shapes = ["tfshape$", "tfshape$"]} { // CHECK: [[VAL7:%.+]] = "xla_hlo.tuple"([[VAL6]]) // CHECK: "xla_hlo.return"([[VAL7]]) : (tuple>) -> () // CHECK: }) - %1 = "tf.If"(%0, %arg0, %arg1) {Tcond = "tfdtype$DT_BOOL", Tin = ["tfdtype$DT_FLOAT", "tfdtype$DT_FLOAT"], Tout = ["tfdtype$DT_FLOAT"], _lower_using_switch_merge = true, _output_shapes = ["tfshape$"], device = "", else_branch = @cond_false, is_stateless = true, name = "cond", output_shapes = ["tfshape$"], then_branch = @cond_true} : (tensor, tensor, tensor) -> tensor + %1 = "tf.If"(%0, %arg0, %arg1) {Tcond = "tfdtype$DT_BOOL", Tin = ["tfdtype$DT_FLOAT", "tfdtype$DT_FLOAT"], Tout = ["tfdtype$DT_FLOAT"], _lower_using_switch_merge = true, _output_shapes = ["tfshape$"], device = "", else_branch = @cond_false, is_stateless = true, name = "cond", output_shapes = [#tf.shape<>], then_branch = @cond_true} : (tensor, tensor, tensor) -> tensor // CHECK: [[VAL3:%.+]] = "xla_hlo.get_tuple_element"([[VAL2]]) {index = 0 : i32} // CHECK: return [[VAL3]] @@ -68,7 +68,7 @@ attributes {tf._input_shapes = ["tfshape$"]} { // CHECK: [[VAL5:%.+]] = "xla_hlo.get_tuple_element"([[VAL3]]) {index = 1 : i32} // CHECK: [[VAL6:%.+]] = "xla_hlo.get_tuple_element"([[VAL3]]) {index = 2 : i32} // CHECK: return [[VAL6]] - %2:3 = "tf.While"(%0, %1, %0) {T = ["tfdtype$DT_INT32", "tfdtype$DT_INT32", "tfdtype$DT_INT32"], _lower_using_switch_merge = true, _num_original_outputs = 3 : i64, _output_shapes = ["tfshape$", "tfshape$", "tfshape$"], body = @while_body, cond = @while_cond, device = "", is_stateless = true, name = "while", output_shapes = ["tfshape$", "tfshape$", "tfshape$"], parallel_iterations = 10 : i64} : (tensor, tensor, tensor) -> (tensor, tensor, tensor) + %2:3 = "tf.While"(%0, %1, %0) {T = ["tfdtype$DT_INT32", "tfdtype$DT_INT32", "tfdtype$DT_INT32"], _lower_using_switch_merge = true, _num_original_outputs = 3 : i64, _output_shapes = ["tfshape$", "tfshape$", "tfshape$"], body = @while_body, cond = @while_cond, device = "", is_stateless = true, name = "while", output_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], parallel_iterations = 10 : i64} : (tensor, tensor, tensor) -> (tensor, tensor, tensor) return %2#2 : tensor } func @while_cond(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf-full-conversion.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf-full-conversion.mlir index d2b4d269fef..0660af4ed1c 100644 --- a/tensorflow/compiler/mlir/xla/tests/legalize-tf-full-conversion.mlir +++ b/tensorflow/compiler/mlir/xla/tests/legalize-tf-full-conversion.mlir @@ -1,22 +1,24 @@ // RUN: tf-opt %s -xla-legalize-tf -split-input-file -verify-diagnostics +// expected-error@below{{The following operations cannot be 
legalized: tf.NoOp (count: 1); tf_executor.fetch (count: 1); tf_executor.graph (count: 1); tf_executor.island (count: 1); tf_executor.yield (count: 1). These legalization failure(s) may be due to missing TF to HLO lowerings and/or unsupported attributes, etc.}} +// expected-error@below{{Emitting more detail about one op that failed to legalize...}} func @tf_executor_graph_op() { - // expected-error@+1 {{failed to legalize operation 'tf_executor.graph'}} tf_executor.graph { %0 = tf_executor.island { + // expected-error@+1 {{'tf.NoOp' op is not legalizable}} "tf.NoOp"() {} : () -> () tf_executor.yield } tf_executor.fetch } return - } // ----- +// expected-error@below{{The following operations cannot be legalized: tf.OpA (count: 1). These legalization failure(s) may be due to missing TF to HLO lowerings and/or unsupported attributes, etc.}} func @tf_unknown_op(%arg0: tensor<2xi32>) -> tensor<2xi32> { - // expected-error@+1 {{failed to legalize operation 'tf.OpA'}} + // expected-error@+1 {{'tf.OpA' op is not legalizable}} %0 = "tf.OpA"(%arg0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> return %0: tensor<2xi32> } @@ -27,3 +29,16 @@ func @tf_known_op(%arg0: tensor<2xi32>) -> tensor<2xi32> { %0 = "tf.Add"(%arg0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> return %0: tensor<2xi32> } + +// ----- + +// expected-error@below{{The following operations cannot be legalized: tf.OpA (count: 1); tf.OpB (count: 2). These legalization failure(s) may be due to missing TF to HLO lowerings and/or unsupported attributes, etc.}} +// expected-error@below{{Emitting more detail about one op that failed to legalize...}} +func @tf_unknown_known_mix(%arg0: tensor<2xi32>) -> tensor<2xi32> { + // expected-error@+1 {{'tf.OpA' op is not legalizable}} + %0 = "tf.OpA"(%arg0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> + %1 = "tf.OpB"(%0, %0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> + %2 = "tf.Add"(%1, %1) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> + %3 = "tf.OpB"(%2, %2) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> + return %2: tensor<2xi32> +} diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf-with-tf2xla.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf-with-tf2xla.mlir index 2fed18cb917..e8d5cfe997d 100644 --- a/tensorflow/compiler/mlir/xla/tests/legalize-tf-with-tf2xla.mlir +++ b/tensorflow/compiler/mlir/xla/tests/legalize-tf-with-tf2xla.mlir @@ -23,6 +23,15 @@ func @unknown_op(%arg0: tensor<2xf32>) -> tensor<2xf32> { return %0 : tensor<2xf32> } +// CHECK-LABEL: not_whitelisted_op +func @not_whitelisted_op(%arg0: tensor<3xi32>, %arg1: tensor, %arg2: tensor) -> tensor { + // CHECK: tf.TensorListReserve + %0 = "tf.TensorListReserve"(%arg0, %arg1) : (tensor<3xi32>, tensor) -> tensor>> + // CHECK: tf.TensorListGetItem + %1 = "tf.TensorListGetItem"(%0, %arg2, %arg0) : (tensor>>, tensor, tensor<3xi32>) -> tensor + return %1 : tensor +} + // CHECK-LABEL: unranked_operand func @unranked_operand(%arg0: tensor<*xf32>) -> tensor<*xf32> { // CHECK: tf.Abs @@ -41,6 +50,15 @@ func @dynamic_operand(%arg0: tensor) -> tensor { return %0 : tensor } +// CHECK-LABEL: unsupported_dtype +func @unsupported_dtype(%arg0: tensor<2x!tf.variant>) -> tensor<2x!tf.variant> { + // CHECK: tf.AddN + // expected-remark@+1 {{unsupported type: tensor<2x!tf.variant>}} + %0 = "tf.AddN"(%arg0, %arg0) : (tensor<2x!tf.variant>, tensor<2x!tf.variant>) -> tensor<2x!tf.variant> + + return %0 : tensor<2x!tf.variant> +} + // CHECK-LABEL: multiple_dialect_ops func @multiple_dialect_ops(%arg0: 
tensor<2xf32>) -> tensor<2xf32> { // CHECK: xla_hlo.negate @@ -106,12 +124,68 @@ func @greater(%arg0: tensor<2xi32>) -> tensor<2xi1> { return %0: tensor<2xi1> } -// TODO(hinsu): Add a test with variant type once one of the ops supporting -// the type is whitelisted. It should be rejected with unsupported type remark. +// CHECK-LABEL: func @const_inputs +// CHECK-SAME: (%[[ARG0:.*]]: tensor<2x2xf64>, %[[ARG1:.*]]: tensor, +func @const_inputs(%arg0: tensor<2x2xf64>, %arg1: tensor, %arg2: tensor<2xi32>, %arg3: tensor<2xi32>, %arg4: tensor<2xi32>) -> tensor<6x5xf64> { -// TODO(hinsu): Add a test with uint8 type once one of the ops supporting the -// type is whitelisted. Unsigned types are not yet added to the HLO dialect so -// it should return an error. See b/130356985 + // CHECK: "xla_hlo.pad"(%[[ARG0]], %[[ARG1]]) + // CHECK-SAME-DAG: edge_padding_high = dense<[1, 2]> : tensor<2xi64> + // CHECK-SAME-DAG: edge_padding_low = dense<[2, 1]> : tensor<2xi64> + // CHECK-SAME-DAG: interior_padding = dense<[1, 0]> : tensor<2xi64> + + %0 = xla_hlo.constant dense<[2, 1]> : tensor<2xi32> + %1 = xla_hlo.constant dense<[1, 2]> : tensor<2xi32> + %2 = xla_hlo.constant dense<[1, 0]> : tensor<2xi32> + %3 = "tf.XlaPad"(%arg0, %arg1, %0, %1, %2) : (tensor<2x2xf64>, tensor, tensor<2xi32>, tensor<2xi32>, tensor<2xi32>) -> tensor<6x5xf64> + return %3 : tensor<6x5xf64> +} + +func @non_const_inputs(%arg0: tensor<2x2xf64>, %arg1: tensor, %arg2: tensor<2xi32>, %arg3: tensor<2xi32>, %arg4: tensor<2xi32>) -> tensor<6x5xf64> { + // expected-remark@+1 {{lowering requires operand #2 to be a constant}} + %0 = "tf.XlaPad"(%arg0, %arg1, %arg2, %arg3, %arg4) : (tensor<2x2xf64>, tensor, tensor<2xi32>, tensor<2xi32>, tensor<2xi32>) -> tensor<6x5xf64> + return %0 : tensor<6x5xf64> +} + +// CHECK-LABEL: dynamic_result_type +func @dynamic_result_type(%arg0: tensor<2xf32>) -> tensor<*xf32> { + // CHECK: %[[RESULT:.*]] = "xla_hlo.abs"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> + // CHECK: tensor_cast %0 : tensor<2xf32> to tensor<*xf32> + %0 = "tf.Abs"(%arg0) : (tensor<2xf32>) -> tensor<*xf32> + + // return %[[RESULT]] + return %0 : tensor<*xf32> +} + +func @truncated_normal() -> tensor<2x2xf32> { + // CHECK-NOT: tf.TruncatedNormal + %0 = xla_hlo.constant dense<[2, 2]> : tensor<2xi32> + %1 = "tf.TruncatedNormal"(%0) {T = i32, device = "", dtype = f32, seed = 0 : i64, seed2 = 1950157571 : i64} : (tensor<2xi32>) -> tensor<2x2xf32> + return %1 : tensor<2x2xf32> +} + +// CHECK-LABEL: dynamic_update_slice +// CHECK-SAME: (%[[ARG0:.*]]: tensor<3x4xi32>, %[[ARG1:.*]]: tensor<2x2xi32>, %[[ARG2:.*]]: tensor<2xi32> +func @dynamic_update_slice(%arg0: tensor<3x4xi32>, %arg1: tensor<2x2xi32>, %arg2: tensor<2xi32>) -> tensor<3x4xi32> { + + // CHECK: %[[SLICE0:.*]] = "xla_hlo.slice"(%[[ARG2]]) + // CHECK-DAG-SAME: start_indices = dense<0> : tensor<1xi64> + // CHECK-DAG-SAME: limit_indices = dense<1> : tensor<1xi64> + // CHECK-DAG-SAME: strides = dense<1> : tensor<1xi64> + // CHECK-SAME: (tensor<2xi32>) -> tensor<1xi32> + // CHECK: %[[DIM0:.*]] = "xla_hlo.reshape"(%[[SLICE0]]) : (tensor<1xi32>) -> tensor + + // CHECK: %[[SLICE1:.*]] = "xla_hlo.slice"(%[[ARG2]]) + // CHECK-DAG-SAME: start_indices = dense<1> : tensor<1xi64> + // CHECK-DAG-SAME: limit_indices = dense<2> : tensor<1xi64> + // CHECK-DAG-SAME: strides = dense<1> : tensor<1xi64> + // CHECK-SAME: (tensor<2xi32>) -> tensor<1xi32> + // CHECK: %[[DIM1:.*]] = "xla_hlo.reshape"(%[[SLICE1]]) : (tensor<1xi32>) -> tensor + + // CHECK: "xla_hlo.dynamic-update-slice"(%[[ARG0]], %[[ARG1]], %[[DIM0]], 
%[[DIM1]]) + + %0 = "tf.XlaDynamicUpdateSlice"(%arg0, %arg1, %arg2) : (tensor<3x4xi32>, tensor<2x2xi32>, tensor<2xi32>) -> tensor<3x4xi32> + return %0: tensor<3x4xi32> +} // TODO(hinsu): Add a test with a valid TF op for which tf2xla kernel is // available but doesn't support this instance. diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir index 2b1c9172f70..2288e0fefc4 100644 --- a/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir +++ b/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir @@ -1,4 +1,11 @@ -// RUN: tf-opt -xla-legalize-tf=allow-partial-conversion %s | FileCheck %s --dump-input-on-failure +// RUN: tf-opt "-xla-legalize-tf=allow-partial-conversion legalize-chlo=false" %s | FileCheck %s --dump-input-on-failure +// RUN: tf-opt "-xla-legalize-tf=allow-partial-conversion legalize-chlo=true" -verify-diagnostics %s +// This test runs twice: +// 1. Through FileCheck with chlo legalization disabled since verifying +// that the chlo ops emit produces more useful tests. +// 2. With chlo legalization enabled, verifying diagnostics to pick up any +// issues with the full lowering (can catch some broadcasting corner +// cases which emit with a warning). //===----------------------------------------------------------------------===// // BatchNorm op legalizations. @@ -27,30 +34,68 @@ func @fusedBatchNormV3_noTraining(%arg0: tensor<8x8x8x8xf32>, %arg1: tensor<8xf3 } // CHECK-LABEL: fusedBatchNormV3_noTraining_mixedPrecision -func @fusedBatchNormV3_noTraining_mixedPrecision(%arg0: tensor<8x8x8x8xbf16>, %arg1: tensor<8xf32>, %arg2: tensor<8xf32>, %arg3: tensor<8xf32>, %arg4: tensor<8xf32>) -> (tensor<8x8x8x8xbf16>) { - // CHECK: %[[RESULT0:.*]] = "xla_hlo.convert"(%arg0) : (tensor<8x8x8x8xbf16>) -> tensor<8x8x8x8xf32> - // CHECK: %[[RESULT1:.*]] = "xla_hlo.batch_norm_inference"(%[[RESULT0]], %arg1, %arg2, %arg3, %arg4) {epsilon = 1.000000e-03 : f32, feature_index = 3 : i64} : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) -> tensor<8x8x8x8xf32> - %0:6 = "tf.FusedBatchNormV3"(%arg0, %arg1, %arg2, %arg3, %arg4) {T = "tfdtype$DT_FLOAT", data_format = "NHWC", epsilon = 0.001 : f32, is_training = false} : (tensor<8x8x8x8xbf16>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) -> (tensor<8x8x8x8xbf16>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) - // CHECK-NEXT: "xla_hlo.convert"(%[[RESULT1]]) : (tensor<8x8x8x8xf32>) -> tensor<8x8x8x8xbf16> - return %0#0 : tensor<8x8x8x8xbf16> +// CHECK-SAME: ([[X:%.*]]: tensor<8x8x8x8xbf16>, [[SCALE:%.*]]: tensor<8xf32>, [[OFFSET:%.*]]: tensor<8xf32>, [[MEAN:%.*]]: tensor<8xf32>, [[VARIANCE:%.*]]: tensor<8xf32>) +func @fusedBatchNormV3_noTraining_mixedPrecision(%arg0: tensor<8x8x8x8xbf16>, %arg1: tensor<8xf32>, %arg2: tensor<8xf32>, %arg3: tensor<8xf32>, %arg4: tensor<8xf32>) -> (tensor<8x8x8x8xbf16>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<*xf32>) { + // CHECK: [[CONVERT_X:%.*]] = "xla_hlo.convert"([[X]]) : (tensor<8x8x8x8xbf16>) -> tensor<8x8x8x8xf32> + // CHECK: [[Y:%.*]] = "xla_hlo.batch_norm_inference"([[CONVERT_X]], [[SCALE]], [[OFFSET]], [[MEAN]], [[VARIANCE]]) {epsilon = 1.000000e-03 : f32, feature_index = 3 : i64} + %0:6 = "tf.FusedBatchNormV3"(%arg0, %arg1, %arg2, %arg3, %arg4) {T = "tfdtype$DT_FLOAT", data_format = "NHWC", epsilon = 0.001 : f32, is_training = false} : (tensor<8x8x8x8xbf16>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) -> (tensor<8x8x8x8xbf16>, 
tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<*xf32>) + // CHECK: [[Y_CONVERT:%.*]] = "xla_hlo.convert"([[Y]]) : (tensor<8x8x8x8xf32>) -> tensor<8x8x8x8xbf16> + // CHECK: [[DUMMY:%.*]] = xla_hlo.constant dense<0.000000e+00> : tensor<0xf32> + // CHECK: [[DUMMY_CAST:%.*]] = tensor_cast [[DUMMY]] : tensor<0xf32> to tensor<*xf32> + // CHECK: return [[Y_CONVERT]], [[MEAN]], [[VARIANCE]], [[MEAN]], [[VARIANCE]], [[DUMMY_CAST]] + return %0#0, %0#1, %0#2, %0#3, %0#4, %0#5 : tensor<8x8x8x8xbf16>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<*xf32> } // CHECK-LABEL: fusedBatchNormV3_training func @fusedBatchNormV3_training(%arg0: tensor<8x8x8x8xf32>, %arg1: tensor<8xf32>, %arg2: tensor<8xf32>, %arg3: tensor<8xf32>, %arg4: tensor<8xf32>) -> (tensor<8x8x8x8xf32>) { // CHECK: %[[RESULT0:.*]] = "xla_hlo.batch_norm_training"({{.*}}, %arg1, %arg2) {epsilon = 1.000000e-03 : f32, feature_index = 3 : i64} : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>) -> tuple, tensor<8xf32>, tensor<8xf32>> - %0:6 = "tf.FusedBatchNormV3"(%arg0, %arg1, %arg2, %arg3, %arg4) {T = "tfdtype$DT_FLOAT", data_format = "NHWC", epsilon = 0.001 : f32, is_training = true} : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) + %0:6 = "tf.FusedBatchNormV3"(%arg0, %arg1, %arg2, %arg3, %arg4) {T = "tfdtype$DT_FLOAT", data_format = "NHWC", epsilon = 0.001 : f32, exponential_avg_factor = 1.0 : f32, is_training = true} : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) // CHECK: "xla_hlo.get_tuple_element"(%[[RESULT0]]) {index = 0 : i32} : (tuple, tensor<8xf32>, tensor<8xf32>>) -> tensor<8x8x8x8xf32> // CHECK: "xla_hlo.get_tuple_element"(%[[RESULT0]]) {index = 1 : i32} : (tuple, tensor<8xf32>, tensor<8xf32>>) -> tensor<8xf32> // CHECK: %[[VAR:.*]] = "xla_hlo.get_tuple_element"(%[[RESULT0]]) {index = 2 : i32} : (tuple, tensor<8xf32>, tensor<8xf32>>) -> tensor<8xf32> // CHECK: xla_hlo.constant - // CHECK: "xla_hlo.multiply"(%[[VAR]], {{.*}}) : (tensor<8xf32>, tensor) -> tensor<8xf32> + // CHECK: xla_chlo.broadcast_multiply %[[VAR]], {{.*}} : (tensor<8xf32>, tensor) -> tensor<8xf32> return %0#0 : tensor<8x8x8x8xf32> } +// CHECK-LABEL: func @fusedBatchNormV3_training_batchVariance +func @fusedBatchNormV3_training_batchVariance(%arg0: tensor<8x8x8x8xf32>, %arg1: tensor<8xf32>, %arg2: tensor<8xf32>, %arg3: tensor<8xf32>, %arg4: tensor<8xf32>) -> tensor<8xf32> { + // CHECK: %[[RESULT0:.*]] = "xla_hlo.batch_norm_training"({{.*}}, %arg1, %arg2) {epsilon = 1.000000e-03 : f32, feature_index = 3 : i64} : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>) -> tuple, tensor<8xf32>, tensor<8xf32>> + %0:6 = "tf.FusedBatchNormV3"(%arg0, %arg1, %arg2, %arg3, %arg4) {T = "tfdtype$DT_FLOAT", data_format = "NHWC", epsilon = 0.001 : f32, exponential_avg_factor = 1.0 : f32, is_training = true} : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) + // CHECK: %[[VAR:.*]] = "xla_hlo.get_tuple_element"(%[[RESULT0]]) {index = 2 : i32} : (tuple, tensor<8xf32>, tensor<8xf32>>) -> tensor<8xf32> + // CHECK: return %[[VAR]] + return %0#4 : tensor<8xf32> +} + +// CHECK-LABEL: fusedBatchNormV3_training_exponentialAvgFactor +func 
@fusedBatchNormV3_training_exponentialAvgFactor(%arg0: tensor<8x8x8x8xf32>, %arg1: tensor<8xf32>, %arg2: tensor<8xf32>, %arg3: tensor<8xf32>, %arg4: tensor<8xf32>) -> (tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) { + // CHECK: %[[RESULT0:.*]] = "xla_hlo.batch_norm_training"({{.*}}, %arg1, %arg2) {epsilon = 1.000000e-03 : f32, feature_index = 3 : i64} : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>) -> tuple, tensor<8xf32>, tensor<8xf32>> + %0:6 = "tf.FusedBatchNormV3"(%arg0, %arg1, %arg2, %arg3, %arg4) {T = "tfdtype$DT_FLOAT", data_format = "NHWC", epsilon = 0.001 : f32, exponential_avg_factor = 0.8 : f32, is_training = true} : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) + // CHECK-DAG: %[[BATCH_MEAN:.*]] = "xla_hlo.get_tuple_element"(%[[RESULT0]]) {index = 1 : i32} + // CHECK-DAG: %[[BATCH_VAR:.*]] = "xla_hlo.get_tuple_element"(%[[RESULT0]]) {index = 2 : i32} + + // CHECK: %[[FACTOR:.*]] = xla_hlo.constant dense<1.00195694> + // CHECK: %[[CORRECTED_VAR:.*]] = xla_chlo.broadcast_multiply %[[BATCH_VAR]], %[[FACTOR]] + + // CHECK-DAG: %[[ALPHA:.*]] = xla_hlo.constant dense<0.199999988> + // CHECK-DAG: %[[BETA:.*]] = xla_hlo.constant dense<8.000000e-01> + + // CHECK: %[[ALPHA_MUL_OLD_MEAN:.*]] = xla_chlo.broadcast_multiply %[[ALPHA]], %arg3 + // CHECK: %[[BETA_MUL_BATCH_MEAN:.*]] = xla_chlo.broadcast_multiply %[[BETA]], %[[BATCH_MEAN]] + // CHECK: %[[NEW_BATCH_MEAN:.*]] = xla_chlo.broadcast_add %[[ALPHA_MUL_OLD_MEAN]], %[[BETA_MUL_BATCH_MEAN]] + + // CHECK: %[[ALPHA_MUL_OLD_VAR:.*]] = xla_chlo.broadcast_multiply %[[ALPHA]], %arg4 + // CHECK: %[[BETA_MUL_CORRECTED_VAR:.*]] = xla_chlo.broadcast_multiply %[[BETA]], %[[CORRECTED_VAR]] + // CHECK: %[[NEW_BATCH_VAR:.*]] = xla_chlo.broadcast_add %[[ALPHA_MUL_OLD_VAR]], %[[BETA_MUL_CORRECTED_VAR]] + + // CHECK: return %[[NEW_BATCH_MEAN]], %[[NEW_BATCH_VAR]], %[[BATCH_MEAN]], %[[BATCH_VAR]] + return %0#1, %0#2, %0#3, %0#4 : tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32> +} + // CHECK-LABEL: fusedBatchNormV3_training_mixedPrecision func @fusedBatchNormV3_training_mixedPrecision(%arg0: tensor<8x8x8x8xbf16>, %arg1: tensor<8xf32>, %arg2: tensor<8xf32>, %arg3: tensor<8xf32>, %arg4: tensor<8xf32>) -> (tensor<8x8x8x8xbf16>) { // CHECK: "xla_hlo.convert"(%arg0) : (tensor<8x8x8x8xbf16>) -> tensor<8x8x8x8xf32> - %0:6 = "tf.FusedBatchNormV3"(%arg0, %arg1, %arg2, %arg3, %arg4) {T = "tfdtype$DT_FLOAT", data_format = "NHWC", epsilon = 0.001 : f32, is_training = true} : (tensor<8x8x8x8xbf16>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) -> (tensor<8x8x8x8xbf16>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) + %0:6 = "tf.FusedBatchNormV3"(%arg0, %arg1, %arg2, %arg3, %arg4) {T = "tfdtype$DT_FLOAT", data_format = "NHWC", epsilon = 0.001 : f32, exponential_avg_factor = 1.0 : f32, is_training = true} : (tensor<8x8x8x8xbf16>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) -> (tensor<8x8x8x8xbf16>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) // CHECK: "xla_hlo.convert"({{.*}}) : (tensor<8x8x8x8xf32>) -> tensor<8x8x8x8xbf16> return %0#0 : tensor<8x8x8x8xbf16> } @@ -58,28 +103,28 @@ func @fusedBatchNormV3_training_mixedPrecision(%arg0: tensor<8x8x8x8xbf16>, %arg // CHECK-LABEL: fusedBatchNormV3_NCHW func @fusedBatchNormV3_NCHW(%arg0: tensor<8x8x8x8xf32>, %arg1: tensor<8xf32>, %arg2: tensor<8xf32>, %arg3: tensor<8xf32>, 
%arg4: tensor<8xf32>) -> (tensor<8x8x8x8xf32>) { // CHECK: "xla_hlo.batch_norm_training"({{.*}}, %arg1, %arg2) {epsilon = 1.000000e-03 : f32, feature_index = 1 : i64} : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>) -> tuple, tensor<8xf32>, tensor<8xf32>> - %0:6 = "tf.FusedBatchNormV3"(%arg0, %arg1, %arg2, %arg3, %arg4) {T = "tfdtype$DT_FLOAT", data_format = "NCHW", epsilon = 0.001 : f32, is_training = true} : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) + %0:6 = "tf.FusedBatchNormV3"(%arg0, %arg1, %arg2, %arg3, %arg4) {T = "tfdtype$DT_FLOAT", data_format = "NCHW", epsilon = 0.001 : f32, exponential_avg_factor = 1.0 : f32, is_training = true} : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) return %0#0 : tensor<8x8x8x8xf32> } // CHECK-LABEL: fusedBatchNormV3_noTraining_dynamic_supported func @fusedBatchNormV3_noTraining_dynamic_supported(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor, %arg4: tensor) -> (tensor) { // CHECK: "xla_hlo.batch_norm_inference"({{.*}}, %arg1, %arg2, %arg3, %arg4) {epsilon = 1.000000e-03 : f32, feature_index = 1 : i64} : (tensor, tensor, tensor, tensor, tensor) -> tensor - %0:6 = "tf.FusedBatchNormV3"(%arg0, %arg1, %arg2, %arg3, %arg4) {T = "tfdtype$DT_FLOAT", data_format = "NCHW", epsilon = 0.001 : f32, is_training = false} : (tensor, tensor, tensor, tensor, tensor) -> (tensor, tensor, tensor, tensor, tensor, tensor) + %0:6 = "tf.FusedBatchNormV3"(%arg0, %arg1, %arg2, %arg3, %arg4) {T = "tfdtype$DT_FLOAT", data_format = "NCHW", epsilon = 0.001 : f32, exponential_avg_factor = 1.0 : f32, is_training = false} : (tensor, tensor, tensor, tensor, tensor) -> (tensor, tensor, tensor, tensor, tensor, tensor) return %0#0 : tensor } // CHECK-LABEL: fusedBatchNormV3_training_dynamic_unsupported1 func @fusedBatchNormV3_training_dynamic_unsupported1(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor, %arg4: tensor) -> (tensor) { // CHECK: tf.FusedBatchNormV3 - %0:6 = "tf.FusedBatchNormV3"(%arg0, %arg1, %arg2, %arg3, %arg4) {T = "tfdtype$DT_FLOAT", data_format = "NCHW", epsilon = 0.001 : f32, is_training = true} : (tensor, tensor, tensor, tensor, tensor) -> (tensor, tensor, tensor, tensor, tensor, tensor) + %0:6 = "tf.FusedBatchNormV3"(%arg0, %arg1, %arg2, %arg3, %arg4) {T = "tfdtype$DT_FLOAT", data_format = "NCHW", epsilon = 0.001 : f32, exponential_avg_factor = 1.0 : f32, is_training = true} : (tensor, tensor, tensor, tensor, tensor) -> (tensor, tensor, tensor, tensor, tensor, tensor) return %0#0 : tensor } // CHECK-LABEL: fusedBatchNormV3_training_dynamic_unsupported2 func @fusedBatchNormV3_training_dynamic_unsupported2(%arg0: tensor, %arg1: tensor<6xf32>, %arg2: tensor<6xf32>, %arg3: tensor<6xf32>, %arg4: tensor<6xf32>) -> (tensor) { // CHECK: tf.FusedBatchNormV3 - %0:6 = "tf.FusedBatchNormV3"(%arg0, %arg1, %arg2, %arg3, %arg4) {T = "tfdtype$DT_FLOAT", data_format = "NCHW", epsilon = 0.001 : f32, is_training = true} : (tensor, tensor<6xf32>, tensor<6xf32>, tensor<6xf32>, tensor<6xf32>) -> (tensor, tensor<6xf32>, tensor<6xf32>, tensor<6xf32>, tensor<6xf32>, tensor<6xf32>) + %0:6 = "tf.FusedBatchNormV3"(%arg0, %arg1, %arg2, %arg3, %arg4) {T = "tfdtype$DT_FLOAT", data_format = "NCHW", epsilon = 0.001 : f32, exponential_avg_factor = 1.0 : f32, is_training = true} : (tensor, 
tensor<6xf32>, tensor<6xf32>, tensor<6xf32>, tensor<6xf32>) -> (tensor, tensor<6xf32>, tensor<6xf32>, tensor<6xf32>, tensor<6xf32>, tensor<6xf32>) return %0#0 : tensor } @@ -89,11 +134,12 @@ func @fusedBatchNormGrad_noTraining(%arg0: tensor<8x8x8x8xf32>, %arg1: tensor<8x // CHECK-NEXT: %[[act:.*]] = "xla_hlo.convert"(%arg1) : (tensor<8x8x8x8xf32>) -> tensor<8x8x8x8xf32> // CHECK-NEXT: %[[eps:.*]] = xla_hlo.constant dense<1.000000e-03> : tensor - // CHECK-NEXT: %[[add:.*]] = "xla_hlo.add"(%arg4, %[[eps]]) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor<8xf32>, tensor) -> tensor<8xf32> + // CHECK-NEXT: %[[add:.*]] = xla_chlo.broadcast_add %arg4, %[[eps]] {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor<8xf32>, tensor) -> tensor<8xf32> // CHECK-NEXT: %[[scr1:.*]] = "xla_hlo.rsqrt"(%[[add]]) : (tensor<8xf32>) -> tensor<8xf32> - // CHECK-NEXT: %[[sub:.*]] = "xla_hlo.subtract"(%[[act]], %arg3) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<8x8x8x8xf32>, tensor<8xf32>) -> tensor<8x8x8x8xf32> - // CHECK-NEXT: %[[mul:.*]] = xla_hlo.multiply %[[grad]], %[[sub]] {broadcast_dimensions = dense<[]> : tensor<0xi64>} : tensor<8x8x8x8xf32> + // CHECK: %[[bcast_arg3:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg3, {{.*}}) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<8xf32>, tensor<4xindex>) -> tensor<8x8x8x8xf32> + // CHECK-NEXT: %[[sub:.*]] = xla_hlo.subtract %[[act]], %[[bcast_arg3]] : tensor<8x8x8x8xf32> + // CHECK-NEXT: %[[mul:.*]] = xla_hlo.multiply %[[grad]], %[[sub]] : tensor<8x8x8x8xf32> // CHECK-NEXT: xla_hlo.constant dense<[0, 1, 2]> : tensor<3xi64> // CHECK-NEXT: %[[cmul:.*]] = "xla_hlo.convert"(%[[mul]]) : (tensor<8x8x8x8xf32>) -> tensor<8x8x8x8xf32> // CHECK-NEXT: %[[init:.*]] = xla_hlo.constant dense<0.000000e+00> : tensor @@ -104,10 +150,10 @@ func @fusedBatchNormGrad_noTraining(%arg0: tensor<8x8x8x8xf32>, %arg1: tensor<8x // CHECK-NEXT: }) {dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<8x8x8x8xf32>, tensor) -> tensor<8xf32> // CHECK-NEXT: %[[scr2:.*]] = "xla_hlo.convert"(%[[red1]]) : (tensor<8xf32>) -> tensor<8xf32> - // CHECK-NEXT: %[[mul2:.*]] = xla_hlo.multiply %arg2, %[[scr1]] {broadcast_dimensions = dense<[]> : tensor<0xi64>} : tensor<8xf32> - // CHECK-NEXT: %[[mul3:.*]] = "xla_hlo.multiply"(%[[grad]], %[[mul2]]) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<8x8x8x8xf32>, tensor<8xf32>) -> tensor<8x8x8x8xf32> - - // CHECK-NEXT: %[[scale_backprop:.*]] = xla_hlo.multiply %[[scr1]], %[[scr2]] {broadcast_dimensions = dense<[]> : tensor<0xi64>} : tensor<8xf32> + // CHECK-NEXT: %[[mul2:.*]] = xla_hlo.multiply %arg2, %[[scr1]] : tensor<8xf32> + // CHECK: %[[bcast_mul2:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[mul2]], {{.*}}) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<8xf32>, tensor<4xindex>) -> tensor<8x8x8x8xf32> + // CHECK-NEXT: %[[mul3:.*]] = xla_hlo.multiply %[[grad]], %[[bcast_mul2]] : tensor<8x8x8x8xf32> + // CHECK-NEXT: %[[scale_backprop:.*]] = xla_hlo.multiply %[[scr1]], %[[scr2]] : tensor<8xf32> // CHECK-NEXT: xla_hlo.constant dense<[0, 1, 2]> : tensor<3xi64> // CHECK-NEXT: %[[cgrad:.*]] = "xla_hlo.convert"(%[[grad]]) : (tensor<8x8x8x8xf32>) -> tensor<8x8x8x8xf32> @@ -147,11 +193,12 @@ func @fusedBatchNormGradV2_noTraining(%arg0: tensor<8x8x8x8xf32>, %arg1: tensor< // CHECK-NEXT: %[[act:.*]] = "xla_hlo.convert"(%arg1) : (tensor<8x8x8x8xf32>) -> tensor<8x8x8x8xf32> // CHECK-NEXT: %[[eps:.*]] = xla_hlo.constant dense<1.000000e-03> : tensor - // CHECK-NEXT: %[[add:.*]] = "xla_hlo.add"(%arg4, 
%[[eps]]) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor<8xf32>, tensor) -> tensor<8xf32> + // CHECK-NEXT: %[[add:.*]] = xla_chlo.broadcast_add %arg4, %[[eps]] {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor<8xf32>, tensor) -> tensor<8xf32> // CHECK-NEXT: %[[scr1:.*]] = "xla_hlo.rsqrt"(%[[add]]) : (tensor<8xf32>) -> tensor<8xf32> - // CHECK-NEXT: %[[sub:.*]] = "xla_hlo.subtract"(%[[act]], %arg3) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<8x8x8x8xf32>, tensor<8xf32>) -> tensor<8x8x8x8xf32> - // CHECK-NEXT: %[[mul:.*]] = xla_hlo.multiply %[[grad]], %[[sub]] {broadcast_dimensions = dense<[]> : tensor<0xi64>} : tensor<8x8x8x8xf32> + // CHECK: %[[bcast_arg3:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg3, {{.*}}) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<8xf32>, tensor<4xindex>) -> tensor<8x8x8x8xf32> + // CHECK-NEXT: %[[sub:.*]] = xla_hlo.subtract %[[act]], %[[bcast_arg3]] : tensor<8x8x8x8xf32> + // CHECK-NEXT: %[[mul:.*]] = xla_hlo.multiply %[[grad]], %[[sub]] : tensor<8x8x8x8xf32> // CHECK-NEXT: xla_hlo.constant dense<[0, 1, 2]> : tensor<3xi64> // CHECK-NEXT: %[[cmul:.*]] = "xla_hlo.convert"(%[[mul]]) : (tensor<8x8x8x8xf32>) -> tensor<8x8x8x8xf32> // CHECK-NEXT: %[[init:.*]] = xla_hlo.constant dense<0.000000e+00> : tensor @@ -162,10 +209,11 @@ func @fusedBatchNormGradV2_noTraining(%arg0: tensor<8x8x8x8xf32>, %arg1: tensor< // CHECK-NEXT: }) {dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<8x8x8x8xf32>, tensor) -> tensor<8xf32> // CHECK-NEXT: %[[scr2:.*]] = "xla_hlo.convert"(%[[red1]]) : (tensor<8xf32>) -> tensor<8xf32> - // CHECK-NEXT: %[[mul2:.*]] = xla_hlo.multiply %arg2, %[[scr1]] {broadcast_dimensions = dense<[]> : tensor<0xi64>} : tensor<8xf32> - // CHECK-NEXT: %[[mul3:.*]] = "xla_hlo.multiply"(%[[grad]], %[[mul2]]) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<8x8x8x8xf32>, tensor<8xf32>) -> tensor<8x8x8x8xf32> + // CHECK-NEXT: %[[mul2:.*]] = xla_hlo.multiply %arg2, %[[scr1]] : tensor<8xf32> + // CHECK: %[[bcast_mul2:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[mul2]], {{.*}}) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<8xf32>, tensor<4xindex>) -> tensor<8x8x8x8xf32> + // CHECK-NEXT: %[[mul3:.*]] = xla_hlo.multiply %[[grad]], %[[bcast_mul2]] : tensor<8x8x8x8xf32> - // CHECK-NEXT: %[[scale_backprop:.*]] = xla_hlo.multiply %[[scr1]], %[[scr2]] {broadcast_dimensions = dense<[]> : tensor<0xi64>} : tensor<8xf32> + // CHECK-NEXT: %[[scale_backprop:.*]] = xla_hlo.multiply %[[scr1]], %[[scr2]] : tensor<8xf32> // CHECK-NEXT: xla_hlo.constant dense<[0, 1, 2]> : tensor<3xi64> // CHECK-NEXT: %[[cgrad:.*]] = "xla_hlo.convert"(%[[grad]]) : (tensor<8x8x8x8xf32>) -> tensor<8x8x8x8xf32> @@ -232,11 +280,12 @@ func @fusedBatchNormGradV3_noTraining(%arg0: tensor<8x8x8x8xf32>, %arg1: tensor< // CHECK-NEXT: %[[act:.*]] = "xla_hlo.convert"(%arg1) : (tensor<8x8x8x8xf32>) -> tensor<8x8x8x8xf32> // CHECK-NEXT: %[[eps:.*]] = xla_hlo.constant dense<1.000000e-03> : tensor - // CHECK-NEXT: %[[add:.*]] = "xla_hlo.add"(%arg4, %[[eps]]) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor<8xf32>, tensor) -> tensor<8xf32> + // CHECK-NEXT: %[[add:.*]] = xla_chlo.broadcast_add %arg4, %[[eps]] {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor<8xf32>, tensor) -> tensor<8xf32> // CHECK-NEXT: %[[scr1:.*]] = "xla_hlo.rsqrt"(%[[add]]) : (tensor<8xf32>) -> tensor<8xf32> - // CHECK-NEXT: %[[sub:.*]] = "xla_hlo.subtract"(%[[act]], %arg3) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<8x8x8x8xf32>, 
tensor<8xf32>) -> tensor<8x8x8x8xf32> - // CHECK-NEXT: %[[mul:.*]] = xla_hlo.multiply %[[grad]], %[[sub]] {broadcast_dimensions = dense<[]> : tensor<0xi64>} : tensor<8x8x8x8xf32> + // CHECK: %[[bcast_arg3:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg3, {{.*}}) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<8xf32>, tensor<4xindex>) -> tensor<8x8x8x8xf32> + // CHECK-NEXT: %[[sub:.*]] = xla_hlo.subtract %[[act]], %[[bcast_arg3]] : tensor<8x8x8x8xf32> + // CHECK-NEXT: %[[mul:.*]] = xla_hlo.multiply %[[grad]], %[[sub]] : tensor<8x8x8x8xf32> // CHECK-NEXT: xla_hlo.constant dense<[0, 1, 2]> : tensor<3xi64> // CHECK-NEXT: %[[cmul:.*]] = "xla_hlo.convert"(%[[mul]]) : (tensor<8x8x8x8xf32>) -> tensor<8x8x8x8xf32> // CHECK-NEXT: %[[init:.*]] = xla_hlo.constant dense<0.000000e+00> : tensor @@ -247,10 +296,11 @@ func @fusedBatchNormGradV3_noTraining(%arg0: tensor<8x8x8x8xf32>, %arg1: tensor< // CHECK-NEXT: }) {dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<8x8x8x8xf32>, tensor) -> tensor<8xf32> // CHECK-NEXT: %[[scr2:.*]] = "xla_hlo.convert"(%[[red1]]) : (tensor<8xf32>) -> tensor<8xf32> - // CHECK-NEXT: %[[mul2:.*]] = xla_hlo.multiply %arg2, %[[scr1]] {broadcast_dimensions = dense<[]> : tensor<0xi64>} : tensor<8xf32> - // CHECK-NEXT: %[[mul3:.*]] = "xla_hlo.multiply"(%[[grad]], %[[mul2]]) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<8x8x8x8xf32>, tensor<8xf32>) -> tensor<8x8x8x8xf32> + // CHECK-NEXT: %[[mul2:.*]] = xla_hlo.multiply %arg2, %[[scr1]] : tensor<8xf32> + // CHECK: %[[bcast_mul2:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[mul2]], {{.*}}) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<8xf32>, tensor<4xindex>) -> tensor<8x8x8x8xf32> + // CHECK-NEXT: %[[mul3:.*]] = xla_hlo.multiply %[[grad]], %[[bcast_mul2]] : tensor<8x8x8x8xf32> - // CHECK-NEXT: %[[scale_backprop:.*]] = xla_hlo.multiply %[[scr1]], %[[scr2]] {broadcast_dimensions = dense<[]> : tensor<0xi64>} : tensor<8xf32> + // CHECK-NEXT: %[[scale_backprop:.*]] = xla_hlo.multiply %[[scr1]], %[[scr2]] : tensor<8xf32> // CHECK-NEXT: xla_hlo.constant dense<[0, 1, 2]> : tensor<3xi64> // CHECK-NEXT: %[[cgrad:.*]] = "xla_hlo.convert"(%[[grad]]) : (tensor<8x8x8x8xf32>) -> tensor<8x8x8x8xf32> @@ -317,11 +367,12 @@ func @fusedBatchNormGradV3_noTraining_NCHW(%arg0: tensor<8x8x8x8xf32>, %arg1: te // CHECK-NEXT: %[[act:.*]] = "xla_hlo.convert"(%arg1) : (tensor<8x8x8x8xf32>) -> tensor<8x8x8x8xf32> // CHECK-NEXT: %[[eps:.*]] = xla_hlo.constant dense<1.000000e-03> : tensor - // CHECK-NEXT: %[[add:.*]] = "xla_hlo.add"(%arg4, %[[eps]]) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor<8xf32>, tensor) -> tensor<8xf32> + // CHECK-NEXT: %[[add:.*]] = xla_chlo.broadcast_add %arg4, %[[eps]] {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor<8xf32>, tensor) -> tensor<8xf32> // CHECK-NEXT: %[[scr1:.*]] = "xla_hlo.rsqrt"(%[[add]]) : (tensor<8xf32>) -> tensor<8xf32> - // CHECK-NEXT: %[[sub:.*]] = "xla_hlo.subtract"(%[[act]], %arg3) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<8x8x8x8xf32>, tensor<8xf32>) -> tensor<8x8x8x8xf32> - // CHECK-NEXT: %[[mul:.*]] = xla_hlo.multiply %[[grad]], %[[sub]] {broadcast_dimensions = dense<[]> : tensor<0xi64>} : tensor<8x8x8x8xf32> + // CHECK: %[[bcast_arg3:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg3, {{.*}}) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<8xf32>, tensor<4xindex>) -> tensor<8x8x8x8xf32> + // CHECK-NEXT: %[[sub:.*]] = xla_hlo.subtract %[[act]], %[[bcast_arg3]] : tensor<8x8x8x8xf32> + // CHECK-NEXT: %[[mul:.*]] = 
xla_hlo.multiply %[[grad]], %[[sub]] : tensor<8x8x8x8xf32> // CHECK-NEXT: xla_hlo.constant dense<[0, 2, 3]> : tensor<3xi64> // CHECK-NEXT: %[[cmul:.*]] = "xla_hlo.convert"(%[[mul]]) : (tensor<8x8x8x8xf32>) -> tensor<8x8x8x8xf32> // CHECK-NEXT: %[[init:.*]] = xla_hlo.constant dense<0.000000e+00> : tensor @@ -332,10 +383,11 @@ func @fusedBatchNormGradV3_noTraining_NCHW(%arg0: tensor<8x8x8x8xf32>, %arg1: te // CHECK-NEXT: }) {dimensions = dense<[0, 2, 3]> : tensor<3xi64>} : (tensor<8x8x8x8xf32>, tensor) -> tensor<8xf32> // CHECK-NEXT: %[[scr2:.*]] = "xla_hlo.convert"(%[[red1]]) : (tensor<8xf32>) -> tensor<8xf32> - // CHECK-NEXT: %[[mul2:.*]] = xla_hlo.multiply %arg2, %[[scr1]] {broadcast_dimensions = dense<[]> : tensor<0xi64>} : tensor<8xf32> - // CHECK-NEXT: %[[mul3:.*]] = "xla_hlo.multiply"(%[[grad]], %[[mul2]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<8x8x8x8xf32>, tensor<8xf32>) -> tensor<8x8x8x8xf32> + // CHECK-NEXT: %[[mul2:.*]] = xla_hlo.multiply %arg2, %[[scr1]] : tensor<8xf32> + // CHECK: %[[bcast_mul2:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[mul2]], {{.*}}) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<8xf32>, tensor<4xindex>) -> tensor<8x8x8x8xf32> + // CHECK-NEXT: %[[mul3:.*]] = xla_hlo.multiply %[[grad]], %[[bcast_mul2]] : tensor<8x8x8x8xf32> - // CHECK-NEXT: %[[scale_backprop:.*]] = xla_hlo.multiply %[[scr1]], %[[scr2]] {broadcast_dimensions = dense<[]> : tensor<0xi64>} : tensor<8xf32> + // CHECK-NEXT: %[[scale_backprop:.*]] = xla_hlo.multiply %[[scr1]], %[[scr2]] : tensor<8xf32> // CHECK-NEXT: xla_hlo.constant dense<[0, 2, 3]> : tensor<3xi64> // CHECK-NEXT: %[[cgrad:.*]] = "xla_hlo.convert"(%[[grad]]) : (tensor<8x8x8x8xf32>) -> tensor<8x8x8x8xf32> @@ -367,280 +419,41 @@ func @fusedBatchNormGradV3_Training_NCHW(%arg0: tensor<8x8x8x8xf32>, %arg1: tens // CHECK-LABEL: func @biasAdd_NHWC func @biasAdd_NHWC(%arg0: tensor<1x32x10x32xi32>, %arg1: tensor<32xi32>) -> tensor<1x32x10x32xi32> { - // CHECK: "xla_hlo.add"(%arg0, %arg1) {broadcast_dimensions = dense<3> : tensor<1xi64>} + // CHECK: %[[ARG0_SHAPE:.+]] = shape.shape_of %arg0 + // CHECK: %[[ARG0_EXTENTS:.+]] = "shape.to_extent_tensor"(%[[ARG0_SHAPE]]) + // CHECK: %[[ARG1_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[ARG0_EXTENTS]]) + // CHECK-SAME: {broadcast_dimensions = dense<3> : tensor<1xi64>} + // CHECK: %[[RESULT:.+]] = xla_hlo.add %arg0, %[[ARG1_BCAST]] %0 = "tf.BiasAdd"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", data_format = "NHWC"} : (tensor<1x32x10x32xi32>, tensor<32xi32>) -> tensor<1x32x10x32xi32> return %0 : tensor<1x32x10x32xi32> } // CHECK-LABEL: func @biasAdd_NCHW func @biasAdd_NCHW(%arg0: tensor<1x32x10x32xi32>, %arg1: tensor<32xi32>) -> tensor<1x32x10x32xi32> { - // CHECK: "xla_hlo.add"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK: %[[ARG0_SHAPE:.+]] = shape.shape_of %arg0 + // CHECK: %[[ARG0_EXTENTS:.+]] = "shape.to_extent_tensor"(%[[ARG0_SHAPE]]) + // CHECK: %[[ARG1_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[ARG0_EXTENTS]]) + // CHECK-SAME: {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK: %[[RESULT:.+]] = xla_hlo.add %arg0, %[[ARG1_BCAST]] %0 = "tf.BiasAdd"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", data_format = "NCHW"} : (tensor<1x32x10x32xi32>, tensor<32xi32>) -> tensor<1x32x10x32xi32> return %0 : tensor<1x32x10x32xi32> } // CHECK-LABEL: func @biasAdd_dynamic func @biasAdd_dynamic(%arg0: tensor, %arg1: tensor) -> tensor { - // CHECK: "xla_hlo.add"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} 
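The BiasAdd tests in this hunk replace the old implicit-broadcast add with an explicit shape.shape_of / "xla_hlo.dynamic_broadcast_in_dim" of the bias along the feature dimension (3 for NHWC, 1 for NCHW), followed by a plain add. A minimal NumPy sketch of that semantics, assuming a rank-4 value and a 1-D bias (the helper name and sample values are illustrative, not taken from the patch):

import numpy as np

def bias_add(value, bias, data_format="NHWC"):
    # Broadcast the 1-D bias along the feature dimension of `value`
    # (dimension 3 for NHWC, dimension 1 for NCHW), then add.
    feature_dim = 3 if data_format == "NHWC" else 1
    shape = [1] * value.ndim
    shape[feature_dim] = bias.shape[0]
    return value + bias.reshape(shape)

x = np.zeros((1, 32, 10, 32), dtype=np.int32)
b = np.arange(32, dtype=np.int32)
assert bias_add(x, b, "NHWC").shape == (1, 32, 10, 32)
assert (bias_add(x, b, "NCHW")[0, :, 0, 0] == b).all()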
+ // CHECK: %[[ARG0_SHAPE:.+]] = shape.shape_of %arg0 + // CHECK: %[[ARG0_EXTENTS:.+]] = "shape.to_extent_tensor"(%[[ARG0_SHAPE]]) + // CHECK: %[[ARG1_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[ARG0_EXTENTS]]) + // CHECK-SAME: {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK: %[[RESULT:.+]] = xla_hlo.add %arg0, %[[ARG1_BCAST]] %0 = "tf.BiasAdd"(%arg0, %arg1) {data_format = "NCHW"} : (tensor, tensor) -> tensor return %0 : tensor } //===----------------------------------------------------------------------===// -// Binary op legalizations. +// DiagPart //===----------------------------------------------------------------------===// -// CHECK-LABEL: func @add -func @add(%arg0: tensor<2xi32>) -> tensor<2xi32> { - // CHECK-NEXT: %[[SUM0:.*]] = xla_hlo.add %arg0, %arg0 : tensor<2xi32> - // CHECK-NEXT: %[[SUM1:.*]] = xla_hlo.add %[[SUM0]], %arg0 : tensor<2xi32> - // CHECK-NEXT: return %[[SUM1]] : tensor<2xi32> - %0 = "tf.Add"(%arg0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> - %1 = "tf.AddV2"(%0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> - return %1: tensor<2xi32> -} - -// CHECK-LABEL: func @broadcast_add -func @broadcast_add(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi32> { - // CHECK-NEXT: "xla_hlo.add"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} - %0 = "tf.Add"(%arg0, %arg1) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> - return %0: tensor<1x2xi32> -} - -// CHECK-LABEL: func @broadcast_multi_dim_add -func @broadcast_multi_dim_add(%arg0: tensor<4x1x1xi32>, %arg1: tensor<4x4x4x4xi32>) -> tensor<4x4x4x4xi32> { - // CHECK-NEXT: "xla_hlo.add"(%arg0, %arg1) {broadcast_dimensions = dense<[1, 2, 3]> : tensor<3xi64>} - %0 = "tf.Add"(%arg0, %arg1) : (tensor<4x1x1xi32>, tensor<4x4x4x4xi32>) -> tensor<4x4x4x4xi32> - return %0: tensor<4x4x4x4xi32> -} - -// CHECK-LABEL: func @div -func @div(%arg0: tensor<2xi32>) -> tensor<2xi32> { - // CHECK-NEXT: %0 = xla_hlo.divide %arg0, %arg0 : tensor<2xi32> - // CHECK-NEXT: return %0 : tensor<2xi32> - %0 = "tf.Div"(%arg0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> - return %0: tensor<2xi32> -} - -// CHECK-LABEL: func @broadcast_div -func @broadcast_div(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi32> { - // CHECK-NEXT: "xla_hlo.divide"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} - %0 = "tf.Div"(%arg0, %arg1) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> - return %0: tensor<1x2xi32> -} - -// CHECK-LABEL: func @shift_left -func @shift_left(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi32> { - // CHECK: xla_hlo.shift_left %arg0, %arg1 : tensor<4xi32> - %0 = "tf.LeftShift"(%arg0, %arg1) : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> - return %0 : tensor<4xi32> -} - -// CHECK-LABEL: func @div_dynamic -func @div_dynamic(%arg0: tensor, %arg1: tensor) -> tensor { - // CHECK: "xla_hlo.divide"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} - %0 = "tf.Div"(%arg0, %arg1) : (tensor, tensor) -> tensor - return %0: tensor -} - -// CHECK-LABEL: func @div_unranked -func @div_unranked(%arg0: tensor<*xi32>, %arg1: tensor) -> tensor { - // CHECK: tf.Div - %0 = "tf.Div"(%arg0, %arg1) : (tensor<*xi32>, tensor) -> tensor - return %0: tensor -} - -// CHECK-LABEL: func @maximum -func @maximum(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { - // CHECK: xla_hlo.maximum %arg0, %arg1 : tensor<4xf32> - %0 = "tf.Maximum"(%arg0, %arg1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> - return %0 : 
tensor<4xf32> -} - -// CHECK-LABEL: func @minimum -func @minimum(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { - // CHECK: xla_hlo.minimum %arg0, %arg1 : tensor<4xf32> - %0 = "tf.Minimum"(%arg0, %arg1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> - return %0 : tensor<4xf32> -} - -// CHECK-LABEL: func @mul -func @mul(%arg0: tensor<2xi32>) -> tensor<2xi32> { - // CHECK-NEXT: %0 = xla_hlo.multiply %arg0, %arg0 : tensor<2xi32> - // CHECK-NEXT: return %0 : tensor<2xi32> - %0 = "tf.Mul"(%arg0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> - return %0: tensor<2xi32> -} - -// CHECK-LABEL: func @broadcast_mul -func @broadcast_mul(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi32> { - // CHECK-NEXT: "xla_hlo.multiply"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} - %0 = "tf.Mul"(%arg0, %arg1) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> - return %0: tensor<1x2xi32> -} - -// CHECK-LABEL: func @real_div -func @real_div(%arg0: tensor<2xi32>) -> tensor<2xi32> { - // CHECK-NEXT: %0 = xla_hlo.divide %arg0, %arg0 : tensor<2xi32> - %0 = "tf.RealDiv"(%arg0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> - return %0: tensor<2xi32> -} - -// CHECK-LABEL: func @broadcast_real_div -func @broadcast_real_div(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi32> { - // CHECK-NEXT: "xla_hlo.divide"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} - %0 = "tf.RealDiv"(%arg0, %arg1) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> - return %0: tensor<1x2xi32> -} - -// CHECK-LABEL: func @sub -func @sub(%arg0: tensor<2xi32>) -> tensor<2xi32> { - // CHECK-NEXT: %0 = xla_hlo.subtract %arg0, %arg0 : tensor<2xi32> - // CHECK-NEXT: return %0 : tensor<2xi32> - %0 = "tf.Sub"(%arg0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> - return %0: tensor<2xi32> -} - -// CHECK-LABEL: func @broadcast_sub -func @broadcast_sub(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi32> { - // CHECK-NEXT: "xla_hlo.subtract"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} - %0 = "tf.Sub"(%arg0, %arg1) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> - return %0: tensor<1x2xi32> -} - -// CHECK-LABEL: func @shift_right -func @shift_right(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi32> { - // CHECK: xla_hlo.shift_right_arithmetic %arg0, %arg1 : tensor<4xi32> - %0 = "tf.RightShift"(%arg0, %arg1) : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> - return %0 : tensor<4xi32> -} - -// CHECK-LABEL: func @broadcast_shift_right -func @broadcast_shift_right(%arg0: tensor<4xi32>, %arg1: tensor<2x4xi32>) -> tensor<2x4xi32> { - // CHECK: "xla_hlo.shift_right_arithmetic"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} - %0 = "tf.RightShift"(%arg0, %arg1) : (tensor<4xi32>, tensor<2x4xi32>) -> tensor<2x4xi32> - return %0 : tensor<2x4xi32> -} - -// CHECK-LABEL: func @shift_right_unsigned -func @shift_right_unsigned(%arg0: tensor<4xui8>, %arg1: tensor<4xui8>) -> tensor<4xui8> { - // CHECK: tf.RightShift - %0 = "tf.RightShift"(%arg0, %arg1) : (tensor<4xui8>, tensor<4xui8>) -> tensor<4xui8> - return %0 : tensor<4xui8> -} - -// CHECK-LABEL: func @broadcast_shift_right_unsigned -func @broadcast_shift_right_unsigned(%arg0: tensor<4xui8>, %arg1: tensor<2x4xui8>) -> tensor<2x4xui8> { - // CHECK: tf.RightShift - %0 = "tf.RightShift"(%arg0, %arg1) : (tensor<4xui8>, tensor<2x4xui8>) -> tensor<2x4xui8> - return %0 : tensor<2x4xui8> -} - -// CHECK-LABEL: func @and -func @and(%arg0: tensor<2xi1>) 
-> tensor<2xi1> { - // CHECK-NEXT: xla_hlo.and - %0 = "tf.LogicalAnd"(%arg0, %arg0) : (tensor<2xi1>, tensor<2xi1>) -> tensor<2xi1> - return %0: tensor<2xi1> -} - -// CHECK-LABEL: func @and_broadcast -func @and_broadcast(%arg0: tensor<1xi1>, %arg1: tensor<1x2xi1>) -> tensor<1x2xi1> { - // CHECK-NEXT: "xla_hlo.and" - %0 = "tf.LogicalAnd"(%arg0, %arg1) : (tensor<1xi1>, tensor<1x2xi1>) -> tensor<1x2xi1> - return %0: tensor<1x2xi1> -} - -// CHECK-LABEL: func @and_dynamic -func @and_dynamic(%arg0: tensor, %arg1: tensor<1xi1>) -> tensor { - // CHECK-NEXT: "xla_hlo.and" - %0 = "tf.LogicalAnd"(%arg0, %arg1) : (tensor, tensor<1xi1>) -> tensor - return %0: tensor -} - -// CHECK-LABEL: func @and_unranked -func @and_unranked(%arg0: tensor<*xi1>, %arg1: tensor<*xi1>) -> tensor<*xi1> { - // CHECK: tf.LogicalAnd - %0 = "tf.LogicalAnd"(%arg0, %arg1) : (tensor<*xi1>, tensor<*xi1>) -> tensor<*xi1> - return %0: tensor<*xi1> -} - -// CHECK-LABEL: func @or -func @or(%arg0: tensor<2xi1>) -> tensor<2xi1> { - // CHECK-NEXT: xla_hlo.or - %0 = "tf.LogicalOr"(%arg0, %arg0) : (tensor<2xi1>, tensor<2xi1>) -> tensor<2xi1> - return %0: tensor<2xi1> -} - -// CHECK-LABEL: func @or_broadcast -func @or_broadcast(%arg0: tensor<1xi1>, %arg1: tensor<1x2xi1>) -> tensor<1x2xi1> { - // CHECK-NEXT: xla_hlo.or - %0 = "tf.LogicalOr"(%arg0, %arg1) : (tensor<1xi1>, tensor<1x2xi1>) -> tensor<1x2xi1> - return %0: tensor<1x2xi1> -} - -// CHECK-LABEL: func @or_dynamic -func @or_dynamic(%arg0: tensor, %arg1: tensor<1xi1>) -> tensor { - // CHECK-NEXT: xla_hlo.or - %0 = "tf.LogicalOr"(%arg0, %arg1) : (tensor, tensor<1xi1>) -> tensor - return %0: tensor -} - -// CHECK-LABEL: func @bitwise_or -func @bitwise_or(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi32> { - // CHECK-NEXT: xla_hlo.or - %0 = "tf.BitwiseOr"(%arg0, %arg1) : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> - return %0: tensor<4xi32> -} - -// CHECK-LABEL: func @bitwise_or_broadcast -func @bitwise_or_broadcast(%arg0: tensor<1xi8>, %arg1: tensor<1x4xi8>) -> tensor<1x4xi8> { - // CHECK-NEXT: xla_hlo.or - %0 = "tf.BitwiseOr"(%arg0, %arg1) : (tensor<1xi8>, tensor<1x4xi8>) -> tensor<1x4xi8> - return %0: tensor<1x4xi8> -} - -// CHECK-LABEL: func @bitwise_or_dynamic -func @bitwise_or_dynamic(%arg0: tensor, %arg1: tensor<1xi32>) -> tensor { - // CHECK-NEXT: xla_hlo.or - %0 = "tf.BitwiseOr"(%arg0, %arg1) : (tensor, tensor<1xi32>) -> tensor - return %0: tensor -} - -// CHECK-LABEL: func @bitwise_and -func @bitwise_and(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi32> { - // CHECK-NEXT: xla_hlo.and - %0 = "tf.BitwiseAnd"(%arg0, %arg1) : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> - return %0: tensor<4xi32> -} - -// CHECK-LABEL: func @bitwise_and_broadcast -func @bitwise_and_broadcast(%arg0: tensor<1xi8>, %arg1: tensor<1x4xi8>) -> tensor<1x4xi8> { - // CHECK-NEXT: xla_hlo.and - %0 = "tf.BitwiseAnd"(%arg0, %arg1) : (tensor<1xi8>, tensor<1x4xi8>) -> tensor<1x4xi8> - return %0: tensor<1x4xi8> -} - -// CHECK-LABEL: func @bitwise_and_dynamic -func @bitwise_and_dynamic(%arg0: tensor, %arg1: tensor<1xi32>) -> tensor { - // CHECK-NEXT: xla_hlo.and - %0 = "tf.BitwiseAnd"(%arg0, %arg1) : (tensor, tensor<1xi32>) -> tensor - return %0: tensor -} - -// CHECK-LABEL: func @pow -func @pow(%arg0: tensor<2xf32>) -> tensor<2xf32> { - // CHECK-NEXT: xla_hlo.power - %0 = "tf.Pow"(%arg0, %arg0) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> - return %0: tensor<2xf32> -} - -// CHECK-LABEL: func @pow_dynamic -func @pow_dynamic(%arg0: tensor) -> tensor { - // CHECK-NEXT: xla_hlo.power - 
%0 = "tf.Pow"(%arg0, %arg0) : (tensor, tensor) -> tensor - return %0: tensor -} - // CHECK-LABEL: func @diag_part // CHECK-SAME: %[[ARG:.*]]: tensor<4x3x4x3xf32> func @diag_part(%arg0: tensor<4x3x4x3xf32>) -> tensor<4x3xf32> { @@ -660,6 +473,10 @@ func @diag_part(%arg0: tensor<4x3x4x3xf32>) -> tensor<4x3xf32> { return %0: tensor<4x3xf32> } +//===----------------------------------------------------------------------===// +// Einsum. +//===----------------------------------------------------------------------===// + // CHECK-LABEL: func @einsum func @einsum(%arg0: tensor<2x3xf32>, %arg1: tensor<3x4xf32>) -> tensor<2x4xf32> { // CHECK: xla_hlo.einsum @@ -674,22 +491,26 @@ func @unary_einsum(%arg0: tensor<2x3xf32>) -> tensor<2x2xf32> { return %0: tensor<2x2xf32> } +//===----------------------------------------------------------------------===// +// FloorDiv and FloorMod. +//===----------------------------------------------------------------------===// + // CHECK-LABEL: func @floordiv_broadcast_i32 func @floordiv_broadcast_i32(%arg0: tensor<2x3xi32>, %arg1: tensor<3xi32>) -> tensor<2x3xi32> { // CHECK-DAG: [[ZEROS1:%.+]] = xla_hlo.constant dense<0> - // CHECK-DAG: [[CMP1:%.+]] = "xla_hlo.compare"(%arg0, [[ZEROS1]]) {comparison_direction = "LT"} + // CHECK-DAG: [[CMP1:%.+]] = xla_chlo.broadcast_compare %arg0, [[ZEROS1]] {comparison_direction = "LT"} // CHECK-DAG: [[ZEROS2:%.+]] = xla_hlo.constant dense<0> - // CHECK-DAG: [[CMP2:%.+]] = "xla_hlo.compare"(%arg1, [[ZEROS2]]) {comparison_direction = "LT"} - // CHECK-DAG: [[CMP3:%.+]] = "xla_hlo.compare"([[CMP1]], [[CMP2]]) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "EQ"} - // CHECK-DAG: [[DIV1:%.+]] = "xla_hlo.divide"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-DAG: [[CMP2:%.+]] = xla_chlo.broadcast_compare %arg1, [[ZEROS2]] {comparison_direction = "LT"} + // CHECK-DAG: [[CMP3:%.+]] = xla_chlo.broadcast_compare [[CMP1]], [[CMP2]] {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "EQ"} + // CHECK-DAG: [[DIV1:%.+]] = xla_chlo.broadcast_divide %arg0, %arg1 {broadcast_dimensions = dense<1> : tensor<1xi64>} // CHECK-DAG: [[ABS1:%.+]] = "xla_hlo.abs"(%arg0) // CHECK-DAG: [[ABS2:%.+]] = "xla_hlo.abs"(%arg1) // CHECK-DAG: [[ZEROS3:%.+]] = xla_hlo.constant dense<1> - // CHECK-DAG: [[SUB:%.+]] = xla_hlo.subtract [[ABS2]], [[ZEROS3]] - // CHECK-DAG: [[ADD:%.+]] = "xla_hlo.add"([[ABS1]], [[SUB]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-DAG: [[SUB:%.+]] = xla_chlo.broadcast_subtract [[ABS2]], [[ZEROS3]] + // CHECK-DAG: [[ADD:%.+]] = xla_chlo.broadcast_add [[ABS1]], [[SUB]] {broadcast_dimensions = dense<1> : tensor<1xi64>} // CHECK-DAG: [[NEG:%.+]] = "xla_hlo.negate"([[ADD]]) // CHECK-DAG: [[ABS3:%.+]] = "xla_hlo.abs"(%arg1) - // CHECK-DAG: [[DIV2:%.+]] = "xla_hlo.divide"([[NEG]], [[ABS3]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-DAG: [[DIV2:%.+]] = xla_chlo.broadcast_divide [[NEG]], [[ABS3]] {broadcast_dimensions = dense<1> : tensor<1xi64>} // CHECK-DAG: [[SELECT:%.+]] = "xla_hlo.select"([[CMP3]], [[DIV1]], [[DIV2]]) // CHECK: return [[SELECT]] %0 = "tf.FloorDiv"(%arg0, %arg1) : (tensor<2x3xi32>, tensor<3xi32>) -> tensor<2x3xi32> @@ -699,19 +520,19 @@ func @floordiv_broadcast_i32(%arg0: tensor<2x3xi32>, %arg1: tensor<3xi32>) -> te // CHECK-LABEL: func @floordiv_reverse_broadcast_i32 func @floordiv_reverse_broadcast_i32(%arg0: tensor<3xi32>, %arg1: tensor<2x3xi32>) -> tensor<2x3xi32> { // CHECK-DAG: [[ZEROS1:%.+]] = xla_hlo.constant 
dense<0> - // CHECK-DAG: [[CMP1:%.+]] = "xla_hlo.compare"(%arg0, [[ZEROS1]]) {comparison_direction = "LT"} + // CHECK-DAG: [[CMP1:%.+]] = xla_chlo.broadcast_compare %arg0, [[ZEROS1]] {comparison_direction = "LT"} // CHECK-DAG: [[ZEROS2:%.+]] = xla_hlo.constant dense<0> - // CHECK-DAG: [[CMP2:%.+]] = "xla_hlo.compare"(%arg1, [[ZEROS2]]) {comparison_direction = "LT"} - // CHECK-DAG: [[CMP3:%.+]] = "xla_hlo.compare"([[CMP1]], [[CMP2]]) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "EQ"} - // CHECK-DAG: [[DIV1:%.+]] = "xla_hlo.divide"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-DAG: [[CMP2:%.+]] = xla_chlo.broadcast_compare %arg1, [[ZEROS2]] {comparison_direction = "LT"} + // CHECK-DAG: [[CMP3:%.+]] = xla_chlo.broadcast_compare [[CMP1]], [[CMP2]] {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "EQ"} + // CHECK-DAG: [[DIV1:%.+]] = xla_chlo.broadcast_divide %arg0, %arg1 {broadcast_dimensions = dense<1> : tensor<1xi64>} // CHECK-DAG: [[ABS1:%.+]] = "xla_hlo.abs"(%arg0) // CHECK-DAG: [[ABS2:%.+]] = "xla_hlo.abs"(%arg1) // CHECK-DAG: [[ZEROS3:%.+]] = xla_hlo.constant dense<1> - // CHECK-DAG: [[SUB:%.+]] = xla_hlo.subtract [[ABS2]], [[ZEROS3]] - // CHECK-DAG: [[ADD:%.+]] = "xla_hlo.add"([[ABS1]], [[SUB]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-DAG: [[SUB:%.+]] = xla_chlo.broadcast_subtract [[ABS2]], [[ZEROS3]] + // CHECK-DAG: [[ADD:%.+]] = xla_chlo.broadcast_add [[ABS1]], [[SUB]] {broadcast_dimensions = dense<1> : tensor<1xi64>} // CHECK-DAG: [[NEG:%.+]] = "xla_hlo.negate"([[ADD]]) // CHECK-DAG: [[ABS3:%.+]] = "xla_hlo.abs"(%arg1) - // CHECK-DAG: [[DIV2:%.+]] = xla_hlo.divide [[NEG]], [[ABS3]] + // CHECK-DAG: [[DIV2:%.+]] = xla_chlo.broadcast_divide [[NEG]], [[ABS3]] // CHECK-DAG: [[SELECT:%.+]] = "xla_hlo.select"([[CMP3]], [[DIV1]], [[DIV2]]) // CHECK: return [[SELECT]] %0 = "tf.FloorDiv"(%arg0, %arg1) : (tensor<3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> @@ -720,7 +541,7 @@ func @floordiv_reverse_broadcast_i32(%arg0: tensor<3xi32>, %arg1: tensor<2x3xi32 // CHECK-LABEL: func @floordiv_f32 func @floordiv_f32(%arg0: tensor<2xf32>) -> tensor<2xf32> { - // CHECK-NEXT: %[[DIV:.*]] = xla_hlo.divide %arg0, %arg0 + // CHECK-NEXT: %[[DIV:.*]] = xla_chlo.broadcast_divide %arg0, %arg0 // CHECK-NEXT: %[[FLOOR:.*]] = "xla_hlo.floor"(%[[DIV]]) // CHECK-NEXT: return %[[FLOOR]] : tensor<2xf32> %0 = "tf.FloorDiv"(%arg0, %arg0) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> @@ -731,7 +552,7 @@ func @floordiv_f32(%arg0: tensor<2xf32>) -> tensor<2xf32> { func @floordiv_bf16(%arg0: tensor<2xbf16>) -> tensor<2xbf16> { // CHECK-NEXT: xla_hlo.convert // CHECK-NEXT: xla_hlo.convert - // CHECK-NEXT: xla_hlo.divide + // CHECK-NEXT: xla_chlo.broadcast_divide // CHECK-NEXT: xla_hlo.floor // CHECK-NEXT: xla_hlo.convert // CHECK-NEXT: return @@ -741,7 +562,7 @@ func @floordiv_bf16(%arg0: tensor<2xbf16>) -> tensor<2xbf16> { // CHECK-LABEL: func @floordiv_f16_broadcast func @floordiv_f16_broadcast(%arg0: tensor<2x3xf16>, %arg1: tensor<3xf16>) -> tensor<2x3xf16> { - // CHECK-NEXT: xla_hlo.divide + // CHECK-NEXT: xla_chlo.broadcast_divide // CHECK-NEXT: xla_hlo.floor // CHECK-NEXT: return %0 = "tf.FloorDiv"(%arg0, %arg1) : (tensor<2x3xf16>, tensor<3xf16>) -> tensor<2x3xf16> @@ -764,15 +585,15 @@ func @floordiv_unranked(%arg0: tensor<*xi32>, %arg1: tensor<*xi32>) -> tensor<*x // CHECK-LABEL: func @floormod_broadcast_numerator func @floormod_broadcast_numerator(%arg0: tensor<3xi32>, %arg1: tensor<2x3xi32>) -> tensor<2x3xi32> { - 
// CHECK-DAG: [[REM:%.+]] = "xla_hlo.remainder"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-DAG: [[REM:%.+]] = xla_chlo.broadcast_remainder %arg0, %arg1 {broadcast_dimensions = dense<1> : tensor<1xi64>} // CHECK-DAG: [[ZL:%.+]] = xla_hlo.constant dense<0> - // CHECK-DAG: [[CMP1:%.+]] = "xla_hlo.compare"([[REM]], [[ZL]]) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "NE"} + // CHECK-DAG: [[CMP1:%.+]] = xla_chlo.broadcast_compare [[REM]], [[ZL]] {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "NE"} // CHECK-DAG: [[ZR:%.+]] = xla_hlo.constant dense<0> - // CHECK-DAG: [[CMP2:%.+]] = "xla_hlo.compare"(%arg1, [[ZR:%.+]]) {comparison_direction = "LT"} - // CHECK-DAG: [[CMP3:%.+]] = "xla_hlo.compare"([[REM:%.+]], [[ZR]]) {comparison_direction = "LT"} - // CHECK-DAG: [[CMP4:%.+]] = "xla_hlo.compare"([[CMP2]], [[CMP3]]) {comparison_direction = "NE"} - // CHECK-DAG: [[AND:%.+]] = xla_hlo.and [[CMP1]], [[CMP4]] - // CHECK-DAG: [[ADD:%.+]] = xla_hlo.add %arg1, [[REM]] + // CHECK-DAG: [[CMP2:%.+]] = xla_chlo.broadcast_compare %arg1, [[ZR:%.+]] {comparison_direction = "LT"} + // CHECK-DAG: [[CMP3:%.+]] = xla_chlo.broadcast_compare [[REM:%.+]], [[ZR]] {comparison_direction = "LT"} + // CHECK-DAG: [[CMP4:%.+]] = xla_chlo.broadcast_compare [[CMP2]], [[CMP3]] {comparison_direction = "NE"} + // CHECK-DAG: [[AND:%.+]] = xla_chlo.broadcast_and [[CMP1]], [[CMP4]] + // CHECK-DAG: [[ADD:%.+]] = xla_chlo.broadcast_add %arg1, [[REM]] // CHECK-DAG: [[SELECT:%.+]] = "xla_hlo.select"([[AND]], [[ADD]], [[REM]]) // CHECK-NEXT: return [[SELECT]] %0 = "tf.FloorMod"(%arg0, %arg1) : (tensor<3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> @@ -781,15 +602,15 @@ func @floormod_broadcast_numerator(%arg0: tensor<3xi32>, %arg1: tensor<2x3xi32>) // CHECK-LABEL: func @floormod_broadcast_denominator func @floormod_broadcast_denominator(%arg0: tensor<2x3xi32>, %arg1: tensor<3xi32>) -> tensor<2x3xi32> { - // CHECK-DAG: [[REM:%.+]] = "xla_hlo.remainder"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-DAG: [[REM:%.+]] = xla_chlo.broadcast_remainder %arg0, %arg1 {broadcast_dimensions = dense<1> : tensor<1xi64>} // CHECK-DAG: [[ZL:%.+]] = xla_hlo.constant dense<0> - // CHECK-DAG: [[CMP1:%.+]] = "xla_hlo.compare"([[REM]], [[ZL]]) {comparison_direction = "NE"} + // CHECK-DAG: [[CMP1:%.+]] = xla_chlo.broadcast_compare [[REM]], [[ZL]] {comparison_direction = "NE"} // CHECK-DAG: [[ZR:%.+]] = xla_hlo.constant dense<0> - // CHECK-DAG: [[CMP2:%.+]] = "xla_hlo.compare"(%arg1, [[ZR:%.+]]) {comparison_direction = "LT"} - // CHECK-DAG: [[CMP3:%.+]] = "xla_hlo.compare"([[REM:%.+]], [[ZR]]) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "LT"} - // CHECK-DAG: [[CMP4:%.+]] = "xla_hlo.compare"([[CMP2]], [[CMP3]]) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "NE"} - // CHECK-DAG: [[AND:%.+]] = xla_hlo.and [[CMP1]], [[CMP4]] - // CHECK-DAG: [[ADD:%.+]] = "xla_hlo.add"(%arg1, [[REM]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-DAG: [[CMP2:%.+]] = xla_chlo.broadcast_compare %arg1, [[ZR:%.+]] {comparison_direction = "LT"} + // CHECK-DAG: [[CMP3:%.+]] = xla_chlo.broadcast_compare [[REM:%.+]], [[ZR]] {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "LT"} + // CHECK-DAG: [[CMP4:%.+]] = xla_chlo.broadcast_compare [[CMP2]], [[CMP3]] {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "NE"} + // CHECK-DAG: [[AND:%.+]] = xla_chlo.broadcast_and 
[[CMP1]], [[CMP4]] + // CHECK-DAG: [[ADD:%.+]] = xla_chlo.broadcast_add %arg1, [[REM]] {broadcast_dimensions = dense<1> : tensor<1xi64>} // CHECK-DAG: [[SELECT:%.+]] = "xla_hlo.select"([[AND]], [[ADD]], [[REM]]) // CHECK-NEXT: return [[SELECT]] %0 = "tf.FloorMod"(%arg0, %arg1) : (tensor<2x3xi32>, tensor<3xi32>) -> tensor<2x3xi32> @@ -810,201 +631,22 @@ func @floormod_unranked(%arg0: tensor<*xi32>, %arg1: tensor<*xi32>) -> tensor<*x return %0: tensor<*xi32> } +//===----------------------------------------------------------------------===// +// BroadcastTo. +//===----------------------------------------------------------------------===// + // CHECK-LABEL: func @broadcast_to func @broadcast_to(%arg0: tensor<16xf32>) -> tensor<16x16x16x16xf32> { %cst = "tf.Const"() { value = dense<16> : tensor<4xi32> } : () -> tensor<4xi32> // CHECK: [[CST:%.+]] = xla_hlo.constant - // CHECK: "xla_hlo.dynamic_broadcast_in_dim"(%arg0, [[CST]]) + // CHECK: [[CAST:%.+]] = tensor_cast [[CST]] : tensor<4xi32> to tensor<4xi32> + // CHECK: "xla_hlo.dynamic_broadcast_in_dim"(%arg0, [[CAST]]) // CHECK-SAME: {broadcast_dimensions = dense<3> : tensor<1xi64>} %0 = "tf.BroadcastTo"(%arg0, %cst) : (tensor<16xf32>, tensor<4xi32>) -> tensor<16x16x16x16xf32> return %0 : tensor<16x16x16x16xf32> } -//===----------------------------------------------------------------------===// -// Equality op legalizations. -//===----------------------------------------------------------------------===// - -// CHECK-LABEL: func @equal -func @equal(%arg0: tensor<2xi32>) -> tensor<2xi1> { - // CHECK-NEXT: "xla_hlo.compare"(%arg0, %arg0) {comparison_direction = "EQ"} - %0 = "tf.Equal"(%arg0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> - return %0: tensor<2xi1> -} - -// CHECK-LABEL: func @equal_dynamic -func @equal_dynamic(%arg0: tensor, %arg1: tensor<1xi32>) -> tensor { - // CHECK-NEXT: "xla_hlo.compare"(%arg0, %arg1) {comparison_direction = "EQ"} - %0 = "tf.Equal"(%arg0, %arg1) : (tensor, tensor<1xi32>) -> tensor - return %0: tensor -} - -// CHECK-LABEL: func @equal_broadcast -func @equal_broadcast(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { - // CHECK-NEXT: "xla_hlo.compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "EQ"} - %0 = "tf.Equal"(%arg0, %arg1) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> - return %0: tensor<1x2xi1> -} - -// CHECK-LABEL: func @equal_broadcast_no_incompatible_shapes_error -func @equal_broadcast_no_incompatible_shapes_error(%arg0: tensor<2xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { - // CHECK-NEXT: "tf.Equal"(%arg0, %arg1) {incompatible_shape_error = false} - %0 = "tf.Equal"(%arg0, %arg1) { incompatible_shape_error = false } : (tensor<2xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> - return %0: tensor<1x2xi1> -} - -// CHECK-LABEL: func @equal_incompatible_shape_broadcastable -func @equal_incompatible_shape_broadcastable(%arg0: tensor, %arg1: tensor<1xi32>) -> tensor { - // CHECK-NEXT: "tf.Equal"(%arg0, %arg1) {incompatible_shape_error = false} - %0 = "tf.Equal"(%arg0, %arg1) { incompatible_shape_error = false } : (tensor, tensor<1xi32>) -> tensor - return %0: tensor -} - -// CHECK-LABEL: func @equal_incompatible_shape_dynamic -func @equal_incompatible_shape_dynamic(%arg0: tensor<2xi32>, %arg1: tensor) -> tensor<*xi1> { - // CHECK-NEXT: "tf.Equal"(%arg0, %arg1) {incompatible_shape_error = false} - %0 = "tf.Equal"(%arg0, %arg1) { incompatible_shape_error = false } : (tensor<2xi32>, tensor) -> tensor<*xi1> - return %0: tensor<*xi1> -} - 
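The FloorDiv/FloorMod checks above switch to xla_chlo.broadcast_* ops, but the arithmetic they spell out is unchanged: integer floor division is plain truncating division when the operand signs match and -(|x| + |y| - 1) / |y| (truncating) otherwise, while floor mod adds the divisor back onto the truncated remainder whenever the remainder is nonzero and its sign disagrees with the divisor's. A small Python sketch of that logic, using NumPy's truncating fix/fmod as stand-ins for the HLO ops (illustrative only):

import numpy as np

def floor_div(x, y):
    # Same signs: plain truncating division.  Different signs: negate the
    # rounded-up magnitude so the result rounds toward negative infinity.
    same_sign = (x < 0) == (y < 0)
    trunc = np.fix(x / y).astype(x.dtype)
    adjusted = -((np.abs(x) + np.abs(y) - 1) // np.abs(y))
    return np.where(same_sign, trunc, adjusted)

def floor_mod(x, y):
    # Truncated remainder, plus y whenever the remainder is nonzero and
    # its sign disagrees with the sign of y.
    rem = np.fmod(x, y)
    fix_up = (rem != 0) & ((rem < 0) != (y < 0))
    return np.where(fix_up, rem + y, rem)

x = np.array([7, -7, 7, -7], dtype=np.int32)
y = np.array([3, 3, -3, -3], dtype=np.int32)
assert floor_div(x, y).tolist() == (x // y).tolist()  # [2, -3, -3, 2]
assert floor_mod(x, y).tolist() == (x % y).tolist()   # [1, 2, -2, -1]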
-// CHECK-LABEL: func @equal_incompatible_shape_both_dynamic -func @equal_incompatible_shape_both_dynamic(%arg0: tensor, %arg1: tensor) -> tensor<*xi1> { - // CHECK-NEXT: "tf.Equal"(%arg0, %arg1) {incompatible_shape_error = false} - %0 = "tf.Equal"(%arg0, %arg1) { incompatible_shape_error = false } : (tensor, tensor) -> tensor<*xi1> - return %0: tensor<*xi1> -} - -// CHECK-LABEL: func @equal_unranked -func @equal_unranked(%arg0: tensor<*xi32>, %arg1: tensor<*xi32>) -> tensor<*xi1> { - // CHECK: "tf.Equal" - %0 = "tf.Equal"(%arg0, %arg1) { incompatible_shape_error = false } : (tensor<*xi32>, tensor<*xi32>) -> tensor<*xi1> - return %0: tensor<*xi1> -} - -// CHECK-LABEL: func @notequal -func @notequal(%arg0: tensor<2xi32>) -> tensor<2xi1> { - // CHECK-NEXT: "xla_hlo.compare"(%arg0, %arg0) {comparison_direction = "NE"} - %0 = "tf.NotEqual"(%arg0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> - return %0: tensor<2xi1> -} - -// CHECK-LABEL: func @notequal_dynamic -func @notequal_dynamic(%arg0: tensor, %arg1: tensor<1xi32>) -> tensor { - // CHECK-NEXT: "xla_hlo.compare"(%arg0, %arg1) {comparison_direction = "NE"} - %0 = "tf.NotEqual"(%arg0, %arg1) : (tensor, tensor<1xi32>) -> tensor - return %0: tensor -} - -// CHECK-LABEL: func @notequal_broadcast -func @notequal_broadcast(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { - // CHECK-NEXT: "xla_hlo.compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "NE"} - %0 = "tf.NotEqual"(%arg0, %arg1) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> - return %0: tensor<1x2xi1> -} - -// CHECK-LABEL: func @notequal_broadcast_no_incompatible_shapes_error -func @notequal_broadcast_no_incompatible_shapes_error(%arg0: tensor<2xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { - // CHECK-NEXT: "tf.NotEqual"(%arg0, %arg1) {incompatible_shape_error = false} - %0 = "tf.NotEqual"(%arg0, %arg1) {incompatible_shape_error = false} : (tensor<2xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> - return %0: tensor<1x2xi1> -} - -// CHECK-LABEL: func @notequal_incompatible_shape_broadcastable -func @notequal_incompatible_shape_broadcastable(%arg0: tensor, %arg1: tensor<1xi32>) -> tensor { - // CHECK-NEXT: "tf.NotEqual"(%arg0, %arg1) {incompatible_shape_error = false} - %0 = "tf.NotEqual"(%arg0, %arg1) { incompatible_shape_error = false } : (tensor, tensor<1xi32>) -> tensor - return %0: tensor -} - -// CHECK-LABEL: func @notequal_incompatible_shape_dynamic -func @notequal_incompatible_shape_dynamic(%arg0: tensor<2xi32>, %arg1: tensor) -> tensor<*xi1> { - // CHECK-NEXT: "tf.NotEqual"(%arg0, %arg1) {incompatible_shape_error = false} - %0 = "tf.NotEqual"(%arg0, %arg1) { incompatible_shape_error = false } : (tensor<2xi32>, tensor) -> tensor<*xi1> - return %0: tensor<*xi1> -} - -// CHECK-LABEL: func @notequal_incompatible_shape_both_dynamic -func @notequal_incompatible_shape_both_dynamic(%arg0: tensor, %arg1: tensor) -> tensor<*xi1> { - // CHECK-NEXT: "tf.NotEqual"(%arg0, %arg1) {incompatible_shape_error = false} - %0 = "tf.NotEqual"(%arg0, %arg1) { incompatible_shape_error = false } : (tensor, tensor) -> tensor<*xi1> - return %0: tensor<*xi1> -} - -//===----------------------------------------------------------------------===// -// Compare op legalizations. 
-//===----------------------------------------------------------------------===// - -// CHECK-LABEL: func @greater -func @greater(%arg0: tensor<2xi32>) -> tensor<2xi1> { - // CHECK-NEXT: "xla_hlo.compare"(%arg0, %arg0) {comparison_direction = "GT"} - %0 = "tf.Greater"(%arg0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> - return %0: tensor<2xi1> -} - -// CHECK-LABEL: func @broadcast_greater -func @broadcast_greater(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { - // CHECK-NEXT: "xla_hlo.compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "GT"} - %0 = "tf.Greater"(%arg0, %arg1) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> - return %0: tensor<1x2xi1> -} - -// CHECK-LABEL: func @greater_dynamic -func @greater_dynamic(%arg0: tensor) -> tensor { - // CHECK: "xla_hlo.compare"(%arg0, %arg0) {comparison_direction = "GT"} - %0 = "tf.Greater"(%arg0, %arg0) : (tensor, tensor) -> tensor - return %0: tensor -} - -// CHECK-LABEL: func @greater_uranked -func @greater_uranked(%arg0: tensor<*xi32>) -> tensor<*xi1> { - // CHECK: "tf.Greater" - %0 = "tf.Greater"(%arg0, %arg0) : (tensor<*xi32>, tensor<*xi32>) -> tensor<*xi1> - return %0: tensor<*xi1> -} - -// CHECK-LABEL: func @greater_equal -func @greater_equal(%arg0: tensor<2xi32>) -> tensor<2xi1> { - // CHECK-NEXT: "xla_hlo.compare"(%arg0, %arg0) {comparison_direction = "GE"} - %0 = "tf.GreaterEqual"(%arg0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> - return %0: tensor<2xi1> -} - -// CHECK-LABEL: func @broadcast_greater_equal -func @broadcast_greater_equal(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { - // CHECK-NEXT: "xla_hlo.compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "GE"} - %0 = "tf.GreaterEqual"(%arg0, %arg1) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> - return %0: tensor<1x2xi1> -} - -// CHECK-LABEL: func @less -func @less(%arg0: tensor<2xi32>) -> tensor<2xi1> { - // CHECK-NEXT: "xla_hlo.compare"(%arg0, %arg0) {comparison_direction = "LT"} - %0 = "tf.Less"(%arg0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> - return %0: tensor<2xi1> -} - -// CHECK-LABEL: func @broadcast_less -func @broadcast_less(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { - // CHECK-NEXT: "xla_hlo.compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "LT"} - %0 = "tf.Less"(%arg0, %arg1) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> - return %0: tensor<1x2xi1> -} - -// CHECK-LABEL: func @less_equal -func @less_equal(%arg0: tensor<2xi32>) -> tensor<2xi1> { - // CHECK-NEXT: "xla_hlo.compare"(%arg0, %arg0) {comparison_direction = "LE"} - %0 = "tf.LessEqual"(%arg0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> - return %0: tensor<2xi1> -} - -// CHECK-LABEL: func @broadcast_less_equal -func @broadcast_less_equal(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { - // CHECK-NEXT: "xla_hlo.compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "LE"} - %0 = "tf.LessEqual"(%arg0, %arg1) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> - return %0: tensor<1x2xi1> -} - - //===----------------------------------------------------------------------===// // Complex op legalizations. 
//===----------------------------------------------------------------------===// @@ -1163,6 +805,26 @@ func @infeed_dequeue_tuple() -> (tensor<3xi32>, tensor<4xf32>) { return %0#0, %0#1 : tensor<3xi32>, tensor<4xf32> } +// The following op sharding is used: +// Proto debug string: +// type: TUPLE +// tuple_shardings { +// type: MAXIMAL +// tile_assignment_dimensions: 1 +// tile_assignment_devices: 0 +// } +// Serialized string: +// "\08\02*\08\08\01\1A\01\01\22\01\00" + +// CHECK-LABEL: infeed_dequeue_tuple_sharding +func @infeed_dequeue_tuple_sharding() -> tensor<8xi32> { + // CHECK: "xla_hlo.infeed" + // An additional sharding is added at the end to account for token result. + // CHECK-SAME: xla_hlo.sharding = "type: TUPLE\0Atuple_shardings {\0A type: MAXIMAL\0A tile_assignment_dimensions: 1\0A tile_assignment_devices: 0\0A}\0Atuple_shardings {\0A type: MAXIMAL\0A tile_assignment_dimensions: 1\0A tile_assignment_devices: 0\0A}\0A" + %0 = "tf.InfeedDequeueTuple"() {_XlaSharding = "\08\02*\08\08\01\1A\01\01\22\01\00"} : () -> tensor<8xi32> + return %0 : tensor<8xi32> +} + //===----------------------------------------------------------------------===// // Nullary op legalizations. //===----------------------------------------------------------------------===// @@ -1176,8 +838,10 @@ func @const() -> tensor<2xi32> { // CHECK-LABEL: @const_dynamic_output func @const_dynamic_output() -> tensor<*xi32> { - // CHECK: xla_hlo.constant {value = dense<0> : tensor<2xi32>} : tensor<*xi32> + // CHECK: [[CONST:%.*]] = xla_hlo.constant dense<0> : tensor<2xi32> + // CHECK: [[CAST:%.*]] = tensor_cast [[CONST]] : tensor<2xi32> to tensor<*xi32> %0 = "tf.Const"() {value = dense<0> : tensor<2xi32>} : () -> (tensor<*xi32>) + // CHECK: return [[CAST]] return %0: tensor<*xi32> } @@ -1271,12 +935,12 @@ func @matrix_band_part(%arg0: tensor<64x64xbf16>, %arg1: tensor, %arg2: ten // CHECK: %[[X:.*]] = "xla_hlo.iota"() {iota_dimension = 1 : i64} : () -> tensor<64x64xbf16> // CHECK: %[[Y:.*]] = "xla_hlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<64x64xbf16> // CHECK: %[[OFFSET:.*]] = xla_hlo.subtract %[[X]], %[[Y]] : tensor<64x64xbf16> - // CHECK: %[[G:.*]] = "xla_hlo.compare"(%[[F]], %[[OFFSET]]) {comparison_direction = "LE"} : (tensor, tensor<64x64xbf16>) -> tensor<*xi1> + // CHECK: %[[G:.*]] = xla_chlo.broadcast_compare %[[F]], %[[OFFSET]] {comparison_direction = "LE"} : (tensor, tensor<64x64xbf16>) -> tensor<64x64xi1> // CHECK: %[[H:.*]] = "xla_hlo.convert"(%[[D]]) : (tensor) -> tensor - // CHECK: %[[I:.*]] = "xla_hlo.compare"(%[[OFFSET]], %[[H]]) {comparison_direction = "LE"} : (tensor<64x64xbf16>, tensor) -> tensor<*xi1> + // CHECK: %[[I:.*]] = xla_chlo.broadcast_compare %[[OFFSET]], %[[H]] {comparison_direction = "LE"} : (tensor<64x64xbf16>, tensor) -> tensor<64x64xi1> - // CHECK: %[[J:.*]] = xla_hlo.and %[[G]], %[[I]] : tensor<*xi1> + // CHECK: %[[J:.*]] = xla_hlo.and %[[G]], %[[I]] : tensor<64x64xi1> // CHECK: %[[ZERO2:.*]] = xla_hlo.constant dense<0.000000e+00> : tensor<64x64xbf16> // CHECK: %[[R:.*]] = "xla_hlo.select"(%[[J]], %[[INPUT]], %[[ZERO2]]) @@ -1292,11 +956,11 @@ func @matrix_band_part_2(%arg0: tensor<12x24x48xbf16>, %arg1: tensor, %arg2 // CHECK: %[[Y:.*]] = "xla_hlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<24x48xbf16> // CHECK: %[[OFFSET:.*]] = xla_hlo.subtract %[[X]], %[[Y]] : tensor<24x48xbf16> - // CHECK: %[[G:.*]] = "xla_hlo.compare"(%[[F]], %[[OFFSET]]) {comparison_direction = "LE"} : (tensor, tensor<24x48xbf16>) -> tensor<*xi1> + // CHECK: %[[G:.*]] = 
xla_chlo.broadcast_compare %[[F]], %[[OFFSET]] {comparison_direction = "LE"} : (tensor, tensor<24x48xbf16>) -> tensor<24x48xi1> // CHECK: %[[H:.*]] = "xla_hlo.convert"(%[[D]]) : (tensor) -> tensor - // CHECK: %[[I:.*]] = "xla_hlo.compare"(%[[OFFSET]], %[[H]]) {comparison_direction = "LE"} : (tensor<24x48xbf16>, tensor) -> tensor<*xi1> - // CHECK: %[[J:.*]] = xla_hlo.and %[[G]], %[[I]] {broadcast_dimensions = dense<[1, 2]> : tensor<2xi64>} : tensor<*xi1> + // CHECK: %[[I:.*]] = xla_chlo.broadcast_compare %[[OFFSET]], %[[H]] {comparison_direction = "LE"} : (tensor<24x48xbf16>, tensor) -> tensor<24x48xi1> + // CHECK: %[[J:.*]] = xla_hlo.and %[[G]], %[[I]] : tensor<24x48xi1> // CHECK: %[[ZERO2:.*]] = xla_hlo.constant dense<0.000000e+00> : tensor<12x24x48xbf16> // CHECK: %[[R:.*]] = "xla_hlo.select"(%[[J]], %[[INPUT]], %[[ZERO2]]) @@ -1348,6 +1012,28 @@ func @maxpool_same_padding(%arg0: tensor<2x13x25x7xi32>) -> tensor<2x4x7x7xi32> return %0 : tensor<2x4x7x7xi32> } +// CHECK-LABEL: maxpool_3d_valid_padding +// CHECK-SAME: %[[ARG:.*]]: tensor +func @maxpool_3d_valid_padding(%arg0: tensor<2x8x12x20x7xf32>) -> tensor<2x8x3x5x7xf32> { + // CHECK: %[[INIT:.*]] = xla_hlo.constant dense<0xFF800000> : tensor + // CHECK: "xla_hlo.reduce_window"(%[[ARG]], %[[INIT]]) + // CHECK: xla_hlo.maximum + // CHECK: xla_hlo.return + // CHECK: {window_dimensions = dense<[1, 1, 2, 2, 1]> : tensor<5xi64>, window_strides = dense<[1, 1, 4, 4, 1]> : tensor<5xi64>} + + %0 = "tf.MaxPool3D"(%arg0) {data_format = "NDHWC", ksize = [1, 1, 2, 2, 1], padding = "VALID", strides = [1, 1, 4, 4, 1]} : (tensor<2x8x12x20x7xf32>) -> tensor<2x8x3x5x7xf32> + return %0 : tensor<2x8x3x5x7xf32> +} + +// CHECK-LABEL: maxpool_3d_same_padding +// CHECK-SAME: %[[ARG:.*]]: tensor +func @maxpool_3d_same_padding(%arg0: tensor<2x8x13x25x7xf32>) -> tensor<2x8x4x7x7xf32> { + // CHECK: padding = dense<{{\[\[}}0, 0], [0, 0], [0, 1], [1, 1], [0, 0]]> : tensor<5x2xi64> + + %0 = "tf.MaxPool3D"(%arg0) {data_format = "NDHWC", ksize = [1, 1, 2, 3, 1], padding = "SAME", strides = [1, 1, 4, 4, 1]} : (tensor<2x8x13x25x7xf32>) -> tensor<2x8x4x7x7xf32> + return %0 : tensor<2x8x4x7x7xf32> +} + //===----------------------------------------------------------------------===// // MaxPoolGrad op legalizations. 
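The padding attribute asserted for maxpool_3d_same_padding above (and for max_pool_3d_grad_same below) follows TensorFlow's usual SAME rule: the output size is ceil(input / stride), and the minimum total padding needed to cover those windows is split with the extra element on the high side. A quick hand check in Python for the NDHWC input 2x8x13x25x7 with ksize [1, 1, 2, 3, 1] and strides [1, 1, 4, 4, 1] (the helper is illustrative, not code from the patch):

import math

def same_padding(in_size, ksize, stride):
    # SAME padding: out = ceil(in / stride); pad just enough to fit that
    # many windows, with the low side getting the smaller half.
    out = math.ceil(in_size / stride)
    total = max((out - 1) * stride + ksize - in_size, 0)
    return total // 2, total - total // 2

in_shape = [2, 8, 13, 25, 7]
ksize    = [1, 1, 2, 3, 1]
strides  = [1, 1, 4, 4, 1]
print([same_padding(i, k, s) for i, k, s in zip(in_shape, ksize, strides)])
# [(0, 0), (0, 0), (0, 1), (1, 1), (0, 0)]  -- matches the dense<...> padding above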
//===----------------------------------------------------------------------===// @@ -1376,6 +1062,25 @@ func @max_pool_grad_valid(%orig_input: tensor<10x24x24x64xf32>, %orig_output: te return %result : tensor<10x24x24x64xf32> } +// CHECK-LABEL: @max_pool_3d_grad_valid +// CHECK-SAME: %[[INPUT:.*]]: tensor<10x8x24x24x64xf32>, %arg1: tensor<10x8x12x12x64xf32>, %[[GRAD:.*]]: tensor<10x8x12x12x64xf32> +func @max_pool_3d_grad_valid(%orig_input: tensor<10x8x24x24x64xf32>, %orig_output: tensor<10x8x12x12x64xf32>, %grad: tensor<10x8x12x12x64xf32>) -> tensor<10x8x24x24x64xf32> { + // CHECK: %[[ZERO:.*]] = xla_hlo.constant dense<0.000000e+00> : tensor + // CHECK: %[[RESULT:.*]] = "xla_hlo.select_and_scatter"(%[[INPUT]], %[[GRAD]], %[[ZERO]]) ( { + // CHECK: ^bb0(%[[VALUE_A:.*]]: tensor, %[[VALUE_B:.*]]: tensor): + // CHECK: %[[SELECT_RESULT:.*]] = "xla_hlo.compare"(%[[VALUE_A]], %[[VALUE_B]]) {comparison_direction = "GE"} : (tensor, tensor) -> tensor + // CHECK: "xla_hlo.return"(%[[SELECT_RESULT]]) : (tensor) -> () + // CHECK: }, { + // CHECK: ^bb0(%[[VALUE_A:.*]]: tensor, %[[VALUE_B:.*]]: tensor): + // CHECK: %[[SELECT_RESULT:.*]] = xla_hlo.add %[[VALUE_A]], %[[VALUE_B]] : tensor + // CHECK: "xla_hlo.return"(%[[SELECT_RESULT]]) : (tensor) -> () + // CHECK: }) {window_dimensions = dense<[1, 1, 2, 2, 1]> : tensor<5xi64>, window_strides = dense<[1, 1, 2, 2, 1]> : tensor<5xi64>} : (tensor<10x8x24x24x64xf32>, tensor<10x8x12x12x64xf32>, tensor) -> tensor<10x8x24x24x64xf32> + // CHECK: return %[[RESULT]] : tensor<10x8x24x24x64xf32> + // CHECK: } + %result = "tf.MaxPool3DGrad"(%orig_input, %orig_output, %grad) {data_format = "NDHWC", ksize = [1, 1, 2, 2, 1], padding = "VALID", strides = [1, 1, 2, 2, 1]} : (tensor<10x8x24x24x64xf32>, tensor<10x8x12x12x64xf32>, tensor<10x8x12x12x64xf32>) -> tensor<10x8x24x24x64xf32> + return %result : tensor<10x8x24x24x64xf32> +} + // CHECK-LABEL: @max_pool_grad_same func @max_pool_grad_same(%orig_input: tensor<2x13x25x7xf32>, %orig_output: tensor<2x4x7x7xf32>, %grad: tensor<2x4x7x7xf32>) -> tensor<2x13x25x7xf32> { // CHECK: padding = dense<{{\[\[}}0, 0], [0, 1], [1, 1], [0, 0]]> : tensor<4x2xi64> @@ -1388,6 +1093,13 @@ func @max_pool_grad_same(%orig_input: tensor<2x13x25x7xf32>, %orig_output: tenso return %result : tensor<2x13x25x7xf32> } +// CHECK-LABEL: @max_pool_3d_grad_same +func @max_pool_3d_grad_same(%orig_input: tensor<2x8x13x25x7xf32>, %orig_output: tensor<2x8x4x7x7xf32>, %grad: tensor<2x8x4x7x7xf32>) -> tensor<2x8x13x25x7xf32> { + // CHECK: padding = dense<{{\[\[}}0, 0], [0, 0], [0, 1], [1, 1], [0, 0]]> : tensor<5x2xi64> + %result = "tf.MaxPool3DGrad"(%orig_input, %orig_output, %grad) {data_format = "NDHWC", ksize = [1, 1, 2, 3, 1], padding = "SAME", strides = [1, 1, 4, 4, 1]} : (tensor<2x8x13x25x7xf32>, tensor<2x8x4x7x7xf32>, tensor<2x8x4x7x7xf32>) -> tensor<2x8x13x25x7xf32> + return %result : tensor<2x8x13x25x7xf32> +} + //===----------------------------------------------------------------------===// // OneHot op legalizations. 
//===----------------------------------------------------------------------===// @@ -1395,12 +1107,13 @@ func @max_pool_grad_same(%orig_input: tensor<2x13x25x7xf32>, %orig_output: tenso // CHECK-LABEL:one_hot func @one_hot(%indices: tensor<3xi32>, %on_value: tensor, %off_value: tensor) -> tensor<3x5xf32> { // CHECK: %[[IOTA:.*]] = "xla_hlo.iota"() {iota_dimension = 1 : i64} : () -> tensor<3x5xi32> - // CHECK: %[[COMPARE:.*]] = "xla_hlo.compare"(%arg0, %[[IOTA]]) {broadcast_dimensions = dense<0> : tensor<1xi64>, comparison_direction = "EQ"} : (tensor<3xi32>, tensor<3x5xi32>) -> tensor<3x5xi1> + // CHECK: %[[BCAST_ARG0:.+]] = "xla_hlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<3xi32>) -> tensor<3x5xi32> + // CHECK: %[[COMPARE:.*]] = "xla_hlo.compare"(%[[BCAST_ARG0]], %[[IOTA]]) {comparison_direction = "EQ"} : (tensor<3x5xi32>, tensor<3x5xi32>) -> tensor<3x5xi1> // CHECK: %[[ON_VALUE:.*]] = "xla_hlo.broadcast"(%arg1) {broadcast_sizes = dense<[3, 5]> : tensor<2xi64>} : (tensor) -> tensor<3x5xf32> // CHECK: %[[OFF_VALUE:.*]] = "xla_hlo.broadcast"(%arg2) {broadcast_sizes = dense<[3, 5]> : tensor<2xi64>} : (tensor) -> tensor<3x5xf32> // CHECK: %[[RESULT:.*]] = "xla_hlo.select"(%[[COMPARE]], %[[ON_VALUE]], %[[OFF_VALUE]]) : (tensor<3x5xi1>, tensor<3x5xf32>, tensor<3x5xf32>) -> tensor<3x5xf32> // CHECK: return %[[RESULT]] : tensor<3x5xf32> - %depth = "tf.Const"() { value = dense<5> : tensor } : () -> tensor + %depth = "tf.Const"() { value = dense<5> : tensor } : () -> tensor %result = "tf.OneHot"(%indices, %depth, %on_value, %off_value) {axis = -1 : i64} : (tensor<3xi32>, tensor, tensor, tensor) -> tensor<3x5xf32> return %result : tensor<3x5xf32> } @@ -1487,6 +1200,44 @@ func @unhandled_partitioned_call_2(%arg0: tensor, %arg1: tensor<*xi32>) -> return %0, %1 : tensor, tensor } + +//===----------------------------------------------------------------------===// +// ReverseV2 op legalization. +//===----------------------------------------------------------------------===// + +// CHECK-LABEL: @reverse_func_32 +func @reverse_func_32(%arg0: tensor<5xi32>) -> tensor<5xi32> { + %axis = "tf.Const"() {value = dense<0> : tensor<1xi32>} : () -> (tensor<1xi32>) + + // CHECK: [[VAL:%.+]] = "xla_hlo.reverse"(%arg0) {dimensions = dense<0> : tensor<1xi64>} + %reversed = "tf.ReverseV2"(%arg0, %axis) : (tensor<5xi32>, tensor<1xi32>) -> tensor<5xi32> + + // CHECK: return [[VAL]] : tensor<5xi32> + return %reversed : tensor<5xi32> +} + +// CHECK-LABEL: @reverse_func_64 +func @reverse_func_64(%arg0: tensor<5xi32>) -> tensor<5xi32> { + %axis = "tf.Const"() {value = dense<0> : tensor<1xi64>} : () -> (tensor<1xi64>) + + // CHECK: [[VAL:%.+]] = "xla_hlo.reverse"(%arg0) {dimensions = dense<0> : tensor<1xi64>} + %reversed = "tf.ReverseV2"(%arg0, %axis) : (tensor<5xi32>, tensor<1xi64>) -> tensor<5xi32> + + // CHECK: return [[VAL]] : tensor<5xi32> + return %reversed : tensor<5xi32> +} + +// CHECK-LABEL: @reverse_func_neg +func @reverse_func_neg(%arg0: tensor<5x5xi32>) -> tensor<5x5xi32> { + %axis = "tf.Const"() {value = dense<[-1]> : tensor<1xi32>} : () -> (tensor<1xi32>) + + // CHECK: [[VAL:%.+]] = "xla_hlo.reverse"(%arg0) {dimensions = dense<1> : tensor<1xi64>} + %reversed = "tf.ReverseV2"(%arg0, %axis) : (tensor<5x5xi32>, tensor<1xi32>) -> tensor<5x5xi32> + + // CHECK: return [[VAL]] : tensor<5x5xi32> + return %reversed : tensor<5x5xi32> +} + //===----------------------------------------------------------------------===// // StatefulPartitionedCall op legalization. 
//===----------------------------------------------------------------------===// @@ -1522,7 +1273,7 @@ func @stateful_pcall_multi_in_out(%arg0: tensor, %arg1: tensor) -> (te // CHECK-LABEL: func @relu func @relu(%arg0: tensor<1xi32>) -> tensor<1xi32> { // CHECK: %[[ZERO:.*]] = xla_hlo.constant dense<0> : tensor - // CHECK: "xla_hlo.maximum"(%[[ZERO]], %arg0) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor, tensor<1xi32>) -> tensor<1xi32> + // CHECK: xla_chlo.broadcast_maximum %[[ZERO]], %arg0 {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor, tensor<1xi32>) -> tensor<1xi32> %0 = "tf.Relu"(%arg0) : (tensor<1xi32>) -> tensor<1xi32> return %0: tensor<1xi32> } @@ -1530,7 +1281,7 @@ func @relu(%arg0: tensor<1xi32>) -> tensor<1xi32> { // CHECK-LABEL: func @relu_unranked func @relu_unranked(%arg0: tensor) -> tensor { // CHECK: %[[ZERO:.*]] = xla_hlo.constant dense<0> : tensor - // CHECK: "xla_hlo.maximum"(%[[ZERO]], %arg0) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor, tensor) -> tensor + // CHECK: xla_chlo.broadcast_maximum %[[ZERO]], %arg0 {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor, tensor) -> tensor %0 = "tf.Relu"(%arg0) : (tensor) -> tensor return %0: tensor } @@ -1558,8 +1309,8 @@ func @relu6_unranked(%arg0: tensor) -> tensor { func @relu_grad(%gradients: tensor<4x8xf32>, %features: tensor) -> tensor<4x8xf32> { // CHECK-DAG: %[[ZERO_SCALAR:.*]] = xla_hlo.constant dense<0.000000e+00> : tensor // CHECK-DAG: %[[ZERO:.*]] = xla_hlo.constant dense<0.000000e+00> : tensor<4x8xf32> - // CHECK-DAG: %[[PRED:.*]] = "xla_hlo.compare"(%[[FEATURES]], %[[ZERO_SCALAR]]) {comparison_direction = "GT"} : (tensor, tensor) -> tensor<*xi1> - // CHECK-DAG: %[[RESULT:.*]] = "xla_hlo.select"(%[[PRED]], %[[GRADIENTS]], %[[ZERO]]) : (tensor<*xi1>, tensor<4x8xf32>, tensor<4x8xf32>) -> tensor<4x8xf32> + // CHECK-DAG: %[[PRED:.*]] = xla_chlo.broadcast_compare %[[FEATURES]], %[[ZERO_SCALAR]] {comparison_direction = "GT"} : (tensor, tensor) -> tensor + // CHECK-DAG: %[[RESULT:.*]] = "xla_hlo.select"(%[[PRED]], %[[GRADIENTS]], %[[ZERO]]) : (tensor, tensor<4x8xf32>, tensor<4x8xf32>) -> tensor<4x8xf32> // CHECK-DAG: return %[[RESULT]] : tensor<4x8xf32> %2 = "tf.ReluGrad"(%gradients, %features) : (tensor<4x8xf32>, tensor) -> tensor<4x8xf32> return %2 : tensor<4x8xf32> @@ -1569,27 +1320,6 @@ func @relu_grad(%gradients: tensor<4x8xf32>, %features: tensor) -> tens // Select op legalizations. 
//===----------------------------------------------------------------------===// -// CHECK-LABEL: func @select -func @select(%arg0: tensor<2xi1>, %arg1: tensor<2xi32>, %arg2: tensor<2xi32>) -> tensor<2xi32> { - // CHECK-NEXT: "xla_hlo.select"(%arg0, %arg1, %arg2) - %0 = "tf.Select"(%arg0, %arg1, %arg2) : (tensor<2xi1>, tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> - return %0: tensor<2xi32> -} - -// CHECK-LABEL: func @select_float -func @select_float(%arg0: tensor<2xi1>, %arg1: tensor<2xf32>, %arg2: tensor<2xf32>) -> tensor<2xf32> { - // CHECK-NEXT: "xla_hlo.select"(%arg0, %arg1, %arg2) - %0 = "tf.Select"(%arg0, %arg1, %arg2) : (tensor<2xi1>, tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> - return %0: tensor<2xf32> -} - -// CHECK-LABEL: func @select_multidimensional -func @select_multidimensional(%arg0: tensor<3x2xi1>, %arg1: tensor<3x2xi32>, %arg2: tensor<3x2xi32>) -> tensor<3x2xi32> { - // CHECK-NEXT: "xla_hlo.select"(%arg0, %arg1, %arg2) - %0 = "tf.Select"(%arg0, %arg1, %arg2) : (tensor<3x2xi1>, tensor<3x2xi32>, tensor<3x2xi32>) -> tensor<3x2xi32> - return %0: tensor<3x2xi32> -} - // CHECK-LABEL: func @selectv2 func @selectv2(%arg0: tensor<2xi1>, %arg1: tensor<2xi32>, %arg2: tensor<2xi32>) -> tensor<2xi32> { // CHECK-NEXT: "xla_hlo.select"(%arg0, %arg1, %arg2) @@ -1628,6 +1358,14 @@ func @selectv2_broadcast_pred(%arg0: tensor<1xi1>, %arg1: tensor<2x8x8xi32>, %ar return %0: tensor<2x8x8xi32> } +// CHECK-LABEL: func @selectv2_broadcast_tensor_pred +func @selectv2_broadcast_tensor_pred(%arg0: tensor<3xi1>, %arg1: tensor<2x3xf16>, %arg2: tensor<2x3xf16>) -> tensor<2x3xf16> { + // CHECK: %[[BROADCAST:.*]] = "xla_hlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<3xi1>) -> tensor<2x3xi1> + // CHECK: "xla_hlo.select"(%[[BROADCAST]], %arg1, %arg2) + %0 = "tf.SelectV2"(%arg0, %arg1, %arg2) : (tensor<3xi1>, tensor<2x3xf16>, tensor<2x3xf16>) -> tensor<2x3xf16> + return %0: tensor<2x3xf16> +} + // CHECK-LABEL: func @selectv2_broadcast_all func @selectv2_broadcast_all(%arg0: tensor<8x1x1xi1>, %arg1: tensor<1x8x1xi32>, %arg2: tensor<1x1x8xi32>) -> tensor<8x8x8xi32> { // CHECK-DAG: %[[BROADCAST_0:.*]] = "xla_hlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<8x1x1xi1>) -> tensor<8x8x8xi1> @@ -1669,7 +1407,10 @@ func @simple_softmax(%arg0: tensor<2x3xf32>) -> tensor<2x3xf32> { // CHECK: {dimensions = dense<1> : tensor<1xi64>} : (tensor<2x3xf32>, tensor) -> tensor<2xf32> // CHECK: %[[CASTED_MAX:.*]] = "xla_hlo.convert"(%[[MAX]]) : (tensor<2xf32>) -> tensor<2xf32> - // CHECK: %[[SHIFTED_INP:.*]] = "xla_hlo.subtract"(%[[ARG0]], %[[CASTED_MAX]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} + // CHECK: %[[RESULT_SHAPE:.+]] = shape.shape_of %[[ARG0]] + // CHECK: %[[RESULT_EXTENTS:.+]] = "shape.to_extent_tensor"(%[[RESULT_SHAPE]]) : (!shape.shape) -> tensor<2xindex> + // CHECK: %[[BCAST_MAX:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[CASTED_MAX]], %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} + // CHECK: %[[SHIFTED_INP:.*]] = xla_hlo.subtract %[[ARG0]], %[[BCAST_MAX]] // CHECK: %[[EXP:.*]] = "xla_hlo.exponential"(%[[SHIFTED_INP]]) // Verify reduce op for summation and its body. 
@@ -1681,8 +1422,11 @@ func @simple_softmax(%arg0: tensor<2x3xf32>) -> tensor<2x3xf32> { // CHECK: {dimensions = dense<1> : tensor<1xi64>} // CHECK: %[[CASTED_SUM:.*]] = "xla_hlo.convert"(%[[SUM]]) : (tensor<2xf32>) -> tensor<2xf32> - // CHECK: %[[RESULT:.*]] = "xla_hlo.divide"(%[[EXP]], %[[CASTED_SUM]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} - // return %[[RESULT]] + // CHECK: %[[RESULT_SHAPE:.+]] = shape.shape_of %[[ARG0]] + // CHECK: %[[RESULT_EXTENTS:.+]] = "shape.to_extent_tensor"(%[[RESULT_SHAPE]]) : (!shape.shape) -> tensor<2xindex> + // CHECK: %[[BCAST_SUM:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[CASTED_SUM]], %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} + // CHECK: %[[RESULT:.*]] = xla_hlo.divide %[[EXP]], %[[BCAST_SUM]] + // CHECK: return %[[RESULT]] %0 = "tf.Softmax"(%arg0) : (tensor<2x3xf32>) -> tensor<2x3xf32> return %0: tensor<2x3xf32> @@ -1691,7 +1435,7 @@ func @simple_softmax(%arg0: tensor<2x3xf32>) -> tensor<2x3xf32> { // Verify intermediate and final shape are correct with dynamic shapes. // CHECK-LABEL: func @dynamic_softmax func @dynamic_softmax(%arg0: tensor) -> tensor { - // CHECK: "xla_hlo.divide"({{.*}}) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor, tensor) -> tensor + // CHECK: xla_hlo.divide {{.*}} : tensor %0 = "tf.Softmax"(%arg0) : (tensor) -> tensor return %0: tensor } @@ -1717,43 +1461,29 @@ func @rank4_softmax(%arg0: tensor<2x3x4x5xf16>) -> tensor<2x3x4x5xf16> { // CHECK: "xla_hlo.reduce" // CHECK: dimensions = dense<3> - // CHECK: "xla_hlo.divide"{{.*}} {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} + // CHECK: {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} + // CHECK: xla_hlo.divide {{.*}} %0 = "tf.Softmax"(%arg0) : (tensor<2x3x4x5xf16>) -> tensor<2x3x4x5xf16> return %0: tensor<2x3x4x5xf16> } //===----------------------------------------------------------------------===// // LogSoftmax op legalizations. +// This just changes the tail of the regular Softmax legalization //===----------------------------------------------------------------------===// // CHECK-LABEL: func @simple_logsoftmax // CHECK-SAME: (%[[ARG0:.*]]: tensor<2x3xf32>) func @simple_logsoftmax(%arg0: tensor<2x3xf32>) -> tensor<2x3xf32> { - - // Verify reduce op for max computation and its body. - // CHECK-DAG: %[[CASTED_INP:.*]] = "xla_hlo.convert"(%[[ARG0]]) : (tensor<2x3xf32>) -> tensor<2x3xf32> - // CHECK-DAG: %[[NEG_INF:.*]] = xla_hlo.constant dense<0xFF800000> : tensor - // CHECK: %[[MAX:.*]] = "xla_hlo.reduce"(%[[CASTED_INP]], %[[NEG_INF]]) - // CHECK: xla_hlo.maximum - // CHECK: "xla_hlo.return" - // CHECK: {dimensions = dense<1> : tensor<1xi64>} : (tensor<2x3xf32>, tensor) -> tensor<2xf32> - // CHECK: %[[CASTED_MAX:.*]] = "xla_hlo.convert"(%[[MAX]]) : (tensor<2xf32>) -> tensor<2xf32> - - // CHECK: %[[SHIFTED_INP:.*]] = "xla_hlo.subtract"(%[[ARG0]], %[[CASTED_MAX]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} - // CHECK: %[[EXP:.*]] = "xla_hlo.exponential"(%[[SHIFTED_INP]]) - - // Verify reduce op for summation and its body. 
- // CHECK-DAG: %[[CASTED_EXP:.*]] = "xla_hlo.convert"(%[[EXP]]) : (tensor<2x3xf32>) -> tensor<2x3xf32> - // CHECK-DAG: %[[ZERO:.*]] = xla_hlo.constant dense<0.000000e+00> : tensor - // CHECK: %[[SUM:.*]] = "xla_hlo.reduce"(%[[CASTED_EXP]], %[[ZERO]]) - // CHECK: xla_hlo.add - // CHECK: "xla_hlo.return" - // CHECK: {dimensions = dense<1> : tensor<1xi64>} + // CHECK: %{{.*}} = "xla_hlo.reduce"({{.*}}) + // CHECK: %[[SUM:.*]] = "xla_hlo.reduce"({{.*}}) // CHECK: %[[CASTED_SUM:.*]] = "xla_hlo.convert"(%[[SUM]]) : (tensor<2xf32>) -> tensor<2xf32> // CHECK: %[[LOG:.*]] = "xla_hlo.log"(%[[CASTED_SUM]]) : (tensor<2xf32>) -> tensor<2xf32> - - // CHECK: %[[RESULT:.*]] = "xla_hlo.subtract"(%[[SHIFTED_INP]], %[[LOG]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} - // return %[[RESULT]] + // CHECK: %[[RESULT_SHAPE:.+]] = shape.shape_of %[[ARG0]] + // CHECK: %[[RESULT_EXTENTS:.+]] = "shape.to_extent_tensor"(%[[RESULT_SHAPE]]) : (!shape.shape) -> tensor<2xindex> + // CHECK: %[[BCAST_SUM:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[LOG]], %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} + // CHECK: %[[RESULT:.*]] = xla_hlo.subtract {{.*}}, %[[BCAST_SUM]] + // CHECK: return %[[RESULT]] %0 = "tf.LogSoftmax"(%arg0) : (tensor<2x3xf32>) -> tensor<2x3xf32> return %0: tensor<2x3xf32> @@ -2064,6 +1794,17 @@ func @sigmoid(%arg0: tensor<2xf32>) -> tensor<2xf32> { return %0 : tensor<2xf32> } +// CHECK-LABEL: @sigmoid_grad +func @sigmoid_grad(%arg0: tensor<2xf32>, %arg1: tensor<2xf32>) -> tensor<2xf32> { + // CHECK-DAG: [[MUL0:%.+]] = xla_hlo.multiply %arg1, %arg0 : tensor<2xf32> + // CHECK-DAG: [[ONE:%.+]] = xla_hlo.constant dense<1.000000e+00> : tensor<2xf32> + // CHECK-DAG: [[SUB:%.+]] = xla_hlo.subtract [[ONE]], %arg0 : tensor<2xf32> + // CHECK-DAG: [[MUL1:%.+]] = xla_hlo.multiply [[MUL0]], [[SUB]] : tensor<2xf32> + // CHECK: return [[MUL1]] + %0 = "tf.SigmoidGrad"(%arg0, %arg1) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> + return %0 : tensor<2xf32> +} + // CHECK-LABEL: @sin func @sin(%arg0: tensor<2xf32>) -> tensor<2xf32> { // CHECK: "xla_hlo.sine"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> @@ -2085,7 +1826,6 @@ func @sin_unranked(%arg0: tensor<*xf32>) -> tensor<*xf32> { return %0 : tensor<*xf32> } - // CHECK-LABEL: func @rsqrt func @rsqrt(%arg0: tensor<2xf32>) -> tensor<2xf32> { // CHECK: "xla_hlo.rsqrt"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> @@ -2248,8 +1988,18 @@ func @sign(%arg0: tensor<1x2x3x4xf32>) -> tensor<1x2x3x4xf32> { // CHECK-LABEL: slice_constant_start func @slice_constant_start(%arg0: tensor<4xi32>) -> tensor<2xi32> { // CHECK: %[[START:.*]] = xla_hlo.constant dense<1> : tensor<1xi64> - // CHECK: %[[START_I64:.*]] = "xla_hlo.convert"(%[[START]]) : (tensor<1xi64>) -> tensor<1xi64> - // CHECK: %[[RESULT:.*]] = "xla_hlo.dynamic-slice"(%arg0, %[[START_I64]]) {slice_sizes = dense<2> : tensor<1xi64>} : (tensor<4xi32>, tensor<1xi64>) -> tensor<2xi32> + // CHECK: %[[CAST:.*]] = tensor_cast %[[START]] : tensor<1xi64> to tensor<1xi64> + // CHECK: %[[START_I64:.*]] = "xla_hlo.convert"(%[[CAST]]) : (tensor<1xi64>) -> tensor<1xi64> + // CHECK: %[[SLICED_START:.*]] = "xla_hlo.slice"(%[[START_I64]]) + // CHECK-DAG-SAME: {limit_indices = dense<1> : tensor<1xi64>, + // CHECK-DAG-SAME: start_indices = dense<0> : tensor<1xi64>, + // CHECK-DAG-SAME: strides = dense<1> : tensor<1xi64>} : + // CHECK-DAG-SAME: (tensor<1xi64>) -> tensor<1xi64> + // CHECK: %[[RESHAPED_START:.*]] = "xla_hlo.reshape"(%[[SLICED_START:.*]]) : + // CHECK-DAG-SAME: (tensor<1xi64>) -> tensor + // CHECK: 
%[[RESULT:.*]] = "xla_hlo.dynamic-slice"(%arg0, %[[RESHAPED_START]]) + // CHECK-DAG-SAME: {slice_sizes = dense<2> : tensor<1xi64>} : + // CHECK-DAG-SAME: (tensor<4xi32>, tensor) -> tensor<2xi32> // CHECK: return %[[RESULT]] : tensor<2xi32> %starts = "tf.Const"() {value = dense<[1]> : tensor<1xi64>} : () -> (tensor<1xi64>) %sizes = "tf.Const"() {value = dense<[2]> : tensor<1xi64>} : () -> (tensor<1xi64>) @@ -2260,8 +2010,14 @@ func @slice_constant_start(%arg0: tensor<4xi32>) -> tensor<2xi32> { // CHECK-LABEL: slice_i32_consts func @slice_i32_consts(%arg0: tensor<4xi32>) -> tensor<2xi32> { // CHECK: %[[START:.*]] = xla_hlo.constant dense<1> : tensor<1xi32> - // CHECK: %[[START_I64:.*]] = "xla_hlo.convert"(%[[START]]) : (tensor<1xi32>) -> tensor<1xi64> - // CHECK: slice_sizes = dense<2> : tensor<1xi64> + // CHECK: %[[START_CAST:.*]] = tensor_cast %[[START]] : tensor<1xi32> to tensor<1xi32> + // CHECK: %[[START_I64:.*]] = "xla_hlo.convert"(%[[START_CAST]]) : (tensor<1xi32>) -> tensor<1xi64> + // CHECK: %[[SLICED_START:.*]] = "xla_hlo.slice"(%[[START_I64]]) + // CHECK-DAG-SAME: {limit_indices = dense<1> : tensor<1xi64>, + // CHECK-DAG-SAME: start_indices = dense<0> : tensor<1xi64>, + // CHECK-DAG-SAME: strides = dense<1> : tensor<1xi64>} : (tensor<1xi64>) -> tensor<1xi64> + // CHECK: %[[RESHAPED_START:.*]] = "xla_hlo.reshape"(%[[SLICED_START]]) : (tensor<1xi64>) -> tensor + // CHECK: "xla_hlo.dynamic-slice"(%arg0, %[[RESHAPED_START]]) {slice_sizes = dense<2> : tensor<1xi64>} : (tensor<4xi32>, tensor) -> tensor<2xi32> %starts = "tf.Const"() {value = dense<[1]> : tensor<1xi32>} : () -> (tensor<1xi32>) %sizes = "tf.Const"() {value = dense<[2]> : tensor<1xi32>} : () -> (tensor<1xi32>) %0 = "tf.Slice"(%arg0, %starts, %sizes) : (tensor<4xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32> @@ -2271,8 +2027,14 @@ func @slice_i32_consts(%arg0: tensor<4xi32>) -> tensor<2xi32> { // CHECK-LABEL: slice_constant_start_negative_one_size func @slice_constant_start_negative_one_size(%arg0: tensor<4xi32>) -> tensor<3xi32> { // CHECK: %[[START:.*]] = xla_hlo.constant dense<1> : tensor<1xi64> - // CHECK: %[[START_I64:.*]] = "xla_hlo.convert"(%[[START]]) : (tensor<1xi64>) -> tensor<1xi64> - // CHECK: %[[RESULT:.*]] = "xla_hlo.dynamic-slice"(%arg0, %[[START_I64]]) {slice_sizes = dense<3> : tensor<1xi64>} : (tensor<4xi32>, tensor<1xi64>) -> tensor<3xi32> + // CHECK: %[[START_CAST:.*]] = tensor_cast %[[START]] : tensor<1xi64> to tensor<1xi64> + // CHECK: %[[START_I64:.*]] = "xla_hlo.convert"(%[[START_CAST]]) : (tensor<1xi64>) -> tensor<1xi64> + // CHECK: %[[SLICED_START:.*]] = "xla_hlo.slice"(%[[START_I64]]) + // CHECK-DAG-SAME: {limit_indices = dense<1> : tensor<1xi64>, + // CHECK-DAG-SAME: start_indices = dense<0> : tensor<1xi64>, + // CHECK-DAG-SAME: strides = dense<1> : tensor<1xi64>} : (tensor<1xi64>) -> tensor<1xi64> + // CHECK: %[[RESHAPED_START:.*]] = "xla_hlo.reshape"(%[[SLICED_START]]) : (tensor<1xi64>) -> tensor + // CHECK: %[[RESULT:.*]] = "xla_hlo.dynamic-slice"(%arg0, %[[RESHAPED_START]]) {slice_sizes = dense<3> : tensor<1xi64>} : (tensor<4xi32>, tensor) -> tensor<3xi32> // CHECK: return %[[RESULT]] : tensor<3xi32> %starts = "tf.Const"() {value = dense<[1]> : tensor<1xi64>} : () -> (tensor<1xi64>) %sizes = "tf.Const"() {value = dense<[-1]> : tensor<1xi64>} : () -> (tensor<1xi64>) @@ -2283,8 +2045,26 @@ func @slice_constant_start_negative_one_size(%arg0: tensor<4xi32>) -> tensor<3xi // CHECK-LABEL: slice_constant_start_dynamic_shape func @slice_constant_start_dynamic_shape(%arg0: tensor, %arg1: 
tensor<2xi64>) -> tensor<1x4xi32> { // CHECK: %[[START:.*]] = xla_hlo.constant dense<[1, 0]> : tensor<2xi64> - // CHECK: %[[START_I64:.*]] = "xla_hlo.convert"(%[[START]]) : (tensor<2xi64>) -> tensor<2xi64> - // CHECK: %[[RESULT:.*]] = "xla_hlo.dynamic-slice"(%arg0, %[[START_I64]]) {slice_sizes = dense<[1, 4]> : tensor<2xi64>} : (tensor, tensor<2xi64>) -> tensor<1x4xi32> + // CHECK: %[[START_CAST:.*]] = tensor_cast %[[START]] : tensor<2xi64> to tensor<2xi64> + // CHECK: %[[START_I64:.*]] = "xla_hlo.convert"(%[[START_CAST]]) : (tensor<2xi64>) -> tensor<2xi64> + // CHECK: %[[SLICED_START1:.*]] = "xla_hlo.slice"(%[[START_I64]]) + // CHECK-DAG-SAME: {limit_indices = dense<1> : tensor<1xi64>, + // CHECK-DAG-SAME: start_indices = dense<0> : tensor<1xi64>, + // CHECK-DAG-SAME: strides = dense<1> : tensor<1xi64>} : + // CHECK-DAG-SAME: (tensor<2xi64>) -> tensor<1xi64> + // CHECK: %[[RESHAPED_START1:.*]] = "xla_hlo.reshape"(%[[SLICED_START1]]) : + // CHECK-DAG-SAME: (tensor<1xi64>) -> tensor + // CHECK: %[[SLICED_START2:.*]] = "xla_hlo.slice"(%[[START_I64]]) + // CHECK-DAG-SAME: {limit_indices = dense<2> : tensor<1xi64>, + // CHECK-DAG-SAME: start_indices = dense<1> : tensor<1xi64>, + // CHECK-DAG-SAME: strides = dense<1> : tensor<1xi64>} : + // CHECK-DAG-SAME: (tensor<2xi64>) -> tensor<1xi64> + // CHECK: %[[RESHAPED_START2:.*]] = "xla_hlo.reshape"(%[[SLICED_START2]]) : + // CHECK-DAG-SAME: (tensor<1xi64>) -> tensor + // CHECK: %[[RESULT:.*]] = "xla_hlo.dynamic-slice" + // CHECK-DAG-SAME: (%arg0, %[[RESHAPED_START1]], %[[RESHAPED_START2]]) + // CHECK-DAG-SAME: {slice_sizes = dense<[1, 4]> : tensor<2xi64>} : + // CHECK-DAG-SAME: (tensor, tensor, tensor) -> tensor<1x4xi32> // CHECK: return %[[RESULT]] : tensor<1x4xi32> %starts = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi64>} : () -> (tensor<2xi64>) %sizes = "tf.Const"() {value = dense<[1, 4]> : tensor<2xi64>} : () -> (tensor<2xi64>) @@ -2295,7 +2075,14 @@ func @slice_constant_start_dynamic_shape(%arg0: tensor, %arg1: tensor<2 // CHECK-LABEL: slice_variable_start func @slice_variable_start(%arg0: tensor<3x4xi32>, %arg1: tensor<2xi64>) -> tensor<1x4xi32> { // CHECK: %[[START_I64:.*]] = "xla_hlo.convert"(%arg1) : (tensor<2xi64>) -> tensor<2xi64> - // CHECK: %[[RESULT:.*]] = "xla_hlo.dynamic-slice"(%arg0, %[[START_I64]]) {slice_sizes = dense<[1, 4]> : tensor<2xi64>} : (tensor<3x4xi32>, tensor<2xi64>) -> tensor<1x4xi32> + // CHECK: %[[SLICED_START1:.*]] = "xla_hlo.slice"(%[[START_I64]]) + // CHECK-DAG-SAME: {limit_indices = dense<1> : tensor<1xi64>, + // CHECK-DAG-SAME: start_indices = dense<0> : tensor<1xi64>, + // CHECK-DAG-SAME: strides = dense<1> : tensor<1xi64>} : (tensor<2xi64>) -> tensor<1xi64> + // CHECK: %[[RESHAPED_START1:.*]] = "xla_hlo.reshape"(%[[SLICED_START1]]) : (tensor<1xi64>) -> tensor + // CHECK: %[[SLICED_START2:.*]] = "xla_hlo.slice"(%[[START_I64]]) {limit_indices = dense<2> : tensor<1xi64>, start_indices = dense<1> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>} : (tensor<2xi64>) -> tensor<1xi64> + // CHECK: %[[RESHAPED_START2:.*]] = "xla_hlo.reshape"(%[[SLICED_START2]]) : (tensor<1xi64>) -> tensor + // CHECK: %[[RESULT:.*]] = "xla_hlo.dynamic-slice"(%arg0, %[[RESHAPED_START1]], %[[RESHAPED_START2]]) {slice_sizes = dense<[1, 4]> : tensor<2xi64>} : (tensor<3x4xi32>, tensor, tensor) -> tensor<1x4xi32> // CHECK: return %[[RESULT]] : tensor<1x4xi32> %sizes = "tf.Const"() {value = dense<[1, 4]> : tensor<2xi64>} : () -> (tensor<2xi64>) %0 = "tf.Slice"(%arg0, %arg1, %sizes) : (tensor<3x4xi32>, tensor<2xi64>, tensor<2xi64>) -> 
tensor<1x4xi32> @@ -2380,8 +2167,8 @@ func @strided_slice_begin_end_mask(%input: tensor<4x128x1024xf32>) { // Begin: 1, 4, -3 // End: 8, 65, 42 // Stride: 1, 4, -1 - // Begin mask: 1, 0, 0 (= 1) - // End mask: 0, 0, 1 (= 4) + // Begin mask: 0, 0, 1 (= 1) + // End mask: 1, 0, 0 (= 4) // So result shape: // Dim #0: begin mask (1) -> begin = 0; end 8 canonicalized to 4: so 4 @@ -2528,6 +2315,142 @@ func @strided_slice_implicit_ellipsis_mask(%input: tensor<10x16x2xf32>) -> tenso return %0 : tensor<2x16x2xf32> } +// CHECK-LABEL: strided_slice_nonconstant_begin_end +func @strided_slice_nonconstant_begin_end(%arg0: tensor, %arg1: tensor<32x1x97xi32>) -> (tensor<1x97xi32>) { + // In this case, the `begin` and `end` inputs are unknown at compile time -- + // so the StridedSlice needs to slice these vectors and use that as input to + // an HLO dynamic slice. + %begin = "tf.Pack"(%arg0) {N = 1 : i64, T = i32, axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> + %0 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + %2 = "tf.AddV2"(%arg0, %0) {T = i32, device = ""} : (tensor, tensor) -> tensor + %end = "tf.Pack"(%2) {N = 1 : i64, T = i32, axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> + // CHECK: %[[A:.*]] = "xla_hlo.reshape"(%arg0) : (tensor) -> tensor<1xi32> + // CHECK-NEXT: %[[BEGIN:.*]] = "xla_hlo.concatenate"(%[[A]]) + // CHECK-DAG-SAME: {dimension = 0 : i64} : (tensor<1xi32>) -> tensor<1xi32> + // CHECK: %[[ZERO:.*]] = xla_hlo.constant dense<0> : tensor + // CHECK-NEXT: %[[INDEX:.*]] = "xla_hlo.slice"(%[[BEGIN]]) + // CHECK-DAG-SAME: {limit_indices = dense<1> : tensor<1xi64>, + // CHECK-DAG-SAME: start_indices = dense<0> : tensor<1xi64>, + // CHECK-DAG-SAME: strides = dense<1> : tensor<1xi64>} : (tensor<1xi32>) -> tensor<1xi32> + // CHECK-NEXT: %[[INDEX2:.*]] = "xla_hlo.reshape"(%[[INDEX]]) : (tensor<1xi32>) -> tensor + // CHECK-NEXT: %[[CMP:.*]] = xla_chlo.broadcast_compare %[[INDEX2]], %[[ZERO]] + // CHECK-DAG-SAME: {comparison_direction = "LT"} : (tensor, tensor) -> tensor + // CHECK-NEXT: %[[DIM:.*]] = xla_hlo.constant dense<32> : tensor + // CHECK-NEXT: %[[WRAP:.*]] = xla_chlo.broadcast_add %[[DIM]], %[[INDEX2]] : (tensor, tensor) -> tensor + // CHECK-NEXT: %[[INDEX3:.*]] = "xla_hlo.select"(%[[CMP]], %[[WRAP]], %[[INDEX2]]) : + // CHECK-DAG-SAME: (tensor, tensor, tensor) -> tensor + // CHECK-NEXT: %[[SLICED:.*]] = "xla_hlo.dynamic-slice" + // CHECK-DAG-SAME: (%arg1, %[[INDEX3]], %[[ZERO]], %[[ZERO]]) + // CHECK-DAG-SAME: {slice_sizes = dense<[1, 1, 97]> : tensor<3xi64>} : + // CHECK-DAG-SAME: (tensor<32x1x97xi32>, tensor, tensor, tensor) -> tensor<1x97xi32> + // CHECK-NEXT: %[[FINAL:.*]] = "xla_hlo.reshape"(%[[SLICED]]) : (tensor<1x97xi32>) -> tensor<1x97xi32> + %result = "tf.StridedSlice"(%arg1, %begin, %end, %1) {Index = i32, T = i32, begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<32x1x97xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x97xi32> + // CHECK-NEXT: return %[[FINAL]] : tensor<1x97xi32> + return %result : tensor<1x97xi32> +} + +// CHECK-LABEL: strided_slice_nonconstant_begin_end_stride_1 +func @strided_slice_nonconstant_begin_end_stride_1(%input: tensor<32x1x97xi32>, %begin: tensor<1xi32>, %end: tensor<1xi32>, %strides: tensor<1xi32>) -> (tensor<1x97xi32>) { + // Dynamic stride: when `begin` and `end` inputs are unknown at compile time, + // `strides` must be known. 
+ // CHECK: tf.StridedSlice + %result = "tf.StridedSlice"(%input, %begin, %end, %strides) {Index = i32, T = i32, begin_mask = 4 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<32x1x97xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x97xi32> + return %result : tensor<1x97xi32> +} + +// CHECK-LABEL: strided_slice_nonconstant_begin_end_stride_2 +func @strided_slice_nonconstant_begin_end_stride_2(%input: tensor<32x1x97xi32>, %begin: tensor<1xi32>, %end: tensor<1xi32>) -> (tensor<1x97xi32>) { + // Invalid stride (not equal to 1): when `begin` and `end` inputs are unknown + // at compile time, `strides` must be known to have all 1 values. + %strides = "tf.Const"() {value = dense<2> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK: tf.StridedSlice + %result = "tf.StridedSlice"(%input, %begin, %end, %strides) {Index = i32, T = i32, begin_mask = 4 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<32x1x97xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x97xi32> + return %result : tensor<1x97xi32> +} + +// CHECK-LABEL: strided_slice_nonconstant_begin_end_invalid_elem_count +func @strided_slice_nonconstant_begin_end_invalid_elem_count(%input: tensor<4x8xf32>, %begin: tensor<2xi64>, %end: tensor<2xi64>) -> tensor<6x10xf32> { + %strides = "tf.Const"() { value = dense<[1, 1]> : tensor<2xi64> } : () -> tensor<2xi64> + // When begin/end are dynamic, the number of output elements must be equal to + // the number of input elements sliced. + // CHECK: tf.StridedSlice + %0 = "tf.StridedSlice"(%input, %begin, %end, %strides) : (tensor<4x8xf32>, tensor<2xi64>, tensor<2xi64>, tensor<2xi64>) -> tensor<6x10xf32> + return %0 : tensor<6x10xf32> +} + +// CHECK-LABEL: strided_slice_nonconstant_begin_end_and_begin_mask +func @strided_slice_nonconstant_begin_end_and_begin_mask(%input: tensor<32x1x97xi32>, %begin: tensor<1xi32>, %end: tensor<1xi32>) -> (tensor<1x97xi32>) { + // Begin mask: When `begin` and `end` inputs are unknown at compile time, we + // can't support a begin mask. + %strides = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK: tf.StridedSlice + %result = "tf.StridedSlice"(%input, %begin, %end, %strides) {Index = i32, T = i32, begin_mask = 4 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<32x1x97xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x97xi32> + return %result : tensor<1x97xi32> +} + +// CHECK-LABEL: strided_slice_nonconstant_begin_end_and_end_mask +func @strided_slice_nonconstant_begin_end_and_end_mask(%input: tensor<32x1x97xi32>, %begin: tensor<1xi32>, %end: tensor<1xi32>) -> (tensor<1x97xi32>) { + // End mask: When `begin` and `end` inputs are unknown at compile time, we + // can't support an end mask. 
+ %strides = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK: tf.StridedSlice + %result = "tf.StridedSlice"(%input, %begin, %end, %strides) {Index = i32, T = i32, begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<32x1x97xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x97xi32> + return %result : tensor<1x97xi32> +} + +// CHECK-LABEL: strided_slice_nonconstant_begin_end_and_new_axis_mask +func @strided_slice_nonconstant_begin_end_and_new_axis_mask(%input: tensor<32x1x97xi32>, %begin: tensor<1xi32>, %end: tensor<1xi32>) -> (tensor<1x97xi32>) { + // New axis mask: When `begin` and `end` inputs are unknown at compile time, + // we can't support a new_axis mask. + %strides = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK: tf.StridedSlice + %result = "tf.StridedSlice"(%input, %begin, %end, %strides) {Index = i32, T = i32, begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 15 : i64, shrink_axis_mask = 1 : i64} : (tensor<32x1x97xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x97xi32> + return %result : tensor<1x97xi32> +} + +// CHECK-LABEL: strided_slice_nonconstant_begin_end_and_ellipsis_mask +func @strided_slice_nonconstant_begin_end_and_ellipsis_mask(%input: tensor<32x1x97xi32>, %begin: tensor<1xi32>, %end: tensor<1xi32>) -> (tensor<1x97xi32>) { + // This ellipsis mask is not supported because it does not refer to the last + // dimension. + // [0, 1, 0] = 2 + %strides = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK: tf.StridedSlice + %result = "tf.StridedSlice"(%input, %begin, %end, %strides) {Index = i32, T = i32, begin_mask = 0 : i64, device = "", ellipsis_mask = 2 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<32x1x97xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x97xi32> + return %result : tensor<1x97xi32> +} + +// CHECK-LABEL: strided_slice_nonconstant_begin_end_and_valid_ellipsis_mask +func @strided_slice_nonconstant_begin_end_and_valid_ellipsis_mask(%input: tensor<32x1x97xi32>, %begin: tensor<1xi32>, %end: tensor<1xi32>) -> (tensor<1x97xi32>) { + // This ellipsis mask is supported because it refers to the last dimension. + // [1, 0, 0] = 4 + %strides = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK: xla_hlo.dynamic-slice + %result = "tf.StridedSlice"(%input, %begin, %end, %strides) {Index = i32, T = i32, begin_mask = 0 : i64, device = "", ellipsis_mask = 4 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<32x1x97xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x97xi32> + return %result : tensor<1x97xi32> +} + +// CHECK-LABEL: strided_slice_nonconstant_begin_end_and_valid_shrink_axis_mask +func @strided_slice_nonconstant_begin_end_and_valid_shrink_axis_mask(%input: tensor<32x1x97xi32>, %begin: tensor<1xi32>, %end: tensor<1xi32>) -> (tensor<1x97xi32>) { + // This shrink_axis mask is supported because it refers to a major dimension. 
+ // [1, 1, 1] = 7 + %strides = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK: xla_hlo.dynamic-slice + %result = "tf.StridedSlice"(%input, %begin, %end, %strides) {Index = i32, T = i32, begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 7 : i64} : (tensor<32x1x97xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x97xi32> + return %result : tensor<1x97xi32> +} + +// CHECK-LABEL: strided_slice_nonconstant_begin_end_and_invalid_shrink_axis_mask +func @strided_slice_nonconstant_begin_end_and_invalid_shrink_axis_mask(%input: tensor<32x1x97xi32>, %begin: tensor<1xi32>, %end: tensor<1xi32>) -> (tensor<1x97xi32>) { + // This shrink_axis mask is unsupported because it does not refer to a major + // dimension. + // [0, 1, 0] = 2 + %strides = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK: tf.StridedSlice + %result = "tf.StridedSlice"(%input, %begin, %end, %strides) {Index = i32, T = i32, begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 2 : i64} : (tensor<32x1x97xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x97xi32> + return %result : tensor<1x97xi32> +} + //===----------------------------------------------------------------------===// // Reduction op legalizations. @@ -2543,7 +2466,7 @@ func @mean(%arg0: tensor<4x8xf16>) -> tensor<4x1xf16> { // CHECK: "xla_hlo.return"(%[[REDUCE_BODY_RESULT]]) : (tensor) -> () // CHECK: }) {dimensions = dense<1> : tensor<1xi64>} : (tensor<4x8xf32>, tensor) -> tensor<4xf32> // CHECK: %[[DIVISOR:.*]] = xla_hlo.constant dense<8.000000e+00> : tensor - // CHECK: %[[MEAN:.*]] = "xla_hlo.divide"(%[[REDUCED]], %[[DIVISOR]]) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor<4xf32>, tensor) -> tensor<4xf32> + // CHECK: %[[MEAN:.*]] = xla_chlo.broadcast_divide %[[REDUCED]], %[[DIVISOR]] {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor<4xf32>, tensor) -> tensor<4xf32> // CHECK: %[[CAST_BACK:.*]] = "xla_hlo.convert"(%[[MEAN]]) : (tensor<4xf32>) -> tensor<4xf16> // CHECK: %[[RESULT:.*]] = "xla_hlo.reshape"(%[[CAST_BACK]]) : (tensor<4xf16>) -> tensor<4x1xf16> // CHECK: return %[[RESULT]] : tensor<4x1xf16> @@ -2847,8 +2770,8 @@ func @rng_std_normal(%arg0: tensor<3xi32>) -> tensor<12x?x64xf32> { func @range(%arg0: tensor, %arg1: tensor) -> tensor<5xf32> { %1 = "tf.Const"() {device = "", dtype = "tfdtype$DT_FLOAT", name = "range/limit", value = dense<5.000000e+00> : tensor} : () -> tensor // CHECK-DAG: [[IOTA:%.*]] = "xla_hlo.iota" - // CHECK-DAG: [[MUL:%.*]] = "xla_hlo.multiply"([[IOTA]], [[DELTA]]) {broadcast_dimensions = dense<[]> : tensor<0xi64>} - // CHECK: "xla_hlo.add"([[MUL]], [[START]]) {broadcast_dimensions = dense<[]> : tensor<0xi64>} + // CHECK-DAG: [[MUL:%.*]] = xla_chlo.broadcast_multiply [[IOTA]], [[DELTA]] {broadcast_dimensions = dense<[]> : tensor<0xi64>} + // CHECK: xla_chlo.broadcast_add [[MUL]], [[START]] {broadcast_dimensions = dense<[]> : tensor<0xi64>} %3 = "tf.Range"(%arg0, %1, %arg1) {Tidx = "tfdtype$DT_FLOAT", device = "", name = "range"} : (tensor, tensor, tensor) -> tensor<5xf32> return %3 : tensor<5xf32> } @@ -2857,14 +2780,15 @@ func @range(%arg0: tensor, %arg1: tensor) -> tensor<5xf32> { // CHECK-SAME: [[START:%.*]]: tensor, [[STOP:%.*]]: tensor func @linspace_static(%arg0: tensor, %arg1: tensor) -> tensor<4xf32> { // CHECK-DAG: [[NUM:%.*]] = xla_hlo.constant dense<4> - // CHECK-DAG: 
[[NUM_F32:%.*]] = "xla_hlo.convert"([[NUM]]) + // CHECK-DAG: [[NUM_CAST:%.*]] = tensor_cast [[NUM]] + // CHECK-DAG: [[NUM_F32:%.*]] = "xla_hlo.convert"([[NUM_CAST]]) // CHECK-DAG: [[ONE:%.*]] = xla_hlo.constant dense<1.000000e+00> - // CHECK-DAG: [[STEP_DENOMINATOR:%.*]] = xla_hlo.subtract [[NUM_F32]], [[ONE]] - // CHECK-DAG: [[STEP_NUMERATOR:%.*]] = xla_hlo.subtract [[STOP]], [[START]] - // CHECK-DAG: [[STEP:%.*]] = xla_hlo.divide [[STEP_NUMERATOR]], [[STEP_DENOMINATOR]] + // CHECK-DAG: [[STEP_DENOMINATOR:%.*]] = xla_chlo.broadcast_subtract [[NUM_F32]], [[ONE]] + // CHECK-DAG: [[STEP_NUMERATOR:%.*]] = xla_chlo.broadcast_subtract [[STOP]], [[START]] + // CHECK-DAG: [[STEP:%.*]] = xla_chlo.broadcast_divide [[STEP_NUMERATOR]], [[STEP_DENOMINATOR]] // CHECK-DAG: [[IOTA:%.*]] = "xla_hlo.iota"() {iota_dimension = 0 : i64} - // CHECK-DAG: [[MUL:%.*]] = "xla_hlo.multiply"([[IOTA]], [[STEP]]) {broadcast_dimensions = dense<[]> : tensor<0xi64>} - // CHECK-DAG: [[LINSPACE:%.*]] = "xla_hlo.add"([[MUL]], [[START]]) {broadcast_dimensions = dense<[]> : tensor<0xi64>} + // CHECK-DAG: [[MUL:%.*]] = xla_chlo.broadcast_multiply [[IOTA]], [[STEP]] {broadcast_dimensions = dense<[]> : tensor<0xi64>} + // CHECK-DAG: [[LINSPACE:%.*]] = xla_chlo.broadcast_add [[MUL]], [[START]] {broadcast_dimensions = dense<[]> : tensor<0xi64>} // CHECK: return [[LINSPACE]] %0 = "tf.Const"() {_output_shapes = ["tfshape$"], device = "", dtype = i32, value = dense<4> : tensor} : () -> tensor %1 = "tf.LinSpace"(%arg0, %arg1, %0) : (tensor, tensor, tensor) -> tensor<4xf32> @@ -2880,10 +2804,10 @@ func @linspace_dynamic(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg1: tensor) -> tensor { - // CHECK: xla_hlo.constant {value = dense<[]> : tensor<0xi32>} : tensor + // CHECK: xla_hlo.constant dense<[]> : tensor<0xi32> // CHECK: "tf.LinSpace" - %0 = "tf.Const"() {_output_shapes = ["tfshape$"], device = "", dtype = i32, value = dense<[]> : tensor<0xi32>} : () -> tensor - %1 = "tf.LinSpace"(%arg0, %arg1, %0) : (tensor, tensor, tensor) -> tensor + %0 = "tf.Const"() {_output_shapes = ["tfshape$"], device = "", dtype = i32, value = dense<[]> : tensor<0xi32>} : () -> tensor<0xi32> + %1 = "tf.LinSpace"(%arg0, %arg1, %0) : (tensor, tensor, tensor<0xi32>) -> tensor return %1 : tensor } @@ -2922,6 +2846,37 @@ func @conv_simple(%arg0: tensor<256x32x32x6xf32>, %arg1: tensor<3x3x3x16xf32>) - return %0 : tensor<256x30x30x16xf32> } +// CHECK-LABEL: conv3d_simple +func @conv3d_simple(%arg0: tensor<256x32x32x32x6xf32>, %arg1: tensor<3x3x3x3x16xf32>) -> tensor<256x30x30x30x16xf32> { + + // CHECK: "xla_hlo.convolution"(%arg0, %arg1) + + // Default attributes + // CHECK-NOT: lhs_dilation + // CHECK-NOT: precision_config + + // CHECK-DAG-SAME: window_strides = dense<[5, 6, 7]> + // CHECK-DAG-SAME: padding = dense<[[1, 2], [2, 3], [2, 3]]> + // CHECK-DAG-SAME: rhs_dilation = dense<[2, 3, 4]> + + // CHECK-DAG-SAME: dimension_numbers + // CHECK-DAG-SAME: input_batch_dimension = 0 + // CHECK-DAG-SAME: input_feature_dimension = 4 + // CHECK-DAG-SAME: input_spatial_dimensions = dense<[1, 2, 3]> + // CHECK-DAG-SAME: kernel_input_feature_dimension = 3 + // CHECK-DAG-SAME: kernel_output_feature_dimension = 4 + // CHECK-DAG-SAME: kernel_spatial_dimensions = dense<[0, 1, 2]> + // CHECK-DAG-SAME: output_batch_dimension = 0 + // CHECK-DAG-SAME: output_feature_dimension = 4 + // CHECK-DAG-SAME: output_spatial_dimensions = dense<[1, 2, 3]> + + // CHECK-DAG-SAME: feature_group_count = 2 + // CHECK-DAG-SAME: batch_group_count = 1 + + %0 = "tf.Conv3D"(%arg0, %arg1) 
{data_format = "NDHWC", dilations = [1, 2, 3, 4, 1], padding = "SAME", strides = [1, 5, 6, 7, 1]} : (tensor<256x32x32x32x6xf32>, tensor<3x3x3x3x16xf32>) -> tensor<256x30x30x30x16xf32> + return %0 : tensor<256x30x30x30x16xf32> +} + // CHECK-LABEL: depthwiseconv_simple func @depthwiseconv_simple(%arg0: tensor<2x4x5x3xf32>, %arg1: tensor<2x2x3x3xf32>) -> tensor<2x3x4x9xf32> { // CHECK: %[[RESHAPED_FILTER:.*]] = "xla_hlo.reshape"(%arg1) : (tensor<2x2x3x3xf32>) -> tensor<2x2x1x9xf32> @@ -2993,6 +2948,36 @@ func @conv2d_backprop_input( return %result : tensor<100x28x28x1xf32> } +// CHECK-LABEL: @conv3d_backprop_input +func @conv3d_backprop_input(%filter: tensor<3x3x3x1x6xf32>, %out_backprop: tensor<2x8x8x8x6xf32>) -> tensor<2x8x8x8x1xf32> { + // CHECK: %[[REV_FILTER:.*]] = "xla_hlo.reverse"(%arg0) {dimensions = dense<[0, 1, 2]> : tensor<3xi64>} + // CHECK: %[[RESULT:.*]] = "xla_hlo.convolution"(%arg1, %[[REV_FILTER]]) + + // CHECK-DAG-SAME: batch_group_count = 1 : i64, + + // CHECK-DAG-SAME: dimension_numbers = + // CHECK-DAG-SAME: input_batch_dimension = 0 : i64 + // CHECK-DAG-SAME: input_feature_dimension = 4 : i64 + // CHECK-DAG-SAME: input_spatial_dimensions = dense<[1, 2, 3]> : tensor<3xi64> + // CHECK-DAG-SAME: kernel_input_feature_dimension = 4 : i64 + // CHECK-DAG-SAME: kernel_output_feature_dimension = 3 : i64 + // CHECK-DAG-SAME: kernel_spatial_dimensions = dense<[0, 1, 2]> : tensor<3xi64> + // CHECK-DAG-SAME: output_batch_dimension = 0 : i64 + // CHECK-DAG-SAME: output_feature_dimension = 4 : i64 + // CHECK-DAG-SAME: output_spatial_dimensions = dense<[1, 2, 3]> : tensor<3xi64> + + // CHECK-DAG-SAME: feature_group_count = 1 : i64 + // CHECK-DAG-SAME: lhs_dilation = dense<1> : tensor<3xi64> + // CHECK-DAG-SAME: padding = dense<1> : tensor<3x2xi64> + // CHECK-DAG-SAME: rhs_dilation = dense<1> : tensor<3xi64> + // CHECK-DAG-SAME: window_strides = dense<1> : tensor<3xi64> + + // CHECK: return %[[RESULT]] + %input_sizes = "tf.Const" () {value = dense<[2, 8, 8, 8, 1]> : tensor<5xi32>} : () -> tensor<5xi32> + %result = "tf.Conv3DBackpropInputV2"(%input_sizes, %filter, %out_backprop) {data_format = "NDHWC", dilations = [1, 1, 1, 1, 1], padding = "SAME", strides = [1, 1, 1, 1, 1]} : (tensor<5xi32>, tensor<3x3x3x1x6xf32>, tensor<2x8x8x8x6xf32>) -> tensor<2x8x8x8x1xf32> + return %result : tensor<2x8x8x8x1xf32> +} + // CHECK-LABEL: @conv2d_backprop_filter func @conv2d_backprop_filter( %input: tensor<100x28x28x1xf32>, @@ -3029,6 +3014,35 @@ func @conv2d_backprop_filter( return %result : tensor<100x28x28x1xf32> } +// CHECK-LABEL: @conv3d_backprop_filter +func @conv3d_backprop_filter(%input: tensor<2x8x8x8x1xf32>, %out_backprop: tensor<2x8x8x8x6xf32>) -> tensor<2x8x8x8x1xf32> { + // CHECK: %[[RESULT:.*]] = "xla_hlo.convolution"(%arg0, %arg1) + + // CHECK-DAG-SAME: batch_group_count = 1 : i64 + + // CHECK-DAG-SAME: dimension_numbers = + // CHECK-DAG-SAME: input_batch_dimension = 4 : i64 + // CHECK-DAG-SAME: input_feature_dimension = 0 : i64 + // CHECK-DAG-SAME: input_spatial_dimensions = dense<[1, 2, 3]> : tensor<3xi64> + // CHECK-DAG-SAME: kernel_input_feature_dimension = 0 : i64 + // CHECK-DAG-SAME: kernel_output_feature_dimension = 4 : i64 + // CHECK-DAG-SAME: kernel_spatial_dimensions = dense<[1, 2, 3]> : tensor<3xi64> + // CHECK-DAG-SAME: output_batch_dimension = 3 : i64 + // CHECK-DAG-SAME: output_feature_dimension = 4 : i64 + // CHECK-DAG-SAME: output_spatial_dimensions = dense<[0, 1, 2]> : tensor<3xi64> + + // CHECK-DAG-SAME: feature_group_count = 1 : i64 + // CHECK-DAG-SAME: lhs_dilation = 
dense<1> : tensor<3xi64> + // CHECK-DAG-SAME: padding = dense<1> : tensor<3x2xi64> + // CHECK-DAG-SAME: rhs_dilation = dense<1> : tensor<3xi64> + // CHECK-DAG-SAME: window_strides = dense<1> : tensor<3xi64> + + // CHECK: return %[[RESULT]] + %filter_sizes = "tf.Const"() {value = dense<[3, 3, 3, 1, 6]> : tensor<5xi32>} : () -> tensor<5xi32> + %result = "tf.Conv3DBackpropFilterV2"(%input, %filter_sizes, %out_backprop) {data_format = "NDHWC", dilations = [1, 1, 1, 1, 1], padding = "SAME", strides = [1, 1, 1, 1, 1]} : (tensor<2x8x8x8x1xf32>, tensor<5xi32>, tensor<2x8x8x8x6xf32>) -> tensor<2x8x8x8x1xf32> + return %result : tensor<2x8x8x8x1xf32> +} + // CHECK-LABEL: @cross_replica_sum func @cross_replica_sum(%input: tensor<10xf32>) -> tensor<10xf32> { %replica_groups = "tf.Const" () { @@ -3069,13 +3083,13 @@ func @size_ranked(%input: tensor<2x?x8xf32>) -> (tensor) { // CHECK: %[[CONST:.*]] = xla_hlo.constant dense<1> // CHECK: %[[DIM_0:.*]] = "xla_hlo.get_dimension_size"(%[[INPUT]]) // CHECK-SAME: dimension = 0 - // CHECK: %[[MUL_0:.*]] = xla_hlo.multiply %[[CONST]], %[[DIM_0]] + // CHECK: %[[MUL_0:.*]] = xla_chlo.broadcast_multiply %[[CONST]], %[[DIM_0]] // CHECK: %[[DIM_1:.*]] = "xla_hlo.get_dimension_size"(%[[INPUT]]) // CHECK-SAME: dimension = 1 - // CHECK: %[[MUL_1:.*]] = xla_hlo.multiply %[[MUL_0]], %[[DIM_1]] + // CHECK: %[[MUL_1:.*]] = xla_chlo.broadcast_multiply %[[MUL_0]], %[[DIM_1]] // CHECK: %[[DIM_2:.*]] = "xla_hlo.get_dimension_size"(%[[INPUT]]) // CHECK-SAME: dimension = 2 - // CHECK: %[[MUL_2:.*]] = xla_hlo.multiply %[[MUL_1]], %[[DIM_2]] + // CHECK: %[[MUL_2:.*]] = xla_chlo.broadcast_multiply %[[MUL_1]], %[[DIM_2]] %size = "tf.Size"(%input) {T = "tfdtype$DT_FLOAT", out_type = "tfdtype$DT_INT32"} : (tensor<2x?x8xf32>) -> tensor // CHECK: return %[[MUL_2]] return %size : tensor @@ -3232,30 +3246,31 @@ func @assert(%arg0: tensor, %arg1: tensor<*xf32>) { // tf.Unpack legalization //===----------------------------------------------------------------------===// -// CHECK-LABEL: @unpack -func @unpack(%input: tensor<4x3x6xf32>) -> (tensor<4x?xf32>, tensor<4x6xf32>, tensor<4x6xf32>) { - // CHECK: %[[SLICE1:.*]] = "xla_hlo.slice"(%{{.*}}) {limit_indices = dense<[4, 1, 6]> : tensor<3xi64>, start_indices = dense<0> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} : (tensor<4x3x6xf32>) -> tensor<4x1x6xf32> - // CHECK: %[[RES1:.*]] = "xla_hlo.reshape"(%[[SLICE1]]) : (tensor<4x1x6xf32>) -> tensor<4x?xf32> - // CHECK: %[[SLICE2:.*]] = "xla_hlo.slice"(%{{.*}}) {limit_indices = dense<[4, 2, 6]> : tensor<3xi64>, start_indices = dense<[0, 1, 0]> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} : (tensor<4x3x6xf32>) -> tensor<4x1x6xf32> - // CHECK: %[[RES2:.*]] = "xla_hlo.reshape"(%[[SLICE2]]) : (tensor<4x1x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[SLICE3:.*]] = "xla_hlo.slice"(%{{.*}}) {limit_indices = dense<[4, 3, 6]> : tensor<3xi64>, start_indices = dense<[0, 2, 0]> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} : (tensor<4x3x6xf32>) -> tensor<4x1x6xf32> - // CHECK: %[[RES3:.*]] = "xla_hlo.reshape"(%[[SLICE3]]) : (tensor<4x1x6xf32>) -> tensor<4x6xf32> +// TODO(b/156340000): Re-enable when fixed. 
+// // C-HECK-LABEL: @unpack +// func @unpack(%input: tensor<4x3x6xf32>) -> (tensor<4x?xf32>, tensor<4x6xf32>, tensor<4x6xf32>) { +// // C-HECK: %[[SLICE1:.*]] = "xla_hlo.slice"(%{{.*}}) {limit_indices = dense<[4, 1, 6]> : tensor<3xi64>, start_indices = dense<0> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} : (tensor<4x3x6xf32>) -> tensor<4x1x6xf32> +// // C-HECK: %[[RES1:.*]] = "xla_hlo.reshape"(%[[SLICE1]]) : (tensor<4x1x6xf32>) -> tensor<4x?xf32> +// // C-HECK: %[[SLICE2:.*]] = "xla_hlo.slice"(%{{.*}}) {limit_indices = dense<[4, 2, 6]> : tensor<3xi64>, start_indices = dense<[0, 1, 0]> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} : (tensor<4x3x6xf32>) -> tensor<4x1x6xf32> +// // C-HECK: %[[RES2:.*]] = "xla_hlo.reshape"(%[[SLICE2]]) : (tensor<4x1x6xf32>) -> tensor<4x6xf32> +// // C-HECK: %[[SLICE3:.*]] = "xla_hlo.slice"(%{{.*}}) {limit_indices = dense<[4, 3, 6]> : tensor<3xi64>, start_indices = dense<[0, 2, 0]> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} : (tensor<4x3x6xf32>) -> tensor<4x1x6xf32> +// // C-HECK: %[[RES3:.*]] = "xla_hlo.reshape"(%[[SLICE3]]) : (tensor<4x1x6xf32>) -> tensor<4x6xf32> - %0:3 = "tf.Unpack"(%input) {axis = 1} : (tensor<4x3x6xf32>) -> (tensor<4x?xf32>, tensor<4x6xf32>, tensor<4x6xf32>) - // return %[[RES1]], %[[RES2]], %[[RES3]] - return %0#0, %0#1, %0#2 : tensor<4x?xf32>, tensor<4x6xf32>, tensor<4x6xf32> -} +// %0:3 = "tf.Unpack"(%input) {axis = 1} : (tensor<4x3x6xf32>) -> (tensor<4x?xf32>, tensor<4x6xf32>, tensor<4x6xf32>) +// // return %[[RES1]], %[[RES2]], %[[RES3]] +// return %0#0, %0#1, %0#2 : tensor<4x?xf32>, tensor<4x6xf32>, tensor<4x6xf32> +// } -// CHECK-LABEL: @unpack_dynamic -func @unpack_dynamic(%input: tensor) -> (tensor, tensor) { - // CHECK: %[[SLICE1:.*]] = "xla_hlo.slice"(%{{.*}}) {limit_indices = dense<[-1, -1, 1]> : tensor<3xi64>, start_indices = dense<0> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} : (tensor) -> tensor - // CHECK: "xla_hlo.reshape"(%[[SLICE1]]) : (tensor) -> tensor - // CHECK: %[[SLICE2:.*]] = "xla_hlo.slice"(%{{.*}}) {limit_indices = dense<[-1, -1, 2]> : tensor<3xi64>, start_indices = dense<[0, 0, 1]> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} : (tensor) -> tensor - // CHECK: "xla_hlo.reshape"(%[[SLICE2]]) : (tensor) -> tensor +// // C-HECK-LABEL: @unpack_dynamic +// func @unpack_dynamic(%input: tensor) -> (tensor, tensor) { +// // C-HECK: %[[SLICE1:.*]] = "xla_hlo.slice"(%{{.*}}) {limit_indices = dense<[-1, -1, 1]> : tensor<3xi64>, start_indices = dense<0> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} : (tensor) -> tensor +// // C-HECK: "xla_hlo.reshape"(%[[SLICE1]]) : (tensor) -> tensor +// // C-HECK: %[[SLICE2:.*]] = "xla_hlo.slice"(%{{.*}}) {limit_indices = dense<[-1, -1, 2]> : tensor<3xi64>, start_indices = dense<[0, 0, 1]> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} : (tensor) -> tensor +// // C-HECK: "xla_hlo.reshape"(%[[SLICE2]]) : (tensor) -> tensor - %0:2 = "tf.Unpack"(%input) {axis = -1} : (tensor) -> (tensor, tensor) - return %0#0, %0#1 : tensor, tensor -} +// %0:2 = "tf.Unpack"(%input) {axis = -1} : (tensor) -> (tensor, tensor) +// return %0#0, %0#1 : tensor, tensor +// } //===----------------------------------------------------------------------===// // tf.UnsortedSegment{Max|Min|Prod|Sum} legalization @@ -3320,11 +3335,11 @@ func @unsorted_segment_max(%data: tensor<8x?x64xf32>, %segment_ids : tensor, %arg1: tensor<16x5xi32>) -> tensor<16x2x5x3xf32> { - // CHECK: "xla_hlo.torch_index_select"(%arg0, %arg1) {batch_dims = 1 : i64, dim = 2 : i64} : 
(tensor<16x2x3xf32>, tensor<16x5xi32>) -> tensor<16x2x5x3xf32> +func @gather_v2(%arg0: tensor<16x2x3xf32>, %arg1: tensor<16x5xi32>) -> tensor<16x2x5xf32> { + // CHECK: "xla_hlo.torch_index_select"(%arg0, %arg1) {batch_dims = 1 : i64, dim = 2 : i64} : (tensor<16x2x3xf32>, tensor<16x5xi32>) -> tensor<16x2x5xf32> %0 = "tf.Const"() { value = dense<[-1]> : tensor<1xi32> } : () -> tensor<1xi32> - %1 = "tf.GatherV2"(%arg0, %arg1, %0) {batch_dims = -1 : i64} : (tensor<16x2x3xf32>, tensor<16x5xi32>, tensor<1xi32>) -> tensor<16x2x5x3xf32> - return %1 : tensor<16x2x5x3xf32> + %1 = "tf.GatherV2"(%arg0, %arg1, %0) {batch_dims = -1 : i64} : (tensor<16x2x3xf32>, tensor<16x5xi32>, tensor<1xi32>) -> tensor<16x2x5xf32> + return %1 : tensor<16x2x5xf32> } // CHECK-LABEL: @gather_v2_dynamic @@ -3591,7 +3606,7 @@ func @random_shuffle_3D(%input: tensor<4x?x16xf32>) -> tensor<4x?x16xf32> { // CHECK: [[INDICES1:%.*]] = "xla_hlo.dynamic-update-slice"([[INDICES]], [[TGT_IDX]], [[IV]]) : (tensor<4xi32>, tensor<1xi32>, tensor) -> tensor<4xi32> // CHECK: [[INDICES2:%.*]] = "xla_hlo.dynamic-update-slice"([[INDICES1]], [[SRC_IDX]], [[SWP]]) : (tensor<4xi32>, tensor<1xi32>, tensor) -> tensor<4xi32> // CHECK: [[ONE:%.*]] = xla_hlo.constant dense<1> : tensor - // CHECK: [[NEW_IV:%.*]] = xla_hlo.add [[IV]], [[ONE]] + // CHECK: [[NEW_IV:%.*]] = xla_chlo.broadcast_add [[IV]], [[ONE]] // CHECK: [[NEW_TUPLE:%.*]] = "xla_hlo.tuple"([[NEW_IV]], [[SWAPS]], [[INDICES2]]) // CHECK: "xla_hlo.return"([[NEW_TUPLE]]) // CHECK: }) : (tuple, tensor<4xi32>, tensor<4xi32>>) -> tuple, tensor<4xi32>, tensor<4xi32>> @@ -3616,16 +3631,18 @@ func @random_shuffle_3D(%input: tensor<4x?x16xf32>) -> tensor<4x?x16xf32> { // CHECK-LABLE: @variable_shape32 func @variable_shape32(%input: tensor>>) -> tensor<3xi32> { // CHECK: [[CST:%.*]] = xla_hlo.constant dense<[2, 4, 8]> : tensor<3xi32> + // CHECK: [[CST_CAST:%.*]] = tensor_cast [[CST]] %0 = "tf.VariableShape"(%input) : (tensor>>) -> (tensor<3xi32>) - // CHECK: return [[CST]] + // CHECK: return [[CST_CAST]] return %0: tensor<3xi32> } // CHECK-LABLE: @variable_shape64 func @variable_shape64(%input: tensor>>) -> tensor<3xi64> { // CHECK: [[CST:%.*]] = xla_hlo.constant dense<[2, 4, 8]> : tensor<3xi64> + // CHECK: [[CST_CAST:%.*]] = tensor_cast [[CST]] %0 = "tf.VariableShape"(%input) : (tensor>>) -> (tensor<3xi64>) - // CHECK: return [[CST]] + // CHECK: return [[CST_CAST]] return %0: tensor<3xi64> } @@ -3658,7 +3675,7 @@ func @avgpool_valid_padding(%arg0: tensor<2x12x20x7xf16>) -> tensor<2x3x5x7xf16> // CHECK: "xla_hlo.return"([[ADD]]) // CHECK: }) {window_dimensions = dense<[1, 2, 2, 1]> : tensor<4xi64>, window_strides = dense<[1, 4, 4, 1]> : tensor<4xi64>} : (tensor<2x12x20x7xf32>, tensor) -> tensor<2x3x5x7xf32> // CHECK: [[COUNT:%.+]] = xla_hlo.constant dense<4.000000e+00> : tensor - // CHECK: [[DIV:%.+]] = "xla_hlo.divide"([[REDUCE]], [[COUNT]]) {broadcast_dimensions = dense<[0, 1, 2, 3]> : tensor<4xi64>} : (tensor<2x3x5x7xf32>, tensor) -> tensor<2x3x5x7xf32> + // CHECK: [[DIV:%.+]] = xla_chlo.broadcast_divide [[REDUCE]], [[COUNT]] {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor<2x3x5x7xf32>, tensor) -> tensor<2x3x5x7xf32> // CHECK: [[CONV16:%.+]] = "xla_hlo.convert"([[DIV]]) : (tensor<2x3x5x7xf32>) -> tensor<2x3x5x7xf16> // CHECK: return [[CONV16]] %0 = "tf.AvgPool"(%arg0) {data_format = "NHWC", ksize = [1, 2, 2, 1], padding = "VALID", strides = [1, 4, 4, 1]} : (tensor<2x12x20x7xf16>) -> tensor<2x3x5x7xf16> @@ -3679,6 +3696,41 @@ func @xla_sharding(%arg0: tensor<4x16xf32>) -> 
tensor<4x16xf32> { return %0 : tensor<4x16xf32> } +// CHECK-LABEL: inplace_update_one +func @inplace_update_one(%arg0: tensor<8x4xf32>, %arg1: tensor<1x4xf32>, %arg2: tensor<1xi32>) -> tensor<8x4xf32> { + // CHECK-DAG: [[CST:%.+]] = xla_hlo.constant dense<0> + // CHECK-DAG: [[SLICE1:%.+]] = "xla_hlo.slice"(%arg2) {limit_indices = dense<1> : tensor<1xi64>, start_indices = dense<0> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>} + // CHECK-DAG: [[SLICE2:%.+]] = "xla_hlo.slice"(%arg1) {limit_indices = dense<[1, 4]> : tensor<2xi64>, start_indices = dense<0> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} + // CHECK-DAG: [[RESHAPE1:%.+]] = "xla_hlo.reshape"([[SLICE1]]) + // CHECK-DAG: [[UPDATE:%.+]] = "xla_hlo.dynamic-update-slice"(%arg0, [[SLICE2]], [[RESHAPE1]], [[CST]]) + %0 = "tf.InplaceUpdate"(%arg0, %arg2, %arg1) : (tensor<8x4xf32>, tensor<1xi32>, tensor<1x4xf32>) -> tensor<8x4xf32> + + // CHECK: return [[UPDATE]] + return %0 : tensor<8x4xf32> +} + +// CHECK-LABEL: inplace_update_three +func @inplace_update_three(%arg0: tensor<8x8x4xf32>, %arg1: tensor<3x8x4xf32>, %arg2: tensor<3xi32>) -> tensor<8x8x4xf32> { + // CHECK-DAG: [[CST:%.+]] = xla_hlo.constant dense<0> + // CHECK-DAG: [[SLICE1:%.+]] = "xla_hlo.slice"(%arg2) {limit_indices = dense<1> : tensor<1xi64>, start_indices = dense<0> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>} + // CHECK-DAG: [[SLICE2:%.+]] = "xla_hlo.slice"(%arg2) {limit_indices = dense<2> : tensor<1xi64>, start_indices = dense<1> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>} + // CHECK-DAG: [[SLICE3:%.+]] = "xla_hlo.slice"(%arg2) {limit_indices = dense<3> : tensor<1xi64>, start_indices = dense<2> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>} + // CHECK-DAG: [[SLICE4:%.+]] = "xla_hlo.slice"(%arg1) {limit_indices = dense<[1, 8, 4]> : tensor<3xi64>, start_indices = dense<0> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} + // CHECK-DAG: [[SLICE5:%.+]] = "xla_hlo.slice"(%arg1) {limit_indices = dense<[2, 8, 4]> : tensor<3xi64>, start_indices = dense<[1, 0, 0]> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} + // CHECK-DAG: [[SLICE6:%.+]] = "xla_hlo.slice"(%arg1) {limit_indices = dense<[3, 8, 4]> : tensor<3xi64>, start_indices = dense<[2, 0, 0]> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} + // CHECK-DAG: [[RESHAPE1:%.+]] = "xla_hlo.reshape"([[SLICE1]]) + // CHECK-DAG: [[RESHAPE2:%.+]] = "xla_hlo.reshape"([[SLICE2]]) + // CHECK-DAG: [[RESHAPE3:%.+]] = "xla_hlo.reshape"([[SLICE3]]) + // CHECK-DAG: [[UPDATE1:%.+]] = "xla_hlo.dynamic-update-slice"(%arg0, [[SLICE4]], [[RESHAPE1]], [[CST]], [[CST]]) + // CHECK-DAG: [[UPDATE2:%.+]] = "xla_hlo.dynamic-update-slice"([[UPDATE1]], [[SLICE5]], [[RESHAPE2]], [[CST]], [[CST]]) + // CHECK-DAG: [[UPDATE3:%.+]] = "xla_hlo.dynamic-update-slice"([[UPDATE2]], [[SLICE6]], [[RESHAPE3]], [[CST]], [[CST]]) + %0 = "tf.InplaceUpdate"(%arg0, %arg2, %arg1) : (tensor<8x8x4xf32>, tensor<3xi32>, tensor<3x8x4xf32>) -> tensor<8x8x4xf32> + + // CHECK: return [[UPDATE3]] : tensor<8x8x4xf32> + return %0 : tensor<8x8x4xf32> +} + + // CHECK-LABEL: xla_dynamic_update_slice func @xla_dynamic_update_slice(%arg0: tensor<4x16xf32>, %arg1: tensor<2x4xf32>, %arg2: tensor<2xi32>) -> tensor<4x16xf32> { // CHECK: [[SLICE0:%.+]] = "xla_hlo.slice"(%arg2) {limit_indices = dense<1> : tensor<1xi64>, start_indices = dense<0> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>} : (tensor<2xi32>) -> tensor<1xi32> @@ -3701,6 +3753,21 @@ func @xla_dynamic_update_slice2(%arg0: tensor<4xf32>, %arg1: tensor<2xf32>, %arg return %0 : 
tensor<4xf32> } +//===----------------------------------------------------------------------===// +// AllToAll op legalizations. +//===----------------------------------------------------------------------===// + +// CHECK-LABEL: func @alltoall_basic +func @alltoall_basic(%input: tensor<10xf32>) -> tensor<10xf32> { + %group_assignment = "tf.Const" () { + value = dense<[[0, 2, 4, 6], [1, 3, 5, 7], [3, 5, 6, 8]]> : tensor<3x4xi32> + } : () -> tensor<3x4xi32> + %result = "tf.AllToAll"(%input, %group_assignment) {T = f32, concat_dimension = 1 : i64, split_count = 2 : i64, split_dimension = 0 : i64} : (tensor<10xf32>, tensor<3x4xi32>) -> tensor<10xf32> + // CHECK: xla_hlo.all_to_all + // CHECK-SAME: replica_groups = dense<{{\[}}[0, 2, 4, 6], [1, 3, 5, 7], [3, 5, 6, 8]]> : tensor<3x4xi64> + return %result : tensor<10xf32> +} + //===----------------------------------------------------------------------===// // Cumsum op legalizations. //===----------------------------------------------------------------------===// @@ -3746,96 +3813,13 @@ func @cumsum_dynamic(%arg0: tensor, %arg1: tensor) -> tensor return %0 : tensor } -//===----------------------------------------------------------------------===// -// tf.BatchMatMulV2 op legalizations. -//===----------------------------------------------------------------------===// - -// CHECK-LABEL: func @batchmatmulv2_broadcast_singleton_dimension -func @batchmatmulv2_broadcast_singleton_dimension(%arg0: tensor<1x4x2xf32>, %arg1: tensor<3x2x4xf32>) -> tensor<3x4x4xf32> { - // CHECK: [[BLHS:%.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, {{.*}}) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<1x4x2xf32>, {{.*}}) -> tensor<3x4x2xf32> - // CHECK: [[BRHS:%.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, {{.*}}) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<3x2x4xf32>, {{.*}}) -> tensor<3x2x4xf32> - // CHECK: [[BDST:%.+]] = "xla_hlo.dot_general"([[BLHS]], [[BRHS]]) {dot_dimension_numbers = { - // CHECK-SAME: lhs_batching_dimensions = dense<0> : tensor<1xi64>, - // CHECK-SAME: lhs_contracting_dimensions = dense<2> : tensor<1xi64>, - // CHECK-SAME: rhs_batching_dimensions = dense<0> : tensor<1xi64>, - // CHECK-SAME: rhs_contracting_dimensions = dense<1> : tensor<1xi64> - // CHECK-SAME: }} : (tensor<3x4x2xf32>, tensor<3x2x4xf32>) -> tensor<3x4x4xf32> - // CHECK: return [[BDST]] : tensor<3x4x4xf32> - %0 = "tf.BatchMatMulV2"(%arg0, %arg1) {T = f32, adj_x = false, adj_y = false, device = ""} : (tensor<1x4x2xf32>, tensor<3x2x4xf32>) -> tensor<3x4x4xf32> - return %0 : tensor<3x4x4xf32> -} - -// CHECK-LABEL: func @batchmatmulv2_lhs_batch -func @batchmatmulv2_lhs_batch(%arg0: tensor<3x4x2xf32>, %arg1: tensor<2x4xf32>) -> tensor<3x4x4xf32> { - // CHECK: [[BLHS:%.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, {{.*}}) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<3x4x2xf32>, {{.*}}) -> tensor<3x4x2xf32> - // CHECK: [[BRHS:%.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, {{.*}}) {broadcast_dimensions = dense<[1, 2]> : tensor<2xi64>} : (tensor<2x4xf32>, {{.*}}) -> tensor<3x2x4xf32> - // CHECK: [[BDST:%.+]] = "xla_hlo.dot_general"([[BLHS]], [[BRHS]]) {dot_dimension_numbers = { - // CHECK-SAME: lhs_batching_dimensions = dense<0> : tensor<1xi64>, - // CHECK-SAME: lhs_contracting_dimensions = dense<2> : tensor<1xi64>, - // CHECK-SAME: rhs_batching_dimensions = dense<0> : tensor<1xi64>, - // CHECK-SAME: rhs_contracting_dimensions = dense<1> : tensor<1xi64> - // CHECK-SAME: }} : (tensor<3x4x2xf32>, tensor<3x2x4xf32>) 
-> tensor<3x4x4xf32> - // CHECK: return [[BDST]] : tensor<3x4x4xf32> - %0 = "tf.BatchMatMulV2"(%arg0, %arg1) {T = f32, adj_x = false, adj_y = false, device = ""} : (tensor<3x4x2xf32>, tensor<2x4xf32>) -> tensor<3x4x4xf32> - return %0 : tensor<3x4x4xf32> -} - -// CHECK-LABEL: func @batchmatmulv2_rhs_batch -func @batchmatmulv2_rhs_batch(%arg0: tensor<4x2xf32>, %arg1: tensor<3x2x4xf32>) -> tensor<3x4x4xf32> { - // CHECK: [[BLHS:%.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, {{.*}}) {broadcast_dimensions = dense<[1, 2]> : tensor<2xi64>} : (tensor<4x2xf32>, {{.*}}) -> tensor<3x4x2xf32> - // CHECK: [[BRHS:%.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, {{.*}}) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<3x2x4xf32>, {{.*}}) -> tensor<3x2x4xf32> - // CHECK: [[BDST:%.+]] = "xla_hlo.dot_general"([[BLHS]], [[BRHS]]) {dot_dimension_numbers = { - // CHECK-SAME: lhs_batching_dimensions = dense<0> : tensor<1xi64>, - // CHECK-SAME: lhs_contracting_dimensions = dense<2> : tensor<1xi64>, - // CHECK-SAME: rhs_batching_dimensions = dense<0> : tensor<1xi64>, - // CHECK-SAME: rhs_contracting_dimensions = dense<1> : tensor<1xi64> - // CHECK-SAME: }} : (tensor<3x4x2xf32>, tensor<3x2x4xf32>) -> tensor<3x4x4xf32> - // CHECK: return [[BDST]] : tensor<3x4x4xf32> - %0 = "tf.BatchMatMulV2"(%arg0, %arg1) {T = f32, adj_x = false, adj_y = false, device = ""} : (tensor<4x2xf32>, tensor<3x2x4xf32>) -> tensor<3x4x4xf32> - return %0 : tensor<3x4x4xf32> -} - -// CHECK-LABEL: func @batchmatmulv2_dynamic -func @batchmatmulv2_dynamic(%arg0: tensor, %arg1: tensor) -> tensor { - // CHECK: "tf.BatchMatMulV2" - %0 = "tf.BatchMatMulV2"(%arg0, %arg1) {T = f32, adj_x = false, adj_y = false, device = ""} : (tensor, tensor) -> tensor - return %0 : tensor -} - -// CHECK-LABEL: func @batchmatmulv2_adj_real -func @batchmatmulv2_adj_real(%arg0: tensor<5x2xf32>, %arg1: tensor<2x4xf32>) -> tensor<5x4xf32> { - // CHECK: [[BLHS:%.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, {{.*}}) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<5x2xf32>, {{.*}}) -> tensor<5x2xf32> - // CHECK: [[BRHS:%.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, {{.*}}) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<2x4xf32>, {{.*}}) -> tensor<2x4xf32> - // CHECK: [[BDST:%.+]] = "xla_hlo.dot_general"([[BLHS]], [[BRHS]]) {dot_dimension_numbers = { - // CHECK-SAME: lhs_batching_dimensions = dense<[]> : tensor<0xi64>, - // CHECK-SAME: lhs_contracting_dimensions = dense<0> : tensor<1xi64>, - // CHECK-SAME: rhs_batching_dimensions = dense<[]> : tensor<0xi64>, - // CHECK-SAME: rhs_contracting_dimensions = dense<1> : tensor<1xi64> - // CHECK-SAME: }} : (tensor<5x2xf32>, tensor<2x4xf32>) -> tensor<5x4xf32> - // CHECK: return [[BDST]] : tensor<5x4xf32> - %0 = "tf.BatchMatMulV2"(%arg0, %arg1) {adj_x = true, adj_y = true, device = ""} : (tensor<5x2xf32>, tensor<2x4xf32>) -> tensor<5x4xf32> - return %0 : tensor<5x4xf32> -} - -// CHECK-LABEL: func @batchmatmulv2_adj_complex -func @batchmatmulv2_adj_complex(%arg0: tensor<5x2xcomplex>, %arg1: tensor<2x4xcomplex>) -> tensor<5x4xcomplex> { - // CHECK: [[LHSRE:%.+]] = "xla_hlo.real"(%arg0) : (tensor<5x2xcomplex>) -> tensor<5x2xf32> - // CHECK: [[LHSIM:%.+]] = "xla_hlo.imag"(%arg0) : (tensor<5x2xcomplex>) -> tensor<5x2xf32> - // CHECK: [[LHSIMNEG:%.+]] = "xla_hlo.negate"([[LHSIM]]) : (tensor<5x2xf32>) -> tensor<5x2xf32> - // CHECK: [[LHSCONJ:%.+]] = "xla_hlo.complex"([[LHSRE]], [[LHSIMNEG]]) : (tensor<5x2xf32>, tensor<5x2xf32>) -> tensor<5x2xcomplex> - // CHECK: [[RHSRE:%.+]] = 
"xla_hlo.real"(%arg1) : (tensor<2x4xcomplex>) -> tensor<2x4xf32> - // CHECK: [[RHSIM:%.+]] = "xla_hlo.imag"(%arg1) : (tensor<2x4xcomplex>) -> tensor<2x4xf32> - // CHECK: [[RHSIMNEG:%.+]] = "xla_hlo.negate"([[RHSIM]]) : (tensor<2x4xf32>) -> tensor<2x4xf32> - // CHECK: [[RHSCONJ:%.+]] = "xla_hlo.complex"([[RHSRE]], [[RHSIMNEG]]) : (tensor<2x4xf32>, tensor<2x4xf32>) -> tensor<2x4xcomplex> - // CHECK: [[BLHS:%.+]] = "xla_hlo.dynamic_broadcast_in_dim"([[LHSCONJ]], {{.*}}) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<5x2xcomplex>, {{.*}}) -> tensor<5x2xcomplex> - // CHECK: [[BRHS:%.+]] = "xla_hlo.dynamic_broadcast_in_dim"([[RHSCONJ]], {{.*}}) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<2x4xcomplex>, {{.*}}) -> tensor<2x4xcomplex> - // CHECK: [[BDST:%.+]] = "xla_hlo.dot_general"([[BLHS]], [[BRHS]]) {dot_dimension_numbers = { - // CHECK-SAME: lhs_batching_dimensions = dense<[]> : tensor<0xi64>, - // CHECK-SAME: lhs_contracting_dimensions = dense<0> : tensor<1xi64>, - // CHECK-SAME: rhs_batching_dimensions = dense<[]> : tensor<0xi64>, - // CHECK-SAME: rhs_contracting_dimensions = dense<1> : tensor<1xi64> - // CHECK-SAME: }} : (tensor<5x2xcomplex>, tensor<2x4xcomplex>) -> tensor<5x4xcomplex> - // CHECK: return [[BDST]] : tensor<5x4xcomplex> - %0 = "tf.BatchMatMulV2"(%arg0, %arg1) {adj_x = true, adj_y = true, device = ""} : (tensor<5x2xcomplex>, tensor<2x4xcomplex>) -> tensor<5x4xcomplex> - return %0 : tensor<5x4xcomplex> +// CHECK: func @qr([[VAL_0:%.*]]: tensor<500x100x75xf32>) -> (tensor<500x100x75xf32>, tensor<500x75x75xf32>) +func @qr(%arg0: tensor<500x100x75xf32>) -> (tensor<500x100x75xf32>, tensor<500x75x75xf32>) { + // The tf.Qr lowering is a full algorithm that is not effective to verify with + // FileCheck. Just verify that it converted. + // TODO(laurenzo): Move this out of the mainline tf2xla conversion as it is + // really only applicable to certain legacy uses. + // CHECK-NOT: "tf.Qr" + %0:2 = "tf.Qr"(%arg0) {full_matrices = false} : (tensor<500x100x75xf32>) -> (tensor<500x100x75xf32>, tensor<500x75x75xf32>) + return %0#0, %0#1 : tensor<500x100x75xf32>, tensor<500x75x75xf32> } diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-to-std.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-to-std.mlir index d25a84d0e25..9f27a204baf 100644 --- a/tensorflow/compiler/mlir/xla/tests/legalize-to-std.mlir +++ b/tensorflow/compiler/mlir/xla/tests/legalize-to-std.mlir @@ -1,4 +1,4 @@ -// RUN: xla-opt -xla-legalize-to-std %s -o - | FileCheck %s +// RUN: xla-opt -xla-legalize-to-std %s -o - | FileCheck %s --dump-input-on-failure // CHECK-LABEL: func @binary_ops_float(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { func @binary_ops_float(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { @@ -42,40 +42,6 @@ func @binary_ops_int(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi32 return %4 : tensor<4xi32> } -// Broadcasting is not currently supported. -// TODO(suderman):Future pass should take all broadcasted binary ops and convert -// them to separate broadcast and binary op. 
-// CHECK-LABEL: func @binary_ops_broadcast(%arg0: tensor<4x4xf32>, %arg1: tensor<4xf32>) -> tensor<4x4xf32> { -func @binary_ops_broadcast(%arg0: tensor<4x4xf32>, %arg1: tensor<4xf32>) -> tensor<4x4xf32> { - // CHECK-NEXT: %0 = "xla_hlo.add"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, name = "add.3"} : (tensor<4x4xf32>, tensor<4xf32>) -> tensor<4x4xf32> - %0 = "xla_hlo.add"(%arg0, %arg1) { - name = "add.3", broadcast_dimensions = dense<1> : tensor<1xi64>} : - (tensor<4x4xf32>, tensor<4xf32>) -> tensor<4x4xf32> - - // CHECK-NEXT: %1 = "xla_hlo.multiply"(%0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, name = "mul.4"} : (tensor<4x4xf32>, tensor<4xf32>) -> tensor<4x4xf32> - %1 = "xla_hlo.multiply"(%0, %arg1) { - name = "mul.4", broadcast_dimensions = dense<1> : tensor<1xi64>} : - (tensor<4x4xf32>, tensor<4xf32>) -> tensor<4x4xf32> - - // CHECK-NEXT: %2 = "xla_hlo.subtract"(%1, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, name = "sub.5"} : (tensor<4x4xf32>, tensor<4xf32>) -> tensor<4x4xf32> - %2 = "xla_hlo.subtract"(%1, %arg1) { - name = "sub.5", broadcast_dimensions = dense<1> : tensor<1xi64>} : - (tensor<4x4xf32>, tensor<4xf32>) -> tensor<4x4xf32> - - // CHECK-NEXT: %3 = "xla_hlo.divide"(%2, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, name = "div.6"} : (tensor<4x4xf32>, tensor<4xf32>) -> tensor<4x4xf32> - %3 = "xla_hlo.divide"(%2, %arg1) { - name = "div.6", broadcast_dimensions = dense<1> : tensor<1xi64>} : - (tensor<4x4xf32>, tensor<4xf32>) -> tensor<4x4xf32> - - // CHECK-NEXT: %4 = "xla_hlo.remainder"(%3, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<4x4xf32>, tensor<4xf32>) -> tensor<4x4xf32> - %4 = "xla_hlo.remainder"(%3, %arg1) { - broadcast_dimensions = dense<1> : tensor<1xi64>} : - (tensor<4x4xf32>, tensor<4xf32>) -> tensor<4x4xf32> - - // CHECK-NEXT: return %4 : tensor<4x4xf32> - return %4 : tensor<4x4xf32> -} - // CHECK-LABEL: func @compare_int(%arg0: tensor<4xi32>) -> (tensor<4xi1>, tensor<4xi1>, tensor<4xi1>, tensor<4xi1>, tensor<4xi1>, tensor<4xi1>) { func @compare_int(%arg0: tensor<4xi32>) -> (tensor<4xi1>,tensor<4xi1>,tensor<4xi1>,tensor<4xi1>,tensor<4xi1>,tensor<4xi1>) { // CHECK-NEXT: %0 = cmpi "eq", %arg0, %arg0 : tensor<4xi32> diff --git a/tensorflow/compiler/mlir/xla/tests/lhlo-fuse-linalg.mlir b/tensorflow/compiler/mlir/xla/tests/lhlo-fuse-linalg.mlir index 013748fea28..99b1766e73c 100644 --- a/tensorflow/compiler/mlir/xla/tests/lhlo-fuse-linalg.mlir +++ b/tensorflow/compiler/mlir/xla/tests/lhlo-fuse-linalg.mlir @@ -24,9 +24,9 @@ func @fusion(%multiplier: memref<6x6xf32>, %summand_1: memref<6x6xf32>, // CHECK-LABEL: func @fusion // CHECK: %[[C1:.*]] = constant 1 // CHECK-NOT: linalg.generic -// CHECK: loop.for {{.*}} step %[[C1]] -// CHECK: loop.for {{.*}} step %[[C1]] -// CHECK-NOT: loop.for +// CHECK: scf.for {{.*}} step %[[C1]] +// CHECK: scf.for {{.*}} step %[[C1]] +// CHECK-NOT: scf.for // CHECK: linalg.generic // CHECK: addf // CHECK: linalg.generic @@ -36,9 +36,9 @@ func @fusion(%multiplier: memref<6x6xf32>, %summand_1: memref<6x6xf32>, // TILED-DAG: %[[C2:.*]] = constant 2 // TILED-DAG: %[[C3:.*]] = constant 3 // TILED-NOT: linalg.generic -// TILED: loop.for {{.*}} step %[[C2]] -// TILED: loop.for {{.*}} step %[[C3]] -// TILED-NOT: loop.for +// TILED: scf.for {{.*}} step %[[C2]] +// TILED: scf.for {{.*}} step %[[C3]] +// TILED-NOT: scf.for // TILED: linalg.generic // TILED: addf // TILED: linalg.generic @@ -46,8 +46,8 @@ func @fusion(%multiplier: memref<6x6xf32>, %summand_1: 
memref<6x6xf32>, // PLOOP-LABEL: func @fusion // PLOOP-NOT: linalg.generic -// PLOOP: loop.parallel -// PLOOP-NOT: loop.parallel +// PLOOP: scf.parallel +// PLOOP-NOT: scf.parallel // PLOOP: linalg.generic // PLOOP: addf // PLOOP: linalg.generic @@ -94,9 +94,9 @@ func @fusion_of_three(%arg0: memref<100x10xf32>, // CHECK-LABEL: func @fusion // CHECK: %[[C1:.*]] = constant 1 // CHECK-NOT: linalg.generic -// CHECK: loop.for {{.*}} step %[[C1]] -// CHECK: loop.for {{.*}} step %[[C1]] -// CHECK-NOT: loop.for +// CHECK: scf.for {{.*}} step %[[C1]] +// CHECK: scf.for {{.*}} step %[[C1]] +// CHECK-NOT: scf.for // CHECK: linalg.generic // CHECK: linalg.generic // CHECK: subf @@ -107,9 +107,9 @@ func @fusion_of_three(%arg0: memref<100x10xf32>, // TILED-DAG: %[[C2:.*]] = constant 2 // TILED-DAG: %[[C3:.*]] = constant 3 // TILED-NOT: linalg.generic -// TILED: loop.for {{.*}} step %[[C2]] -// TILED: loop.for {{.*}} step %[[C3]] -// TILED-NOT: loop.for +// TILED: scf.for {{.*}} step %[[C2]] +// TILED: scf.for {{.*}} step %[[C3]] +// TILED-NOT: scf.for // TILED: linalg.generic // TILED: linalg.generic // TILED: subf @@ -118,8 +118,8 @@ func @fusion_of_three(%arg0: memref<100x10xf32>, // PLOOP-LABEL: func @fusion_of_three // PLOOP-NOT: linalg.generic -// PLOOP: loop.parallel -// PLOOP-NOT: loop.parallel +// PLOOP: scf.parallel +// PLOOP-NOT: scf.parallel // PLOOP: linalg.generic // PLOOP: linalg.generic // PLOOP: subf @@ -147,11 +147,11 @@ func @fusion_4d(%multiplier: memref<6x6x6x6xf32>, %summand_1: memref<6x6x6x6xf32 // CHECK-LABEL: func @fusion_4d // CHECK: %[[C1:.*]] = constant 1 // CHECK-NOT: linalg.generic -// CHECK: loop.for {{.*}} step %[[C1]] -// CHECK: loop.for {{.*}} step %[[C1]] -// CHECK: loop.for {{.*}} step %[[C1]] -// CHECK: loop.for {{.*}} step %[[C1]] -// CHECK-NOT: loop.for +// CHECK: scf.for {{.*}} step %[[C1]] +// CHECK: scf.for {{.*}} step %[[C1]] +// CHECK: scf.for {{.*}} step %[[C1]] +// CHECK: scf.for {{.*}} step %[[C1]] +// CHECK-NOT: scf.for // CHECK: linalg.generic // CHECK: addf // CHECK: linalg.generic @@ -161,9 +161,9 @@ func @fusion_4d(%multiplier: memref<6x6x6x6xf32>, %summand_1: memref<6x6x6x6xf32 // TILED-DAG: %[[C2:.*]] = constant 2 // TILED-DAG: %[[C3:.*]] = constant 3 // TILED-NOT: linalg.generic -// TILED: loop.for {{.*}} step %[[C2]] -// TILED: loop.for {{.*}} step %[[C3]] -// TILED-NOT: loop.for +// TILED: scf.for {{.*}} step %[[C2]] +// TILED: scf.for {{.*}} step %[[C3]] +// TILED-NOT: scf.for // TILED: linalg.generic // TILED: addf // TILED: linalg.generic @@ -171,8 +171,8 @@ func @fusion_4d(%multiplier: memref<6x6x6x6xf32>, %summand_1: memref<6x6x6x6xf32 // PLOOP-LABEL: func @fusion_4d // PLOOP-NOT: linalg.generic -// PLOOP: loop.parallel -// PLOOP-NOT: loop.parallel +// PLOOP: scf.parallel +// PLOOP-NOT: scf.parallel // PLOOP: linalg.generic // PLOOP: addf // PLOOP: linalg.generic diff --git a/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-select-and-scatter.mlir b/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-select-and-scatter.mlir new file mode 100644 index 00000000000..c640b395f4d --- /dev/null +++ b/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-select-and-scatter.mlir @@ -0,0 +1,199 @@ +// GenericAtomicRMWOp should contain only ops with no side effects. +// Unfortunately, the legalization pattern for SelectAndScatterOp has to adapt +// to XLA LHLO dialect using allocs/deallocs inside of GenericAtomicRMWOp body. +// Lowering to STD dialect and store forwarding pass would be required to get +// rid of them. 
This is exactly what is done in the real MLIR GPU pipeline, but +// here we disable verification with `verify-each=0` to check the output IR. +// RUN: xla-opt %s -lhlo-legalize-to-parallel-loops -canonicalize --verify-each=0 | FileCheck %s --dump-input-on-failure + +func @select_and_scatter(%arg: memref<112x112xf32>, + %src: memref<56x56xf32>, + %init: memref, + %result: memref<112x112xf32>) { + "xla_lhlo.select_and_scatter"(%arg, %src, %init, %result) ( { + // select + ^bb0(%lhs: memref, %rhs: memref, %pred: memref): + "xla_lhlo.compare"(%lhs, %rhs, %pred) {comparison_direction = "GE"} : + (memref, memref, memref) -> () + "xla_lhlo.terminator"() : () -> () + }, { + // scatter + ^bb0(%lhs: memref, %rhs: memref, %out: memref): + "xla_lhlo.add"(%lhs, %rhs, %out) : + (memref, memref, memref) -> () + "xla_lhlo.terminator"() : () -> () + }) { + padding = dense<[[0, 1], [0, 1]]> : tensor<2x2xi64>, + window_dimensions = dense<[3, 3]> : tensor<2xi64>, + window_strides = dense<[2, 2]> : tensor<2xi64> + } : (memref<112x112xf32>, + memref<56x56xf32>, + memref, memref<112x112xf32>) -> () + "xla_lhlo.terminator"() : () -> () +} +// CHECK-LABEL: func @select_and_scatter( +// CHECK-SAME: [[ARG_BUF:%.*]]: memref<112x112xf32>, +// CHECK-SAME: [[SRC_BUF:%.*]]: memref<56x56xf32>, +// CHECK-SAME: [[INIT_BUF:%.*]]: memref, +// CHECK-SAME: [[RESULT_BUF:%.*]]: memref<112x112xf32>) { + +// Constants. +// CHECK: [[C56:%.*]] = constant 56 : index +// CHECK: [[C1:%.*]] = constant 1 : index +// CHECK: [[C0_F32:%.*]] = constant 0.000000e+00 : f32 +// CHECK: [[CFALSE:%.*]] = constant 0 : i1 +// CHECK: [[C3:%.*]] = constant 3 : index +// CHECK: [[C2:%.*]] = constant 2 : index +// CHECK: [[C0:%.*]] = constant 0 : index +// CHECK: [[C112:%.*]] = constant 112 : index +// CHECK: [[CTRUE:%.*]] = constant 1 : i1 + +// Parallel loop to initialize the output buffer. +// CHECK: [[INIT:%.*]] = load [[INIT_BUF]][] : memref +// CHECK: scf.parallel ([[I:%.*]], [[J:%.*]]) = ([[C0]], [[C0]]) +// CHECK-SAME: to ([[C112]], [[C112]]) step ([[C1]], [[C1]]) { +// CHECK: store [[INIT]], [[RESULT_BUF]]{{\[}}[[I]], [[J]]] +// CHECK: scf.yield +// CHECK: } + +// Parallel loop over source buffer to compute scattered values. +// CHECK: scf.parallel ([[II:%.*]], [[JJ:%.*]]) = ([[C0]], [[C0]]) +// CHECK-SAME: to ([[C56]], [[C56]]) step ([[C1]], [[C1]]) { + +// Window loop w.r.t. first dim. +// CHECK: [[SEL_RES_I:%.*]]:4 +// CHECK-SAME: = scf.for [[WIN_I:%.*]] = [[C0]] to [[C3]] step [[C1]] +// CHECK-SAME: iter_args( +// CHECK-SAME: [[SEL_I_0:%.*]] = [[C0]], [[SEL_J_0:%.*]] = [[C0]], +// CHECK-SAME: [[SEL_VAL_0:%.*]] = [[C0_F32]], +// CHECK-SAME: [[SEL_INIT_0:%.*]] = [[CFALSE]] +// CHECK-SAME: ) -> (index, index, f32, i1) { + +// Window loop w.r.t. second dim. +// CHECK: [[SEL_RES_J:%.*]]:4 +// CHECK-SAME: = scf.for [[WIN_J:%.*]] = [[C0]] to [[C3]] step [[C1]] +// CHECK-SAME: iter_args( +// CHECK-SAME: [[SEL_I:%.*]] = [[SEL_I_0]], [[SEL_J:%.*]] = [[SEL_J_0]], +// CHECK-SAME: [[SEL_VAL:%.*]] = [[SEL_VAL_0]], +// CHECK-SAME: [[SEL_INIT:%.*]] = [[SEL_INIT_0]] +// CHECK-SAME: ) -> (index, index, f32, i1) { + +// Compute index I of the ARG buffer and check whether it is in padding area. +// CHECK: [[START_I:%.*]] = muli [[II]], [[C2]] : index +// CHECK: [[OFFSET_I:%.*]] = subi [[WIN_I]], [[C0]] : index +// CHECK: [[ARG_I:%.*]] = addi [[START_I]], [[OFFSET_I]] : index +// CHECK: [[ARG_I_FITS:%.*]] = cmpi "ult", [[ARG_I]], [[C112]] : index + +// Update `INBOUNDS`, i.e. 
whether or not ARG indices are inside the boundaries +// of the buffer or they are in the padding area. +// CHECK: [[INBOUNDS_0:%.*]] = and [[ARG_I_FITS]], [[CTRUE]] : i1 + +// Compute index J of the ARG buffer and check whether it is in padding area. +// CHECK: [[START_J:%.*]] = muli [[JJ]], [[C2]] : index +// CHECK: [[OFFSET_J:%.*]] = subi [[WIN_J]], [[C0]] : index +// CHECK: [[ARG_J:%.*]] = addi [[START_J]], [[OFFSET_J]] : index +// CHECK: [[ARG_J_FITS:%.*]] = cmpi "ult", [[ARG_J]], [[C112]] : index + +// Update `INBOUNDS`, i.e. whether or not ARG indices are inside the boundaries +// of the buffer or they are in the padding area. +// CHECK: [[INBOUNDS_1:%.*]] = and [[INBOUNDS_0]], [[ARG_J_FITS]] : i1 + +// If ARG ivs are in the padding area, then 'select' function does not have to +// be applied, current selected ivs (SEL_I, SEL_J) and value (SEL_VAL) are +// returned in that case. +// CHECK: [[IF_INBOUNDS_RES:%.*]]:4 +// CHECK-SAME: = scf.if [[INBOUNDS_1]] -> (index, index, f32, i1) { + + + // INBOUNDS-THEN-BODY, i.e. if INBOUNDS == true + + // CHECK: [[ARG_ELEM:%.*]] = load [[ARG_BUF]]{{\[}}[[ARG_I]], [[ARG_J]]] + // CHECK: [[IF_INIT_RES:%.*]]:4 + // CHECK-SAME: = scf.if [[SEL_INIT]] -> (index, index, f32, i1) { + + // INIT-THEN-BODY, i.e. INBOUNDS == true and INIT = true + + // The LHLO IR of the select block of the lhlo.select_and_scatter is applied + // to the current selected value (SEL_VAL) and the element of the ARG buffer + // to compute boolean PRED, whether the new value and ivs should replace the + // current ones. + + // Allocate buffers for ARG element, current selected value to adapt LHLO + // code. + // CHECK: [[ARG_ELEM_BUF:%.*]] = alloc() : memref + // CHECK: [[SEL_VAL_BUF:%.*]] = alloc() : memref + // CHECK: [[PRED_BUF:%.*]] = alloc() : memref + // CHECK: store [[ARG_ELEM]], [[ARG_ELEM_BUF]][] : memref + // CHECK: store [[SEL_VAL]], [[SEL_VAL_BUF]][] : memref + + // Compute PRED. + // CHECK: "xla_lhlo.compare"( + // CHECK-SAME: [[ARG_ELEM_BUF]], [[SEL_VAL_BUF]], [[PRED_BUF]]) + // CHECK: [[PRED:%.*]] = load [[PRED_BUF]][] : memref + + + // Depending on PRED, return ARG ivs & elem or current select ivs and value. + // CHECK: [[IF_PRED_RES:%.*]]:4 = scf.if [[PRED]] + // CHECK: scf.yield [[ARG_I]], [[ARG_J]], [[ARG_ELEM]], [[CTRUE]] + // CHECK: } else { + // CHECK: scf.yield [[SEL_I]], [[SEL_J]], [[SEL_VAL]], [[SEL_INIT]] + // CHECK: } + + // INIT-THEN-BODY yield. + // CHECK: scf.yield [[IF_PRED_RES]]#0, [[IF_PRED_RES]]#1, + // CHECK-SAME: [[IF_PRED_RES]]#2, [[IF_PRED_RES]]#3 + + // INIT-ELSE-BODY, i.e. if INBOUNDS == TRUE and INIT == FALSE, returns ARG + // ivs and element without computing Select function. + // CHECK: scf.yield [[ARG_I]], [[ARG_J]], [[ARG_ELEM]], + // CHECK-SAME: [[CTRUE]] : index, index, f32, i1 + // CHECK: } + + // INBOUNDS-THEN-BODY yield. + // CHECK: scf.yield [[IF_INIT_RES]]#0, [[IF_INIT_RES]]#1, [[IF_INIT_RES]]#2, + // CHECK-SAME: [[IF_INIT_RES]]#3 : index, index, f32, i1 + // CHECK: } + + // INBOUNDS-ELSE-REGION, i.e. if INBOUNDS == FALSE + // We are in the pad area, return current iter_args. + // CHECK: scf.yield [[SEL_I]], [[SEL_J]], [[SEL_VAL]], + // CHECK-SAME: [[SEL_INIT]] : index, index, f32, i1 + // CHECK: } + +// Window loop w.r.t. second dim yield. +// CHECK: scf.yield [[IF_INBOUNDS_RES]]#0, [[IF_INBOUNDS_RES]]#1, +// CHECK-SAME: [[IF_INBOUNDS_RES]]#2, [[IF_INBOUNDS_RES]]#3 +// CHECK: } + +// Window loop w.r.t. first dim yield. 
+// CHECK: scf.yield [[SEL_RES_J]]#0, [[SEL_RES_J]]#1, [[SEL_RES_J]]#2, +// CHECK-SAME: [[SEL_RES_J]]#3 : index, index, f32, i1 +// CHECK: } + +// Use selected ivs to load element from the SRC buffer. +// CHECK: [[SRC_ELEM:%.*]] = load [[SRC_BUF]]{{\[}}[[II]], [[JJ]]] + +// Update of RESULT[SELECTED_I, SELECTED_J] should be done atomically, because +// it may happen that several other threads select the same IVs if the windows +// overlap. +// CHECK: generic_atomic_rmw [[RESULT_BUF]]{{\[}}[[SEL_RES_I]]#0, +// CHECK-SAME: [[SEL_RES_I]]#1] : memref<112x112xf32> +// CHECK: ^bb0([[CUR_RES:%.*]]: f32): + +// Allocate buffers for ARG element, current selected value to adapt LHLO code. +// CHECK: [[SRC_ELEM_BUF:%.*]] = alloc() : memref +// CHECK: [[CUR_RES_BUF:%.*]] = alloc() : memref +// CHECK: [[RES_BUF:%.*]] = alloc() : memref +// CHECK: store [[SRC_ELEM]], [[SRC_ELEM_BUF]][] : memref +// CHECK: store [[CUR_RES]], [[CUR_RES_BUF]][] : memref + +// Compute scatter value. +// CHECK: "xla_lhlo.add"([[SRC_ELEM_BUF]], [[CUR_RES_BUF]], [[RES_BUF]]) : +// CHECK-SAME: (memref, memref, memref) -> () +// CHECK: [[RES:%.*]] = load [[RES_BUF]][] : memref + +// Atomic RMW terminator that returns updated value. +// CHECK: atomic_yield [[RES]] : f32 + +// Parallel loop over source buffer yield +// CHECK: scf.yield diff --git a/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-gpu.mlir b/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-gpu.mlir index 4d878cee6f4..16ffbf241b0 100644 --- a/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-gpu.mlir +++ b/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-gpu.mlir @@ -22,7 +22,7 @@ func @reduce(%arg: memref<100x10xf32>, // CHECK-DAG: %[[LB:.*]] = constant 0 : index // CHECK-DAG: %[[UB:.*]] = constant 10 : index // CHECK-DAG: %[[STEP:.*]] = constant 1 : index -// CHECK: loop.for %[[IDX1:.*]] = %[[LB]] to %[[UB]] step %[[STEP]] { +// CHECK: scf.for %[[IDX1:.*]] = %[[LB]] to %[[UB]] step %[[STEP]] { // CHECK: %[[LHS:.*]] = linalg.slice %[[ARG2]][%[[IDX]]] : memref<100xf32>, index, memref // CHECK: %[[RHS:.*]] = linalg.slice %[[ARG0]][%[[IDX]], %[[IDX1]]] : memref<100x10xf32>, index, index, memref // CHECK: "xla_lhlo.add"(%[[LHS]], %[[RHS]], %[[LHS]]) : (memref, memref, memref) -> () diff --git a/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-linalg.mlir b/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-linalg.mlir index a070dac9836..626e905695c 100644 --- a/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-linalg.mlir +++ b/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-linalg.mlir @@ -3,7 +3,7 @@ // CHECK: #map0 = affine_map<(d0, d1) -> (d0, d1)> // CHECK-LABEL: func @element_wise func @element_wise(%lhs: memref<2x2xf32>, %rhs: memref<2x2xf32>, - %result: memref<2x2xf32>) { + %result: memref<2x2xf32>) { "xla_lhlo.add"(%lhs, %rhs, %result) : (memref<2x2xf32>, memref<2x2xf32>, memref<2x2xf32>) -> () return @@ -16,8 +16,9 @@ func @element_wise(%lhs: memref<2x2xf32>, %rhs: memref<2x2xf32>, // ----- // CHECK-LABEL: func @element_wise_with_dynamic_shape -func @element_wise_with_dynamic_shape(%lhs: memref, %rhs: memref, - %result: memref) { +func @element_wise_with_dynamic_shape(%lhs: memref, + %rhs: memref, + %result: memref) { "xla_lhlo.add"(%lhs, %rhs, %result) : (memref, memref, memref) -> () return @@ -31,22 +32,22 @@ func @element_wise_with_dynamic_shape(%lhs: memref, %rhs: memref, %rhs: memref, - %result: memref) { + %result: memref) { + "xla_lhlo.add"(%lhs, %rhs, %result) + : (memref, memref, memref) -> () + return +} // CHECK: %[[LHS:.*]] = 
load // CHECK: %[[RHS:.*]] = load // CHECK: %[[RES:.*]] = addf %[[LHS]], %[[RHS]] // CHECK: store %[[RES]] // CHECK-NEXT: return - "xla_lhlo.add"(%lhs, %rhs, %result) - : (memref, memref, memref) -> () - return -} // ----- // CHECK-LABEL: func @minf func @minf(%lhs: memref<2x2xf32>, %rhs: memref<2x2xf32>, - %result: memref<2x2xf32>) { + %result: memref<2x2xf32>) { "xla_lhlo.minimum"(%lhs, %rhs, %result) : (memref<2x2xf32>, memref<2x2xf32>, memref<2x2xf32>) -> () return @@ -61,7 +62,7 @@ func @minf(%lhs: memref<2x2xf32>, %rhs: memref<2x2xf32>, // CHECK-LABEL: func @maxi func @maxi(%lhs: memref<2x2xi32>, %rhs: memref<2x2xi32>, - %result: memref<2x2xi32>) { + %result: memref<2x2xi32>) { "xla_lhlo.maximum"(%lhs, %rhs, %result) : (memref<2x2xi32>, memref<2x2xi32>, memref<2x2xi32>) -> () return @@ -89,8 +90,7 @@ func @and(%lhs: memref<2x2xi32>, %rhs: memref<2x2xi32>, // ----- // CHECK-LABEL: func @exp -func @exp(%input: memref<2x2xf32>, - %result: memref<2x2xf32>) { +func @exp(%input: memref<2x2xf32>, %result: memref<2x2xf32>) { "xla_lhlo.exponential"(%input, %result) : (memref<2x2xf32>, memref<2x2xf32>) -> () return @@ -103,10 +103,8 @@ func @exp(%input: memref<2x2xf32>, // ----- // CHECK-LABEL: func @log -func @log(%input: memref<2x2xf32>, - %result: memref<2x2xf32>) { - "xla_lhlo.log"(%input, %result) - : (memref<2x2xf32>, memref<2x2xf32>) -> () +func @log(%input: memref<2x2xf32>, %result: memref<2x2xf32>) { + "xla_lhlo.log"(%input, %result) : (memref<2x2xf32>, memref<2x2xf32>) -> () return } // CHECK: linalg.generic @@ -117,10 +115,8 @@ func @log(%input: memref<2x2xf32>, // ----- // CHECK-LABEL: func @copy -func @copy(%input: memref<2x4x8xf32>, - %result: memref<2x4x8xf32>) { - "xla_lhlo.copy"(%input, %result) - : (memref<2x4x8xf32>, memref<2x4x8xf32>) -> () +func @copy(%in: memref<2x4x8xf32>, %out: memref<2x4x8xf32>) { + "xla_lhlo.copy"(%in, %out) : (memref<2x4x8xf32>, memref<2x4x8xf32>) -> () return } // CHECK: linalg.generic @@ -131,7 +127,7 @@ func @copy(%input: memref<2x4x8xf32>, // CHECK-LABEL: func @float_cmp func @float_cmp(%lhs: memref<2x2xf32>, %rhs: memref<2x2xf32>, - %result: memref<2x2xi1>) { + %result: memref<2x2xi1>) { "xla_lhlo.compare"(%lhs, %rhs, %result) {comparison_direction = "EQ"} : (memref<2x2xf32>, memref<2x2xf32>, memref<2x2xi1>) -> () return @@ -146,7 +142,8 @@ func @float_cmp(%lhs: memref<2x2xf32>, %rhs: memref<2x2xf32>, // CHECK-LABEL: func @int_cmp func @int_cmp(%lhs: memref<2x2xi32>, %rhs: memref<2x2xi32>, %result: memref<2x2xi1>) { - "xla_lhlo.compare"(%lhs, %rhs, %result) {comparison_direction = "LT"} : (memref<2x2xi32>, memref<2x2xi32>, memref<2x2xi1>) -> () + "xla_lhlo.compare"(%lhs, %rhs, %result) {comparison_direction = "LT"} + : (memref<2x2xi32>, memref<2x2xi32>, memref<2x2xi1>) -> () return } // CHECK: linalg.generic @@ -157,10 +154,10 @@ func @int_cmp(%lhs: memref<2x2xi32>, %rhs: memref<2x2xi32>, // ----- // CHECK-LABEL: func @select -func @select(%pred: memref<2x2xi1>, %lhs: memref<2x2xf32>, %rhs: memref<2x2xf32>, - %result: memref<2x2xf32>) { +func @select(%pred: memref<2x2xi1>, %lhs: memref<2x2xf32>, + %rhs: memref<2x2xf32>, %result: memref<2x2xf32>) { "xla_lhlo.select"(%pred, %lhs, %rhs, %result) - : (memref<2x2xi1>, memref<2x2xf32>, memref<2x2xf32>, memref<2x2xf32>) -> () + : (memref<2x2xi1>, memref<2x2xf32>, memref<2x2xf32>, memref<2x2xf32>) -> () return } // CHECK: linalg.generic @@ -184,23 +181,45 @@ func @iota(%out: memref<7x10xf32>) { // ----- -// CHECK: #[[RESULT_MAP:.*]] = affine_map<(d0, d1) -> (d0, d1)> -// CHECK-LABEL: func @iota -func 
@iota(%out: memref<7x10xi64>) { - "xla_lhlo.iota"(%out) {iota_dimension = 1 : i64} : (memref<7x10xi64>) -> () +// CHECK-DAG: #[[OPERAND_MAP:.+]] = affine_map<(d0, d1, d2) -> ()> +// CHECK-DAG: #[[RESULT_MAP:.+]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)> +// CHECK-LABEL: func @broadcast_scalar +func @broadcast_scalar(%operand: memref, %result: memref<4x2x1xf32>) { + "xla_lhlo.broadcast"(%operand, %result) { + broadcast_sizes = dense<[4, 2, 1]> : tensor<3xi64> + } : (memref, memref<4x2x1xf32>) -> () return } +// CHECK: linalg.generic {{{.*}}indexing_maps = [#[[OPERAND_MAP]], #[[RESULT_MAP]]] +// CHECK-NEXT: ^bb0(%[[OPERAND:.+]]: f32, %{{.+}}: f32): +// CHECK-NEXT: linalg.yield %[[OPERAND]] : f32 + +// ----- + +// CHECK-DAG: #[[OPERAND_MAP:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d3, d4, d5)> +// CHECK-DAG: #[[RESULT_MAP:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)> +// CHECK-LABEL: func @broadcast +func @broadcast(%operand: memref<4x?x16xf32>, + %result: memref<4x2x1x4x?x16xf32>) { + "xla_lhlo.broadcast"(%operand, %result) { + broadcast_sizes = dense<[4, 2, 1]> : tensor<3xi64> + } : (memref<4x?x16xf32>, memref<4x2x1x4x?x16xf32>) -> () + return +} +// CHECK: linalg.generic {{{.*}}indexing_maps = [#[[OPERAND_MAP]], #[[RESULT_MAP]]] +// CHECK-NEXT: ^bb0(%[[OPERAND:.+]]: f32, %{{.+}}: f32): +// CHECK-NEXT: linalg.yield %[[OPERAND]] : f32 // ----- // CHECK-DAG: #[[OPERAND_MAP:.*]] = affine_map<(d0, d1, d2, d3, d4) -> (d4, d0, d2)> // CHECK-DAG: #[[RESULT_MAP:.*]] = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)> -// CHECK-LABEL: func @dynamic_broadcast -func @dynamic_broadcast(%operand: memref, - %result: memref) { - "xla_lhlo.broadcast_in_dim"(%operand, %result) - {broadcast_dimensions = dense<[4,0,2]> : tensor<3xi64>} - : (memref, memref) -> () +// CHECK-LABEL: func @dynamic_broadcast_in_dim +func @dynamic_broadcast_in_dim(%operand: memref, + %result: memref) { + "xla_lhlo.broadcast_in_dim"(%operand, %result) { + broadcast_dimensions = dense<[4,0,2]> : tensor<3xi64> + } : (memref, memref) -> () return } // CHECK: linalg.generic {{{.*}}indexing_maps = [#[[OPERAND_MAP]], #[[RESULT_MAP]]] @@ -211,11 +230,12 @@ func @dynamic_broadcast(%operand: memref, // CHECK-DAG: #[[OPERAND_MAP:.*]] = affine_map<(d0, d1, d2, d3, d4) -> (d4, d0, 0)> // CHECK-DAG: #[[RESULT_MAP:.*]] = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)> -// CHECK-LABEL: func @broadcast -func @broadcast(%operand: memref<5x7x1xf32>, %result: memref<7x10x6x4x5xf32>) { - "xla_lhlo.broadcast_in_dim"(%operand, %result) - {broadcast_dimensions = dense<[4,0,2]> : tensor<3xi64>} - : (memref<5x7x1xf32>, memref<7x10x6x4x5xf32>) -> () +// CHECK-LABEL: func @broadcast_in_dim_with_expansion +func @broadcast_in_dim_with_expansion(%operand: memref<5x7x1xf32>, + %result: memref<7x10x6x4x5xf32>) { + "xla_lhlo.broadcast_in_dim"(%operand, %result) { + broadcast_dimensions = dense<[4,0,2]> : tensor<3xi64> + } : (memref<5x7x1xf32>, memref<7x10x6x4x5xf32>) -> () return } // CHECK: linalg.generic {{{.*}}indexing_maps = [#[[OPERAND_MAP]], #[[RESULT_MAP]]] @@ -226,11 +246,12 @@ func @broadcast(%operand: memref<5x7x1xf32>, %result: memref<7x10x6x4x5xf32>) { // CHECK-DAG: #[[RESULT_MAP_0:.*]] = affine_map<(d0, d1, d2) -> ()> // CHECK-DAG: #[[RESULT_MAP:.*]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)> -// CHECK-LABEL: func @broadcast_scalar -func @broadcast_scalar(%operand: memref, %result: memref<7x10x6xf32>) { - "xla_lhlo.broadcast_in_dim"(%operand, %result) - {broadcast_dimensions = dense<[]> : tensor<0xi64>} - 
: (memref, memref<7x10x6xf32>) -> () +// CHECK-LABEL: func @broadcast_in_dim_scalar +func @broadcast_in_dim_scalar(%operand: memref, + %result: memref<7x10x6xf32>) { + "xla_lhlo.broadcast_in_dim"(%operand, %result) { + broadcast_dimensions = dense<[]> : tensor<0xi64> + } : (memref, memref<7x10x6xf32>) -> () return } // CHECK: linalg.generic {{{.*}}indexing_maps = [#[[RESULT_MAP_0]], #[[RESULT_MAP]]] @@ -239,9 +260,26 @@ func @broadcast_scalar(%operand: memref, %result: memref<7x10x6xf32>) { // ----- +// CHECK-DAG: #[[OPERAND_MAP:.+]] = affine_map<(d0, d1) -> (d0)> +// CHECK-DAG: #[[RESULT_MAP:.+]] = affine_map<(d0, d1) -> (d0, d1)> +// CHECK-LABEL: func @broadcast_in_dim_with_one_to_one +func @broadcast_in_dim_with_one_to_one(%operand: memref<1xf32>, %result: memref<1x5xf32>) { + "xla_lhlo.broadcast_in_dim"(%operand, %result) { + broadcast_dimensions = dense<[0]> : tensor<1xi64> + } : (memref<1xf32>, memref<1x5xf32>) -> () + return +} +// CHECK: linalg.generic {{{.*}}indexing_maps = [#[[OPERAND_MAP]], #[[RESULT_MAP]]] +// CHECK-NEXT: ^bb0(%[[OPERAND:.+]]: f32, %{{.+}}: f32): +// CHECK-NEXT: linalg.yield %[[OPERAND]] : f32 + +// ----- + // CHECK-LABEL: func @constant func @constant(%value: memref) { - "xla_lhlo.constant"(%value) {value = dense<10> : tensor} : (memref) -> () + "xla_lhlo.constant"(%value) { + value = dense<10> : tensor + } : (memref) -> () return } // CHECK: %[[CONSTANT:.*]] = constant 10 : i32 @@ -249,11 +287,9 @@ func @constant(%value: memref) { // ----- -// CHECK-LABEL: func @abs -func @abs(%input: memref<2x2xf32>, - %result: memref<2x2xf32>) { - "xla_lhlo.abs"(%input, %result) - : (memref<2x2xf32>, memref<2x2xf32>) -> () +// CHECK-LABEL: func @absf +func @absf(%input: memref<2x2xf32>, %result: memref<2x2xf32>) { + "xla_lhlo.abs"(%input, %result) : (memref<2x2xf32>, memref<2x2xf32>) -> () return } // CHECK: linalg.generic @@ -263,10 +299,10 @@ func @abs(%input: memref<2x2xf32>, // ----- -func @abs(%input: memref<2x2xi32>, +// CHECK-LABEL: func @absi +func @absi(%input: memref<2x2xi32>, %result: memref<2x2xi32>) { - "xla_lhlo.abs"(%input, %result) - : (memref<2x2xi32>, memref<2x2xi32>) -> () + "xla_lhlo.abs"(%input, %result) : (memref<2x2xi32>, memref<2x2xi32>) -> () return } @@ -281,10 +317,8 @@ func @abs(%input: memref<2x2xi32>, // ----- // CHECK-LABEL: func @ceil -func @ceil(%input: memref<2x2xf32>, - %result: memref<2x2xf32>) { - "xla_lhlo.ceil"(%input, %result) - : (memref<2x2xf32>, memref<2x2xf32>) -> () +func @ceil(%input: memref<2x2xf32>, %result: memref<2x2xf32>) { + "xla_lhlo.ceil"(%input, %result) : (memref<2x2xf32>, memref<2x2xf32>) -> () return } // CHECK: linalg.generic @@ -295,10 +329,8 @@ func @ceil(%input: memref<2x2xf32>, // ----- // CHECK-LABEL: func @convert_i32_to_f32 -func @convert_i32_to_f32(%input: memref<2x2xi32>, - %result: memref<2x2xf32>) { - "xla_lhlo.convert"(%input, %result) - : (memref<2x2xi32>, memref<2x2xf32>) -> () +func @convert_i32_to_f32(%input: memref<2x2xi32>, %result: memref<2x2xf32>) { + "xla_lhlo.convert"(%input, %result) : (memref<2x2xi32>, memref<2x2xf32>) -> () return } // CHECK: linalg.generic @@ -311,8 +343,7 @@ func @convert_i32_to_f32(%input: memref<2x2xi32>, // CHECK-LABEL: func @convert_i16_to_i32 func @convert_i16_to_i32(%input: memref<2x2xi16>, %result: memref<2x2xi32>) { - "xla_lhlo.convert"(%input, %result) - : (memref<2x2xi16>, memref<2x2xi32>) -> () + "xla_lhlo.convert"(%input, %result) : (memref<2x2xi16>, memref<2x2xi32>) -> () return } // CHECK: linalg.generic @@ -323,10 +354,8 @@ func @convert_i16_to_i32(%input: 
memref<2x2xi16>, // ----- // CHECK-LABEL: func @convert_i32_to_i16 -func @convert_i32_to_i16(%input: memref<2x2xi32>, - %result: memref<2x2xi16>) { - "xla_lhlo.convert"(%input, %result) - : (memref<2x2xi32>, memref<2x2xi16>) -> () +func @convert_i32_to_i16(%input: memref<2x2xi32>, %result: memref<2x2xi16>) { + "xla_lhlo.convert"(%input, %result) : (memref<2x2xi32>, memref<2x2xi16>) -> () return } // CHECK: linalg.generic @@ -337,10 +366,8 @@ func @convert_i32_to_i16(%input: memref<2x2xi32>, // ----- // CHECK-LABEL: func @convert_f32_to_f64 -func @convert_f32_to_f64(%input: memref<2x2xf32>, - %result: memref<2x2xf64>) { - "xla_lhlo.convert"(%input, %result) - : (memref<2x2xf32>, memref<2x2xf64>) -> () +func @convert_f32_to_f64(%input: memref<2x2xf32>, %result: memref<2x2xf64>) { + "xla_lhlo.convert"(%input, %result) : (memref<2x2xf32>, memref<2x2xf64>) -> () return } // CHECK: linalg.generic @@ -351,10 +378,8 @@ func @convert_f32_to_f64(%input: memref<2x2xf32>, // ----- // CHECK-LABEL: func @convert_f64_to_f32 -func @convert_f64_to_f32(%input: memref<2x2xf64>, - %result: memref<2x2xf32>) { - "xla_lhlo.convert"(%input, %result) - : (memref<2x2xf64>, memref<2x2xf32>) -> () +func @convert_f64_to_f32(%input: memref<2x2xf64>, %result: memref<2x2xf32>) { + "xla_lhlo.convert"(%input, %result) : (memref<2x2xf64>, memref<2x2xf32>) -> () return } // CHECK: linalg.generic @@ -365,10 +390,8 @@ func @convert_f64_to_f32(%input: memref<2x2xf64>, // ----- // CHECK-LABEL: func @convert_i32_to_i32 -func @convert_i32_to_i32(%input: memref<2x2xi32>, - %result: memref<2x2xi32>) { - "xla_lhlo.convert"(%input, %result) - : (memref<2x2xi32>, memref<2x2xi32>) -> () +func @convert_i32_to_i32(%input: memref<2x2xi32>, %result: memref<2x2xi32>) { + "xla_lhlo.convert"(%input, %result) : (memref<2x2xi32>, memref<2x2xi32>) -> () return } // CHECK: linalg.generic @@ -378,10 +401,8 @@ func @convert_i32_to_i32(%input: memref<2x2xi32>, // ----- // CHECK-LABEL: func @convert_f32_to_f32 -func @convert_f32_to_f32(%input: memref<2x2xf32>, - %result: memref<2x2xf32>) { - "xla_lhlo.convert"(%input, %result) - : (memref<2x2xf32>, memref<2x2xf32>) -> () +func @convert_f32_to_f32(%input: memref<2x2xf32>, %result: memref<2x2xf32>) { + "xla_lhlo.convert"(%input, %result) : (memref<2x2xf32>, memref<2x2xf32>) -> () return } // CHECK: linalg.generic @@ -390,11 +411,22 @@ func @convert_f32_to_f32(%input: memref<2x2xf32>, // ----- +// CHECK-LABEL: func @convert_f32_to_i32 +func @convert_f32_to_i32(%input: memref<2x2xf32>, %result: memref<2x2xi32>) { + "xla_lhlo.convert"(%input, %result) + : (memref<2x2xf32>, memref<2x2xi32>) -> () + return +} +// CHECK: linalg.generic +// CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: f32, %[[RESULT_OUT:.*]]: i32): +// CHECK-NEXT: %[[RESULT:.*]] = fptosi %[[OPERAND_IN]] : f32 to i32 +// CHECK-NEXT: linalg.yield %[[RESULT]] : i32 + +// ----- + // CHECK-LABEL: func @cos -func @cos(%input: memref<2x2xf32>, - %result: memref<2x2xf32>) { - "xla_lhlo.cosine"(%input, %result) - : (memref<2x2xf32>, memref<2x2xf32>) -> () +func @cos(%input: memref<2x2xf32>, %result: memref<2x2xf32>) { + "xla_lhlo.cosine"(%input, %result) : (memref<2x2xf32>, memref<2x2xf32>) -> () return } // CHECK: linalg.generic @@ -404,28 +436,37 @@ func @cos(%input: memref<2x2xf32>, // ----- -// CHECK-LABEL: func @neg -func @neg(%input: memref<2x2xf32>, +// CHECK-LABEL: func @sin +func @sin(%input: memref<2x2xf32>, %result: memref<2x2xf32>) { - "xla_lhlo.negate"(%input, %result) + "xla_lhlo.sine"(%input, %result) : (memref<2x2xf32>, memref<2x2xf32>) -> () 
return } // CHECK: linalg.generic // CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: f32, %[[RESULT_OUT:.*]]): +// CHECK-NEXT: %[[RESULT:.*]] = sin %[[OPERAND_IN]] : f32 +// CHECK-NEXT: linalg.yield %[[RESULT]] : f32 + +// ----- + +// CHECK-LABEL: func @negf +func @negf(%input: memref<2x2xf32>, %result: memref<2x2xf32>) { + "xla_lhlo.negate"(%input, %result) : (memref<2x2xf32>, memref<2x2xf32>) -> () + return +} +// CHECK: linalg.generic +// CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: f32, %[[RESULT_OUT:.*]]): // CHECK-NEXT: %[[RESULT:.*]] = negf %[[OPERAND_IN]] : f32 // CHECK-NEXT: linalg.yield %[[RESULT]] : f32 // ----- -// CHECK-LABEL: func @neg -func @neg(%input: memref<2x2xi32>, - %result: memref<2x2xi32>) { - "xla_lhlo.negate"(%input, %result) - : (memref<2x2xi32>, memref<2x2xi32>) -> () +// CHECK-LABEL: func @negi +func @negi(%input: memref<2x2xi32>, %result: memref<2x2xi32>) { + "xla_lhlo.negate"(%input, %result) : (memref<2x2xi32>, memref<2x2xi32>) -> () return } - // CHECK: linalg.generic // CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: i32, %[[RESULT_OUT:.*]]): // CHECK-NEXT: %[[L0:.*]] = constant 0 : i32 @@ -436,7 +477,7 @@ func @neg(%input: memref<2x2xi32>, // CHECK-LABEL: func @rem func @remainder(%lhs: memref<2x2xf32>, %rhs: memref<2x2xf32>, - %result: memref<2x2xf32>) { + %result: memref<2x2xf32>) { "xla_lhlo.remainder"(%lhs, %rhs, %result) : (memref<2x2xf32>, memref<2x2xf32>, memref<2x2xf32>) -> () return @@ -449,10 +490,8 @@ func @remainder(%lhs: memref<2x2xf32>, %rhs: memref<2x2xf32>, // ----- // CHECK-LABEL: func @rsqrt -func @rsqrt(%input: memref<2x2xf32>, - %result: memref<2x2xf32>) { - "xla_lhlo.rsqrt"(%input, %result) - : (memref<2x2xf32>, memref<2x2xf32>) -> () +func @rsqrt(%input: memref<2x2xf32>, %result: memref<2x2xf32>) { + "xla_lhlo.rsqrt"(%input, %result) : (memref<2x2xf32>, memref<2x2xf32>) -> () return } // CHECK: linalg.generic @@ -463,10 +502,8 @@ func @rsqrt(%input: memref<2x2xf32>, // ----- // CHECK-LABEL: func @sign -func @sign(%input: memref<2x2xf32>, - %result: memref<2x2xf32>) { - "xla_lhlo.sign"(%input, %result) - : (memref<2x2xf32>, memref<2x2xf32>) -> () +func @sign(%input: memref<2x2xf32>, %result: memref<2x2xf32>) { + "xla_lhlo.sign"(%input, %result) : (memref<2x2xf32>, memref<2x2xf32>) -> () return } // CHECK: linalg.generic @@ -478,10 +515,8 @@ func @sign(%input: memref<2x2xf32>, // ----- // CHECK-LABEL: func @sqrt -func @sqrt(%input: memref<2x2xf32>, - %result: memref<2x2xf32>) { - "xla_lhlo.sqrt"(%input, %result) - : (memref<2x2xf32>, memref<2x2xf32>) -> () +func @sqrt(%input: memref<2x2xf32>, %result: memref<2x2xf32>) { + "xla_lhlo.sqrt"(%input, %result) : (memref<2x2xf32>, memref<2x2xf32>) -> () return } // CHECK: linalg.generic @@ -492,10 +527,8 @@ func @sqrt(%input: memref<2x2xf32>, // ----- // CHECK-LABEL: func @tanh -func @tanh(%input: memref<2x2xf32>, - %result: memref<2x2xf32>) { - "xla_lhlo.tanh"(%input, %result) - : (memref<2x2xf32>, memref<2x2xf32>) -> () +func @tanh(%input: memref<2x2xf32>, %result: memref<2x2xf32>) { + "xla_lhlo.tanh"(%input, %result) : (memref<2x2xf32>, memref<2x2xf32>) -> () return } // CHECK: linalg.generic @@ -503,6 +536,48 @@ func @tanh(%input: memref<2x2xf32>, // CHECK-NEXT: %[[RESULT:.*]] = tanh %[[OPERAND_IN]] : f32 // CHECK-NEXT: linalg.yield %[[RESULT]] : f32 +// ----- + +// CHECK-LABEL: func @complex +func @complex(%real: memref<2x2xf32>, + %imag: memref<2x2xf32>, + %cplx: memref<2x2xcomplex>) { + "xla_lhlo.complex"(%real, %imag, %cplx) + : (memref<2x2xf32>, memref<2x2xf32>, memref<2x2xcomplex>) -> () + return +} +// CHECK: 
linalg.generic +// CHECK-NEXT: ^bb0(%[[RE:.*]]: f32, %[[IM:.*]]: f32, %[[CP:.*]]: complex): +// CHECK-NEXT: %[[RESULT:.*]] = create_complex %[[RE]], %[[IM]] : complex +// CHECK-NEXT: linalg.yield %[[RESULT]] : complex + +// ----- + +// CHECK-LABEL: func @real +func @real(%cplx: memref<2x2xcomplex>, + %real: memref<2x2xf32>) { + "xla_lhlo.real"(%cplx, %real) + : (memref<2x2xcomplex>, memref<2x2xf32>) -> () + return +} +// CHECK: linalg.generic +// CHECK-NEXT: ^bb0(%[[CPLX_IN:.*]]: complex, %[[REAL_OUT:.*]]: f32): +// CHECK-NEXT: %[[REAL:.*]] = re %[[CPLX_IN:.*]] : complex +// CHECK-NEXT: linalg.yield %[[REAL]] : f32 + +// ----- + +// CHECK-LABEL: func @imag +func @imag(%cplx: memref<2x2xcomplex>, + %imag: memref<2x2xf32>) { + "xla_lhlo.imag"(%cplx, %imag) + : (memref<2x2xcomplex>, memref<2x2xf32>) -> () + return +} +// CHECK: linalg.generic +// CHECK-NEXT: ^bb0(%[[CPLX_IN:.*]]: complex, %[[IMAG_OUT:.*]]: f32): +// CHECK-NEXT: %[[IMAG:.*]] = im %[[CPLX_IN:.*]] : complex +// CHECK-NEXT: linalg.yield %[[IMAG]] : f32 // ----- @@ -532,7 +607,8 @@ func @slice(%operand: memref, %result: memref) { // CHECK-DAG: #[[RESULT_MAP:.*]] = affine_map<(d0, d1) -> (d0, d1)> // CHECK-LABEL: func @reshape_3D_2D func @reshape_3D_2D(%arg0: memref<12x1x42xi32>, %arg1 : memref<12x42xi32>) { - "xla_lhlo.reshape"(%arg0, %arg1) : (memref<12x1x42xi32>, memref<12x42xi32>) -> () + "xla_lhlo.reshape"(%arg0, %arg1) + : (memref<12x1x42xi32>, memref<12x42xi32>) -> () return } // CHECK: linalg.generic {{{.*}}indexing_maps = [#[[OPERAND_MAP]], #[[RESULT_MAP]]] @@ -543,7 +619,8 @@ func @reshape_3D_2D(%arg0: memref<12x1x42xi32>, %arg1 : memref<12x42xi32>) { // CHECK-DAG: #[[RESULT_MAP:.*]] = affine_map<(d0, d1) -> (d0, d1)> // CHECK-LABEL: func @reshape_4D_2D func @reshape_4D_2D(%arg0: memref<12x42x1x1xi32>, %arg1 : memref<12x42xi32>) { - "xla_lhlo.reshape"(%arg0, %arg1) : (memref<12x42x1x1xi32>, memref<12x42xi32>) -> () + "xla_lhlo.reshape"(%arg0, %arg1) + : (memref<12x42x1x1xi32>, memref<12x42xi32>) -> () return } // CHECK: linalg.generic {{{.*}}indexing_maps = [#[[OPERAND_MAP]], #[[RESULT_MAP]]] @@ -554,7 +631,21 @@ func @reshape_4D_2D(%arg0: memref<12x42x1x1xi32>, %arg1 : memref<12x42xi32>) { // CHECK-DAG: #[[RESULT_MAP:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> // CHECK-LABEL: func @reshape_2D_4D func @reshape_2D_4D(%arg0: memref<12x42xi32>, %arg1 : memref<12x1x42x1xi32>) { - "xla_lhlo.reshape"(%arg0, %arg1) : (memref<12x42xi32>, memref<12x1x42x1xi32>) -> () + "xla_lhlo.reshape"(%arg0, %arg1) + : (memref<12x42xi32>, memref<12x1x42x1xi32>) -> () + return +} +// CHECK: linalg.generic {{{.*}}indexing_maps = [#[[OPERAND_MAP]], #[[RESULT_MAP]]] + +// ----- + +// CHECK-DAG: #[[OPERAND_MAP:.*]] = affine_map<(d0, d1) -> (d0, -d1 + 2)> +// CHECK-DAG: #[[RESULT_MAP:.*]] = affine_map<(d0, d1) -> (d0, d1)> +// CHECK-LABEL: func @reverse +func @reverse(%arg0: memref<2x3xf32>, %arg1: memref<2x3xf32>) { + "xla_lhlo.reverse"(%arg0, %arg1) { + dimensions = dense<1> : tensor<1xi64> + } : (memref<2x3xf32>, memref<2x3xf32>) -> () return } // CHECK: linalg.generic {{{.*}}indexing_maps = [#[[OPERAND_MAP]], #[[RESULT_MAP]]] diff --git a/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-parallel-loops.mlir b/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-parallel-loops.mlir index e1f0d5c8682..32c367f97d6 100644 --- a/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-parallel-loops.mlir +++ b/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-parallel-loops.mlir @@ -22,13 +22,13 @@ func @reduce(%arg: memref<100x10x5xf32>, // 
CHECK-DAG: [[C10:%.*]] = constant 10 : index // CHECK-DAG: [[C100:%.*]] = constant 100 : index // CHECK: [[INIT:%.*]] = load [[INIT_BUF]] -// CHECK: loop.parallel ([[I:%.*]], [[K:%.*]]) = ([[C0]], [[C0]]) +// CHECK: scf.parallel ([[I:%.*]], [[K:%.*]]) = ([[C0]], [[C0]]) // CHECK-SAME: to ([[C100]], [[C5]]) step ([[C1]], [[C1]]) { -// CHECK: [[REDUCTION_RESULT:%.*]] = loop.parallel ([[J:%.*]]) = +// CHECK: [[REDUCTION_RESULT:%.*]] = scf.parallel ([[J:%.*]]) = // CHECK-SAME: ([[C0]]) to ([[C10]]) step ([[C1]]) init ([[INIT]]) -> f32 { // CHECK: [[ELEM_TO_REDUCE:%.*]] = load [[ARG_BUF]] // CHECK-SAME: {{\[}}[[I]], [[J]], [[K]]] : memref<100x10x5xf32> -// CHECK: loop.reduce([[ELEM_TO_REDUCE]]) : f32 { +// CHECK: scf.reduce([[ELEM_TO_REDUCE]]) : f32 { // CHECK: ^bb0([[ELEM:%.*]]: f32, [[ACC:%.*]]: f32): // CHECK: [[ELEM_BUF:%.*]] = alloc() : memref // CHECK: [[ACC_BUF:%.*]] = alloc() : memref @@ -37,12 +37,12 @@ func @reduce(%arg: memref<100x10x5xf32>, // CHECK: store [[ACC]], [[ACC_BUF]][] : memref // CHECK: "xla_lhlo.add"([[ELEM_BUF]], [[ACC_BUF]], [[ACC_OUT_BUF]]) // CHECK: [[ACC_RESULT:%.*]] = load [[ACC_OUT_BUF]][] : memref -// CHECK: loop.reduce.return [[ACC_RESULT]] : f32 +// CHECK: scf.reduce.return [[ACC_RESULT]] : f32 // CHECK: } -// CHECK: loop.yield +// CHECK: scf.yield // CHECK: } // CHECK: store [[REDUCTION_RESULT]], [[RESULT_BUF]]{{\[}}[[I]], [[K]]] -// CHECK: loop.yield +// CHECK: scf.yield // ----- @@ -66,10 +66,10 @@ func @reduce_no_outer_loop(%arg: memref<100xf32>, // CHECK-DAG: [[C1:%.*]] = constant 1 : index // CHECK-DAG: [[C100:%.*]] = constant 100 : index // CHECK: [[INIT:%.*]] = load [[INIT_BUF]] -// CHECK: [[REDUCTION_RESULT:%.*]] = loop.parallel ([[I:%.*]]) = ([[C0]]) +// CHECK: [[REDUCTION_RESULT:%.*]] = scf.parallel ([[I:%.*]]) = ([[C0]]) // CHECK-SAME: to ([[C100]]) step ([[C1]]) init ([[INIT]]) -> f32 { // CHECK: [[ELEM_TO_REDUCE:%.*]] = load [[ARG_BUF]]{{\[}}[[I]]{{\]}} -// CHECK: loop.reduce([[ELEM_TO_REDUCE]]) : f32 { +// CHECK: scf.reduce([[ELEM_TO_REDUCE]]) : f32 { // CHECK: ^bb0([[ELEM:%.*]]: f32, [[ACC:%.*]]: f32): // CHECK: [[ELEM_BUF:%.*]] = alloc() : memref // CHECK: [[ACC_BUF:%.*]] = alloc() : memref @@ -78,9 +78,9 @@ func @reduce_no_outer_loop(%arg: memref<100xf32>, // CHECK: store [[ACC]], [[ACC_BUF]][] : memref // CHECK: "xla_lhlo.add"([[ELEM_BUF]], [[ACC_BUF]], [[ACC_OUT_BUF]]) // CHECK: [[ACC_RESULT:%.*]] = load [[ACC_OUT_BUF]][] : memref -// CHECK: loop.reduce.return [[ACC_RESULT]] +// CHECK: scf.reduce.return [[ACC_RESULT]] // CHECK: } -// CHECK: loop.yield +// CHECK: scf.yield // CHECK: store [[REDUCTION_RESULT]], [[RESULT_BUF]]{{\[}}[[C0]]] // ----- @@ -107,13 +107,13 @@ func @dynamic_reduce(%arg: memref, // CHECK: [[DIM1:%.*]] = dim [[ARG_BUF]], 1 : memref // CHECK: [[DIM2:%.*]] = dim [[ARG_BUF]], 2 : memref // CHECK: [[INIT:%.*]] = load [[INIT_BUF]] -// CHECK: loop.parallel ([[I:%.*]], [[K:%.*]]) = ([[C0]], [[C0]]) +// CHECK: scf.parallel ([[I:%.*]], [[K:%.*]]) = ([[C0]], [[C0]]) // CHECK-SAME: to ([[DIM0]], [[DIM2]]) step ([[C1]], [[C1]]) { -// CHECK: [[REDUCTION_RESULT:%.*]] = loop.parallel ([[J:%.*]]) = +// CHECK: [[REDUCTION_RESULT:%.*]] = scf.parallel ([[J:%.*]]) = // CHECK-SAME: ([[C0]]) to ([[DIM1]]) step ([[C1]]) init ([[INIT]]) -> f32 { // CHECK: [[ELEM_TO_REDUCE:%.*]] = load [[ARG_BUF]] // CHECK-SAME: {{\[}}[[I]], [[J]], [[K]]] : memref -// CHECK: loop.reduce([[ELEM_TO_REDUCE]]) : f32 { +// CHECK: scf.reduce([[ELEM_TO_REDUCE]]) : f32 { // CHECK: ^bb0([[ELEM:%.*]]: f32, [[ACC:%.*]]: f32): // CHECK: [[ELEM_BUF:%.*]] = alloc() : memref 
// CHECK: [[ACC_BUF:%.*]] = alloc() : memref @@ -122,12 +122,12 @@ func @dynamic_reduce(%arg: memref, // CHECK: store [[ACC]], [[ACC_BUF]][] : memref // CHECK: "xla_lhlo.add"([[ELEM_BUF]], [[ACC_BUF]], [[ACC_OUT_BUF]]) // CHECK: [[ACC_RESULT:%.*]] = load [[ACC_OUT_BUF]][] : memref -// CHECK: loop.reduce.return [[ACC_RESULT]] : f32 +// CHECK: scf.reduce.return [[ACC_RESULT]] : f32 // CHECK: } -// CHECK: loop.yield +// CHECK: scf.yield // CHECK: } // CHECK: store [[REDUCTION_RESULT]], [[RESULT_BUF]]{{\[}}[[I]], [[K]]] -// CHECK: loop.yield +// CHECK: scf.yield // ----- @@ -136,7 +136,7 @@ func @reduce_window(%arg: memref<112x112xf32>, %result: memref<56x56xf32>) { "xla_lhlo.reduce_window"(%arg, %init, %result) ( { ^bb0(%lhs: memref, %rhs: memref, %res: memref): - "xla_lhlo.maximum"(%lhs, %rhs, %res) + "xla_lhlo.maximum"(%lhs, %rhs, %res) : (memref, memref, memref) -> () "xla_lhlo.terminator"() : () -> () }) { @@ -158,9 +158,9 @@ func @reduce_window(%arg: memref<112x112xf32>, // CHECK-DAG: [[C56:%.*]] = constant 56 : index // CHECK-DAG: [[C112:%.*]] = constant 112 : index // CHECK: [[INIT:%.*]] = load [[INIT_BUF]][] : memref -// CHECK: loop.parallel ([[I:%.*]], [[J:%.*]]) = ([[C0]], [[C0]]) +// CHECK: scf.parallel ([[I:%.*]], [[J:%.*]]) = ([[C0]], [[C0]]) // CHECK-SAME: to ([[C56]], [[C56]]) step ([[C1]], [[C1]]) { -// CHECK: [[REDUCTION_RESULT:%.*]] = loop.parallel +// CHECK: [[REDUCTION_RESULT:%.*]] = scf.parallel // CHECK-SAME: ([[IW:%.*]], [[JW:%.*]]) = ([[C0]], [[C0]]) // CHECK-SAME: to ([[C3]], [[C3]]) step ([[C1]], [[C1]]) // CHECK-SAME: init ([[INIT]]) -> f32 { @@ -177,15 +177,15 @@ func @reduce_window(%arg: memref<112x112xf32>, // CHECK: [[INDEX_J_FITS:%.*]] = cmpi "ult", [[INDEX_J]], [[C112]] // CHECK: [[IN_BOUNDS_1:%.*]] = and [[IN_BOUNDS_0]], [[INDEX_J_FITS]] -// CHECK: [[ELEM_TO_REDUCE:%.*]] = loop.if [[IN_BOUNDS_1]] -> (f32) { +// CHECK: [[ELEM_TO_REDUCE:%.*]] = scf.if [[IN_BOUNDS_1]] -> (f32) { // CHECK: [[OPERAND_ELEM:%.*]] = // CHECK-SAME: load [[OPERAND_BUF]]{{\[}}[[INDEX_I]], [[INDEX_J]]] -// CHECK: loop.yield [[OPERAND_ELEM]] : f32 +// CHECK: scf.yield [[OPERAND_ELEM]] : f32 // CHECK: } else { -// CHECK: loop.yield [[INIT]] : f32 +// CHECK: scf.yield [[INIT]] : f32 // CHECK: } -// CHECK: loop.reduce([[ELEM_TO_REDUCE]]) : f32 { +// CHECK: scf.reduce([[ELEM_TO_REDUCE]]) : f32 { // CHECK: ^bb0([[ELEM:%.*]]: f32, [[ACC:%.*]]: f32): // CHECK: [[ELEM_BUF:%.*]] = alloc() : memref // CHECK: [[ACC_BUF:%.*]] = alloc() : memref @@ -194,12 +194,12 @@ func @reduce_window(%arg: memref<112x112xf32>, // CHECK: store [[ACC]], [[ACC_BUF]][] : memref // CHECK: "xla_lhlo.maximum"([[ELEM_BUF]], [[ACC_BUF]], [[ACC_OUT_BUF]]) // CHECK: [[ACC_RESULT:%.*]] = load [[ACC_OUT_BUF]][] : memref -// CHECK: loop.reduce.return [[ACC_RESULT]] : f32 +// CHECK: scf.reduce.return [[ACC_RESULT]] : f32 // CHECK: } -// CHECK: loop.yield +// CHECK: scf.yield // CHECK: } // CHECK: store [[REDUCTION_RESULT]], [[RESULT_BUF]]{{\[}}[[I]], [[J]]] -// CHECK: loop.yield +// CHECK: scf.yield // CHECK: } // CHECK: return // CHECK: } diff --git a/tensorflow/compiler/mlir/xla/tests/lower-complex.mlir b/tensorflow/compiler/mlir/xla/tests/lower-complex.mlir index 35a5ae549d5..81376761467 100644 --- a/tensorflow/compiler/mlir/xla/tests/lower-complex.mlir +++ b/tensorflow/compiler/mlir/xla/tests/lower-complex.mlir @@ -1,4 +1,4 @@ -// RUN: xla-opt %s -test-xla-lower-complex | FileCheck %s +// RUN: xla-opt %s -test-xla-chlo-legalize-to-hlo -test-xla-lower-complex | FileCheck %s --dump-input-on-failure // CHECK-LABEL: @add func 
@add(%arg0 : tensor<2xf32>, %arg1 : tensor<2xf32>, %arg2 : tensor<2xf32>, %arg3 : tensor<2xf32>) -> (tensor<2xf32>, tensor<2xf32>) { @@ -15,21 +15,6 @@ func @add(%arg0 : tensor<2xf32>, %arg1 : tensor<2xf32>, %arg2 : tensor<2xf32>, % return %5, %6 : tensor<2xf32>, tensor<2xf32> } -// CHECK-LABEL: @add_broadcast -func @add_broadcast(%arg0 : tensor<1x2xf32>, %arg1 : tensor<1x2xf32>, %arg2 : tensor<2xf32>, %arg3 : tensor<2xf32>) -> (tensor<1x2xf32>, tensor<1x2xf32>) { - %2 = "xla_hlo.complex"(%arg0, %arg1) : (tensor<1x2xf32>, tensor<1x2xf32>) -> (tensor<1x2xcomplex>) - %3 = "xla_hlo.complex"(%arg2, %arg3) : (tensor<2xf32>, tensor<2xf32>) -> (tensor<2xcomplex>) - - // CHECK-DAG: [[VAL0:%.+]] = "xla_hlo.add"(%arg0, %arg2) {broadcast_dimensions = dense<1> : tensor<1xi64>} - // CHECK-DAG: [[VAL1:%.+]] = "xla_hlo.add"(%arg1, %arg3) {broadcast_dimensions = dense<1> : tensor<1xi64>} - %4 = "xla_hlo.add"(%2, %3) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1x2xcomplex>, tensor<2xcomplex>) -> (tensor<1x2xcomplex>) - %5 = "xla_hlo.real"(%4) : (tensor<1x2xcomplex>) -> (tensor<1x2xf32>) - %6 = "xla_hlo.imag"(%4) : (tensor<1x2xcomplex>) -> (tensor<1x2xf32>) - - // CHECK: return [[VAL0]], [[VAL1]] - return %5, %6 : tensor<1x2xf32>, tensor<1x2xf32> -} - // CHECK-LABEL: @add_unranked func @add_unranked(%arg0 : tensor<*xf32>, %arg1 : tensor<*xf32>, %arg2 : tensor<*xf32>, %arg3 : tensor<*xf32>) -> (tensor<*xf32>, tensor<*xf32>) { %2 = "xla_hlo.complex"(%arg0, %arg1) : (tensor<*xf32>, tensor<*xf32>) -> (tensor<*xcomplex>) @@ -60,21 +45,6 @@ func @sub(%arg0 : tensor<2xf32>, %arg1 : tensor<2xf32>, %arg2 : tensor<2xf32>, % return %5, %6 : tensor<2xf32>, tensor<2xf32> } -// CHECK-LABEL: @sub_broadcast -func @sub_broadcast(%arg0 : tensor<1x2xf32>, %arg1 : tensor<1x2xf32>, %arg2 : tensor<2xf32>, %arg3 : tensor<2xf32>) -> (tensor<1x2xf32>, tensor<1x2xf32>) { - %2 = "xla_hlo.complex"(%arg0, %arg1) : (tensor<1x2xf32>, tensor<1x2xf32>) -> (tensor<1x2xcomplex>) - %3 = "xla_hlo.complex"(%arg2, %arg3) : (tensor<2xf32>, tensor<2xf32>) -> (tensor<2xcomplex>) - - // CHECK-DAG: [[VAL0:%.+]] = "xla_hlo.subtract"(%arg0, %arg2) {broadcast_dimensions = dense<1> : tensor<1xi64>} - // CHECK-DAG: [[VAL1:%.+]] = "xla_hlo.subtract"(%arg1, %arg3) {broadcast_dimensions = dense<1> : tensor<1xi64>} - %4 = "xla_hlo.subtract"(%2, %3) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1x2xcomplex>, tensor<2xcomplex>) -> (tensor<1x2xcomplex>) - %5 = "xla_hlo.real"(%4) : (tensor<1x2xcomplex>) -> (tensor<1x2xf32>) - %6 = "xla_hlo.imag"(%4) : (tensor<1x2xcomplex>) -> (tensor<1x2xf32>) - - // CHECK: return [[VAL0]], [[VAL1]] - return %5, %6 : tensor<1x2xf32>, tensor<1x2xf32> -} - // CHECK-LABEL: @sub_unranked func @sub_unranked(%arg0 : tensor<*xf32>, %arg1 : tensor<*xf32>, %arg2 : tensor<*xf32>, %arg3 : tensor<*xf32>) -> (tensor<*xf32>, tensor<*xf32>) { %2 = "xla_hlo.complex"(%arg0, %arg1) : (tensor<*xf32>, tensor<*xf32>) -> (tensor<*xcomplex>) @@ -109,25 +79,6 @@ func @mul(%arg0 : tensor<2xf32>, %arg1 : tensor<2xf32>, %arg2 : tensor<2xf32>, % return %5, %6 : tensor<2xf32>, tensor<2xf32> } -// CHECK-LABEL: @mul_broadcast -func @mul_broadcast(%arg0 : tensor<1x2xf32>, %arg1 : tensor<1x2xf32>, %arg2 : tensor<2xf32>, %arg3 : tensor<2xf32>) -> (tensor<1x2xf32>, tensor<1x2xf32>) { - %2 = "xla_hlo.complex"(%arg0, %arg1) : (tensor<1x2xf32>, tensor<1x2xf32>) -> (tensor<1x2xcomplex>) - %3 = "xla_hlo.complex"(%arg2, %arg3) : (tensor<2xf32>, tensor<2xf32>) -> (tensor<2xcomplex>) - - // CHECK-DAG: [[VAL0:%.+]] = "xla_hlo.multiply"(%arg0, 
%arg2) {broadcast_dimensions = dense<1> : tensor<1xi64>} - // CHECK-DAG: [[VAL1:%.+]] = "xla_hlo.multiply"(%arg1, %arg3) {broadcast_dimensions = dense<1> : tensor<1xi64>} - // CHECK-DAG: [[VAL2:%.+]] = xla_hlo.subtract [[VAL0]], [[VAL1]] - // CHECK-DAG: [[VAL3:%.+]] = "xla_hlo.multiply"(%arg0, %arg3) {broadcast_dimensions = dense<1> : tensor<1xi64>} - // CHECK-DAG: [[VAL4:%.+]] = "xla_hlo.multiply"(%arg1, %arg2) {broadcast_dimensions = dense<1> : tensor<1xi64>} - // CHECK-DAG: [[VAL5:%.+]] = xla_hlo.add [[VAL3]], [[VAL4]] - %4 = "xla_hlo.multiply"(%2, %3) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1x2xcomplex>, tensor<2xcomplex>) -> (tensor<1x2xcomplex>) - %5 = "xla_hlo.real"(%4) : (tensor<1x2xcomplex>) -> (tensor<1x2xf32>) - %6 = "xla_hlo.imag"(%4) : (tensor<1x2xcomplex>) -> (tensor<1x2xf32>) - - // CHECK: return %2, %5 : tensor<1x2xf32>, tensor<1x2xf32> - return %5, %6 : tensor<1x2xf32>, tensor<1x2xf32> -} - // CHECK-LABEL: @mul_unranked func @mul_unranked(%arg0 : tensor<*xf32>, %arg1 : tensor<*xf32>, %arg2 : tensor<*xf32>, %arg3 : tensor<*xf32>) -> (tensor<*xf32>, tensor<*xf32>) { %2 = "xla_hlo.complex"(%arg0, %arg1) : (tensor<*xf32>, tensor<*xf32>) -> (tensor<*xcomplex>) @@ -186,45 +137,6 @@ func @div(%arg0 : tensor<2xf32>, %arg1 : tensor<2xf32>, %arg2 : tensor<2xf32>, % // ----- -// CHECK-LABEL: @div_broadcast -func @div_broadcast(%arg0 : tensor<1x2xf32>, %arg1 : tensor<1x2xf32>, %arg2 : tensor<2xf32>, %arg3 : tensor<2xf32>) -> (tensor<1x2xf32>, tensor<1x2xf32>) { - %2 = "xla_hlo.complex"(%arg0, %arg1) : (tensor<1x2xf32>, tensor<1x2xf32>) -> (tensor<1x2xcomplex>) - %3 = "xla_hlo.complex"(%arg2, %arg3) : (tensor<2xf32>, tensor<2xf32>) -> (tensor<2xcomplex>) - - // CHECK-DAG: [[VAL0:%.+]] = "xla_hlo.negate"(%arg3) - - // Compute the numerator's real component: - // numerator.real = lhs.real * rhs.real lhs.imag * rhs.imag - // CHECK-DAG: [[VAL1:%.+]] = "xla_hlo.multiply"(%arg0, %arg2) {broadcast_dimensions = dense<1> : tensor<1xi64>} - // CHECK-DAG: [[VAL2:%.+]] = "xla_hlo.multiply"(%arg1, [[VAL0]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} - // CHECK-DAG: [[VAL3:%.+]] = xla_hlo.subtract [[VAL1]], [[VAL2]] - - // Compute the real valued denominator as rhs * con(rhs): - // denominator = rhs.real * rhs.real + rhs.imag * rhs.imag - // CHECK-DAG: [[VAL4:%.+]] = xla_hlo.multiply %arg2, %arg2 - // CHECK-DAG: [[VAL5:%.+]] = xla_hlo.multiply %arg3, [[VAL0]] - // CHECK-DAG: [[VAL6:%.+]] = xla_hlo.subtract [[VAL4]], [[VAL5]] - - // Compute the numerator's imaginary component: - // numerator.imag = lhs.imag * rhs.real - lhs.real * rhs.imag - // CHECK-DAG: [[VAL7:%.+]] = "xla_hlo.multiply"(%arg1, %arg2) - // CHECK-DAG: [[VAL8:%.+]] = "xla_hlo.multiply"(%arg0, [[VAL0]]) - // CHECK-DAG: [[VAL9:%.+]] = xla_hlo.add [[VAL8]], [[VAL7]] - - // Divide the numerator by the real valued denominator. 
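// result.real = numerator.real / denominator, result.imag = numerator.imag / denominator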
- // CHECK-DAG: [[VAL10:%.+]] = "xla_hlo.divide"([[VAL3]], [[VAL6]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} - // CHECK-DAG: [[VAL11:%.+]] = "xla_hlo.divide"([[VAL9]], [[VAL6]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} - %4 = "xla_hlo.divide"(%2, %3) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1x2xcomplex>, tensor<2xcomplex>) -> (tensor<1x2xcomplex>) - - %5 = "xla_hlo.real"(%4) : (tensor<1x2xcomplex>) -> (tensor<1x2xf32>) - %6 = "xla_hlo.imag"(%4) : (tensor<1x2xcomplex>) -> (tensor<1x2xf32>) - - // CHECK: return [[VAL10]], [[VAL11]] - return %5, %6 : tensor<1x2xf32>, tensor<1x2xf32> -} - -// ----- - // CHECK-LABEL: @div_unranked func @div_unranked(%arg0 : tensor<*xf32>, %arg1 : tensor<*xf32>, %arg2 : tensor<*xf32>, %arg3 : tensor<*xf32>) -> (tensor<*xf32>, tensor<*xf32>) { %2 = "xla_hlo.complex"(%arg0, %arg1) : (tensor<*xf32>, tensor<*xf32>) -> (tensor<*xcomplex>) diff --git a/tensorflow/compiler/mlir/xla/tests/materialize-broadcasts.mlir b/tensorflow/compiler/mlir/xla/tests/materialize-broadcasts.mlir index fde5c12c1c6..55b55c7b4e2 100644 --- a/tensorflow/compiler/mlir/xla/tests/materialize-broadcasts.mlir +++ b/tensorflow/compiler/mlir/xla/tests/materialize-broadcasts.mlir @@ -1,273 +1,11 @@ // RUN: xla-opt -test-xla-materialize-broadcasts -split-input-file %s -o - | FileCheck --dump-input=fail %s -// CHECK-LABEL: @addBroadcastRhs -func @addBroadcastRhs(%arg0: tensor<1x4xf32>, %arg1: tensor<4xf32>) -> tensor<1x4xf32> { - // CHECK-NEXT: %[[BROADCAST0:.*]] = "xla_hlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x4xf32>) -> tensor<1x4xf32> - // CHECK-NEXT: %[[BROADCAST1:.*]] = "xla_hlo.broadcast_in_dim"(%arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<4xf32>) -> tensor<1x4xf32> - // CHECK-NEXT: %[[RESULT:.*]] = xla_hlo.add %[[BROADCAST0]], %[[BROADCAST1]] : tensor<1x4xf32> - %0 = "xla_hlo.add"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1x4xf32>, tensor<4xf32>) -> tensor<1x4xf32> - return %0 : tensor<1x4xf32> -} - -// ----- - -// CHECK-LABEL: @addBroadcastLhs -func @addBroadcastLhs(%arg0: tensor<4xf32>, %arg1: tensor<1x4xf32>) -> tensor<1x4xf32> { - // CHECK-NEXT: %[[BROADCAST0:.*]] = "xla_hlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<4xf32>) -> tensor<1x4xf32> - // CHECK-NEXT: %[[BROADCAST1:.*]] = "xla_hlo.broadcast_in_dim"(%arg1) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x4xf32>) -> tensor<1x4xf32> - // CHECK-NEXT: %[[RESULT:.*]] = xla_hlo.add %[[BROADCAST0]], %[[BROADCAST1]] : tensor<1x4xf32> - %0 = "xla_hlo.add"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<4xf32>, tensor<1x4xf32>) -> tensor<1x4xf32> - return %0 : tensor<1x4xf32> -} - -// ----- - -// CHECK-LABEL: @addBroadcastMultidimension -func @addBroadcastMultidimension(%arg0: tensor<1x1xf32>, %arg1: tensor<1x1x4xf32>) -> tensor<1x1x4xf32> { - // CHECK-NEXT: %[[BROADCAST0:.*]] = "xla_hlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x1xf32>) -> tensor<1x1x4xf32> - // CHECK-NEXT: %[[BROADCAST1:.*]] = "xla_hlo.broadcast_in_dim"(%arg1) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<1x1x4xf32>) -> tensor<1x1x4xf32> - // CHECK-NEXT: %[[RESULT:.*]] = xla_hlo.add %[[BROADCAST0]], %[[BROADCAST1]] : tensor<1x1x4xf32> - %0 = "xla_hlo.add"(%arg0, %arg1) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x1xf32>, tensor<1x1x4xf32>) -> 
tensor<1x1x4xf32> - return %0 : tensor<1x1x4xf32> -} - -// ----- - -// CHECK-LABEL: @addBroadcastBothArgs -func @addBroadcastBothArgs(%arg0: tensor<1x2xf32>, %arg1: tensor<3x2x1xf32>) -> tensor<3x2x2xf32> { - // CHECK-NEXT: %[[BROADCAST0:.*]] = "xla_hlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[1, 2]> : tensor<2xi64>} : (tensor<1x2xf32>) -> tensor<3x2x2xf32> - // CHECK-NEXT: %[[BROADCAST1:.*]] = "xla_hlo.broadcast_in_dim"(%arg1) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<3x2x1xf32>) -> tensor<3x2x2xf32> - // CHECK-NEXT: %[[RESULT:.*]] = xla_hlo.add %[[BROADCAST0]], %[[BROADCAST1]] : tensor<3x2x2xf32> - %0 = "xla_hlo.add"(%arg0, %arg1) {broadcast_dimensions = dense<[1, 2]> : tensor<2xi64>} : (tensor<1x2xf32>, tensor<3x2x1xf32>) -> tensor<3x2x2xf32> - return %0 : tensor<3x2x2xf32> -} - -// ----- - -// CHECK-LABEL: @addBroadcastScalar -func @addBroadcastScalar(%arg0: tensor<4xf32>, %arg1: tensor) -> tensor<4xf32> { - // CHECK-NEXT: %[[BROADCAST0:.*]] = "xla_hlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<4xf32>) -> tensor<4xf32> - // CHECK-NEXT: %[[BROADCAST1:.*]] = "xla_hlo.broadcast_in_dim"(%arg1) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor) -> tensor<4xf32> - // CHECK-NEXT: %[[RESULT:.*]] = xla_hlo.add %[[BROADCAST0]], %[[BROADCAST1]] : tensor<4xf32> - %0 = "xla_hlo.add"(%arg0, %arg1) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor<4xf32>, tensor) -> tensor<4xf32> +// CHECK-LABEL: @clampBroadcast +// CHECK-SAME: (%[[MIN:.+]]: tensor, %[[VAL:.+]]: tensor<4xf32>, %[[MAX:.+]]: tensor) +func @clampBroadcast(%min: tensor, %value: tensor<4xf32>, %max: tensor) -> tensor<4xf32> { + // CHECK-DAG: %[[MIN_BC:.+]] = "xla_hlo.broadcast"(%[[MIN]]) {broadcast_sizes = dense<4> : tensor<1xi64>} : (tensor) -> tensor<4xf32> + // CHECK-DAG: %[[MAX_BC:.+]] = "xla_hlo.broadcast"(%[[MAX]]) {broadcast_sizes = dense<4> : tensor<1xi64>} : (tensor) -> tensor<4xf32> + // CHECK: "xla_hlo.clamp"(%[[MIN_BC]], %[[VAL]], %[[MAX_BC]]) : (tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> + %0 = "xla_hlo.clamp"(%min, %value, %max) : (tensor, tensor<4xf32>, tensor) -> tensor<4xf32> return %0 : tensor<4xf32> } - -// ----- - -// CHECK-LABEL: @addWithoutBroadcast -func @addWithoutBroadcast(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { - // CHECK-NEXT: %[[RESULT:.*]] = xla_hlo.add %arg0, %arg1 : tensor<4xf32> - %0 = "xla_hlo.add"(%arg0, %arg1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> - return %0 : tensor<4xf32> -} - -// ----- - -// CHECK-LABEL: @addUnranked -func @addUnranked(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>) -> tensor<*xf32> { - // CHECK-NEXT: %[[RESULT:.*]] = xla_hlo.add %arg0, %arg1 : tensor<*xf32> - %0 = "xla_hlo.add"(%arg0, %arg1) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> - return %0 : tensor<*xf32> -} - -// ----- - -// CHECK-LABEL: @atan2BroadcastRhs -func @atan2BroadcastRhs(%arg0: tensor<1x4xf32>, %arg1: tensor<4xf32>) -> tensor<1x4xf32> { - // CHECK-NEXT: %[[BROADCAST0:.*]] = "xla_hlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x4xf32>) -> tensor<1x4xf32> - // CHECK-NEXT: %[[BROADCAST1:.*]] = "xla_hlo.broadcast_in_dim"(%arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<4xf32>) -> tensor<1x4xf32> - // CHECK-NEXT: %[[RESULT:.*]] = xla_hlo.atan2 %[[BROADCAST0]], %[[BROADCAST1]] : tensor<1x4xf32> - %0 = "xla_hlo.atan2"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1x4xf32>, 
tensor<4xf32>) -> tensor<1x4xf32> - return %0 : tensor<1x4xf32> -} - -// ----- - -// CHECK-LABEL: @divBroadcastRhs -func @divBroadcastRhs(%arg0: tensor<1x4xf32>, %arg1: tensor<4xf32>) -> tensor<1x4xf32> { - // CHECK-NEXT: %[[BROADCAST0:.*]] = "xla_hlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x4xf32>) -> tensor<1x4xf32> - // CHECK-NEXT: %[[BROADCAST1:.*]] = "xla_hlo.broadcast_in_dim"(%arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<4xf32>) -> tensor<1x4xf32> - // CHECK-NEXT: %[[RESULT:.*]] = xla_hlo.divide %[[BROADCAST0]], %[[BROADCAST1]] : tensor<1x4xf32> - %0 = "xla_hlo.divide"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1x4xf32>, tensor<4xf32>) -> tensor<1x4xf32> - return %0 : tensor<1x4xf32> -} - -// ----- - -// CHECK-LABEL: @maxBroadcastRhs -func @maxBroadcastRhs(%arg0: tensor<1x4xf32>, %arg1: tensor<4xf32>) -> tensor<1x4xf32> { - // CHECK-NEXT: %[[BROADCAST0:.*]] = "xla_hlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x4xf32>) -> tensor<1x4xf32> - // CHECK-NEXT: %[[BROADCAST1:.*]] = "xla_hlo.broadcast_in_dim"(%arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<4xf32>) -> tensor<1x4xf32> - // CHECK-NEXT: %[[RESULT:.*]] = xla_hlo.maximum %[[BROADCAST0]], %[[BROADCAST1]] : tensor<1x4xf32> - %0 = "xla_hlo.maximum"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1x4xf32>, tensor<4xf32>) -> tensor<1x4xf32> - return %0 : tensor<1x4xf32> -} - -// ----- - -// CHECK-LABEL: @minBroadcastRhs -func @minBroadcastRhs(%arg0: tensor<1x4xf32>, %arg1: tensor<4xf32>) -> tensor<1x4xf32> { - // CHECK-NEXT: %[[BROADCAST0:.*]] = "xla_hlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x4xf32>) -> tensor<1x4xf32> - // CHECK-NEXT: %[[BROADCAST1:.*]] = "xla_hlo.broadcast_in_dim"(%arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<4xf32>) -> tensor<1x4xf32> - // CHECK-NEXT: %[[RESULT:.*]] = xla_hlo.minimum %[[BROADCAST0]], %[[BROADCAST1]] : tensor<1x4xf32> - %0 = "xla_hlo.minimum"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1x4xf32>, tensor<4xf32>) -> tensor<1x4xf32> - return %0 : tensor<1x4xf32> -} - -// ----- - -// CHECK-LABEL: @mulBroadcastRhs -func @mulBroadcastRhs(%arg0: tensor<1x4xf32>, %arg1: tensor<4xf32>) -> tensor<1x4xf32> { - // CHECK-NEXT: %[[BROADCAST0:.*]] = "xla_hlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x4xf32>) -> tensor<1x4xf32> - // CHECK-NEXT: %[[BROADCAST1:.*]] = "xla_hlo.broadcast_in_dim"(%arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<4xf32>) -> tensor<1x4xf32> - // CHECK-NEXT: %[[RESULT:.*]] = xla_hlo.multiply %[[BROADCAST0]], %[[BROADCAST1]] : tensor<1x4xf32> - %0 = "xla_hlo.multiply"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1x4xf32>, tensor<4xf32>) -> tensor<1x4xf32> - return %0 : tensor<1x4xf32> -} - -// ----- - -// CHECK-LABEL: @powBroadcastRhs -func @powBroadcastRhs(%arg0: tensor<1x4xf32>, %arg1: tensor<4xf32>) -> tensor<1x4xf32> { - // CHECK-NEXT: %[[BROADCAST0:.*]] = "xla_hlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x4xf32>) -> tensor<1x4xf32> - // CHECK-NEXT: %[[BROADCAST1:.*]] = "xla_hlo.broadcast_in_dim"(%arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<4xf32>) -> tensor<1x4xf32> - // CHECK-NEXT: %[[RESULT:.*]] = xla_hlo.power %[[BROADCAST0]], 
%[[BROADCAST1]] : tensor<1x4xf32> - %0 = "xla_hlo.power"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1x4xf32>, tensor<4xf32>) -> tensor<1x4xf32> - return %0 : tensor<1x4xf32> -} - -// ----- - -// CHECK-LABEL: @remainderBroadcastRhs -func @remainderBroadcastRhs(%arg0: tensor<1x4xf32>, %arg1: tensor<4xf32>) -> tensor<1x4xf32> { - // CHECK-NEXT: %[[BROADCAST0:.*]] = "xla_hlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x4xf32>) -> tensor<1x4xf32> - // CHECK-NEXT: %[[BROADCAST1:.*]] = "xla_hlo.broadcast_in_dim"(%arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<4xf32>) -> tensor<1x4xf32> - // CHECK-NEXT: %[[RESULT:.*]] = xla_hlo.remainder %[[BROADCAST0]], %[[BROADCAST1]] : tensor<1x4xf32> - %0 = "xla_hlo.remainder"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1x4xf32>, tensor<4xf32>) -> tensor<1x4xf32> - return %0 : tensor<1x4xf32> -} - -// ----- - -// CHECK-LABEL: @shiftLeftBroadcastRhs -func @shiftLeftBroadcastRhs(%arg0: tensor<1x4xf32>, %arg1: tensor<4xf32>) -> tensor<1x4xf32> { - // CHECK-NEXT: %[[BROADCAST0:.*]] = "xla_hlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x4xf32>) -> tensor<1x4xf32> - // CHECK-NEXT: %[[BROADCAST1:.*]] = "xla_hlo.broadcast_in_dim"(%arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<4xf32>) -> tensor<1x4xf32> - // CHECK-NEXT: %[[RESULT:.*]] = xla_hlo.shift_left %[[BROADCAST0]], %[[BROADCAST1]] : tensor<1x4xf32> - %0 = "xla_hlo.shift_left"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1x4xf32>, tensor<4xf32>) -> tensor<1x4xf32> - return %0 : tensor<1x4xf32> -} - -// ----- - -// CHECK-LABEL: @shiftRightArithmeticBroadcastRhs -func @shiftRightArithmeticBroadcastRhs(%arg0: tensor<1x4xf32>, %arg1: tensor<4xf32>) -> tensor<1x4xf32> { - // CHECK-NEXT: %[[BROADCAST0:.*]] = "xla_hlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x4xf32>) -> tensor<1x4xf32> - // CHECK-NEXT: %[[BROADCAST1:.*]] = "xla_hlo.broadcast_in_dim"(%arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<4xf32>) -> tensor<1x4xf32> - // CHECK-NEXT: %[[RESULT:.*]] = xla_hlo.shift_right_arithmetic %[[BROADCAST0]], %[[BROADCAST1]] : tensor<1x4xf32> - %0 = "xla_hlo.shift_right_arithmetic"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1x4xf32>, tensor<4xf32>) -> tensor<1x4xf32> - return %0 : tensor<1x4xf32> -} - -// ----- - -// CHECK-LABEL: @shiftRightLogicalBroadcastRhs -func @shiftRightLogicalBroadcastRhs(%arg0: tensor<1x4xf32>, %arg1: tensor<4xf32>) -> tensor<1x4xf32> { - // CHECK-NEXT: %[[BROADCAST0:.*]] = "xla_hlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x4xf32>) -> tensor<1x4xf32> - // CHECK-NEXT: %[[BROADCAST1:.*]] = "xla_hlo.broadcast_in_dim"(%arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<4xf32>) -> tensor<1x4xf32> - // CHECK-NEXT: %[[RESULT:.*]] = xla_hlo.shift_right_logical %[[BROADCAST0]], %[[BROADCAST1]] : tensor<1x4xf32> - %0 = "xla_hlo.shift_right_logical"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1x4xf32>, tensor<4xf32>) -> tensor<1x4xf32> - return %0 : tensor<1x4xf32> -} - -// ----- - -// CHECK-LABEL: @subBroadcastRhs -func @subBroadcastRhs(%arg0: tensor<1x4xf32>, %arg1: tensor<4xf32>) -> tensor<1x4xf32> { - // CHECK-NEXT: %[[BROADCAST0:.*]] = "xla_hlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = 
dense<[0, 1]> : tensor<2xi64>} : (tensor<1x4xf32>) -> tensor<1x4xf32> - // CHECK-NEXT: %[[BROADCAST1:.*]] = "xla_hlo.broadcast_in_dim"(%arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<4xf32>) -> tensor<1x4xf32> - // CHECK-NEXT: %[[RESULT:.*]] = xla_hlo.subtract %[[BROADCAST0]], %[[BROADCAST1]] : tensor<1x4xf32> - %0 = "xla_hlo.subtract"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1x4xf32>, tensor<4xf32>) -> tensor<1x4xf32> - return %0 : tensor<1x4xf32> -} - -// ----- - -// CHECK-LABEL: @andBroadcastRhs -func @andBroadcastRhs(%arg0: tensor<1x4xi32>, %arg1: tensor<4xi32>) -> tensor<1x4xi32> { - // CHECK-NEXT: %[[BROADCAST0:.*]] = "xla_hlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x4xi32>) -> tensor<1x4xi32> - // CHECK-NEXT: %[[BROADCAST1:.*]] = "xla_hlo.broadcast_in_dim"(%arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<4xi32>) -> tensor<1x4xi32> - // CHECK-NEXT: %[[RESULT:.*]] = xla_hlo.and %[[BROADCAST0]], %[[BROADCAST1]] : tensor<1x4xi32> - %0 = "xla_hlo.and"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1x4xi32>, tensor<4xi32>) -> tensor<1x4xi32> - return %0 : tensor<1x4xi32> -} - -// ----- - -// CHECK-LABEL: @orBroadcastRhs -func @orBroadcastRhs(%arg0: tensor<1x4xi32>, %arg1: tensor<4xi32>) -> tensor<1x4xi32> { - // CHECK-NEXT: %[[BROADCAST0:.*]] = "xla_hlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x4xi32>) -> tensor<1x4xi32> - // CHECK-NEXT: %[[BROADCAST1:.*]] = "xla_hlo.broadcast_in_dim"(%arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<4xi32>) -> tensor<1x4xi32> - // CHECK-NEXT: %[[RESULT:.*]] = xla_hlo.or %[[BROADCAST0]], %[[BROADCAST1]] : tensor<1x4xi32> - %0 = "xla_hlo.or"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1x4xi32>, tensor<4xi32>) -> tensor<1x4xi32> - return %0 : tensor<1x4xi32> -} - -// ----- - -// CHECK-LABEL: @xorBroadcastRhs -func @xorBroadcastRhs(%arg0: tensor<1x4xi32>, %arg1: tensor<4xi32>) -> tensor<1x4xi32> { - // CHECK-NEXT: %[[BROADCAST0:.*]] = "xla_hlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x4xi32>) -> tensor<1x4xi32> - // CHECK-NEXT: %[[BROADCAST1:.*]] = "xla_hlo.broadcast_in_dim"(%arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<4xi32>) -> tensor<1x4xi32> - // CHECK-NEXT: %[[RESULT:.*]] = xla_hlo.xor %[[BROADCAST0]], %[[BROADCAST1]] : tensor<1x4xi32> - %0 = "xla_hlo.xor"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1x4xi32>, tensor<4xi32>) -> tensor<1x4xi32> - return %0 : tensor<1x4xi32> -} - -// ----- - -// CHECK-LABEL: @compareBroadcastRhs -func @compareBroadcastRhs(%arg0: tensor<1x4xf32>, %arg1: tensor<4xf32>) -> tensor<1x4xi1> { - // CHECK-NEXT: %[[BROADCAST0:.*]] = "xla_hlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x4xf32>) -> tensor<1x4xf32> - // CHECK-NEXT: %[[BROADCAST1:.*]] = "xla_hlo.broadcast_in_dim"(%arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<4xf32>) -> tensor<1x4xf32> - // CHECK-NEXT: %[[RESULT:.*]] = "xla_hlo.compare"(%[[BROADCAST0]], %[[BROADCAST1]]) {comparison_direction = "NE"} : (tensor<1x4xf32>, tensor<1x4xf32>) -> tensor<1x4xi1> - %0 = "xla_hlo.compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "NE"} : (tensor<1x4xf32>, tensor<4xf32>) -> tensor<1x4xi1> - return %0 : tensor<1x4xi1> -} - -// 
----- - -// CHECK-LABEL: @dynamicBroadcastAdd -func @dynamicBroadcastAdd(%arg0: tensor, %arg1: tensor) -> tensor { - // CHECK-NEXT: %[[DIM0:.*]] = dim %arg0, 0 : tensor - // CHECK-NEXT: %[[DIM0C:.*]] = index_cast %[[DIM0]] : index to i32 - // CHECK-NEXT: %c1 = constant 1 : index - // CHECK-NEXT: %[[DIM1_0:.*]] = dim %arg0, 1 : tensor - // CHECK-NEXT: %[[DIM1_1:.*]] = dim %arg1, 0 : tensor - // CHECK-NEXT: %[[CMPI:.*]] = cmpi "eq", %[[DIM1_0]], %c1 : index - // CHECK-NEXT: %[[SEL:.*]] = select %[[CMPI]], %[[DIM1_0]], %[[DIM1_1]] : index - // CHECK-NEXT: %[[DIM1C:.*]] = index_cast %[[SEL]] : index to i32 - // CHECK-NEXT: %[[SHAPE:.*]] = "xla_hlo.scalars_to_dimension_tensor"(%[[DIM0C]], %[[DIM1C]]) : (i32, i32) -> tensor<2xi32> - // CHECK-NEXT: %[[BROADCAST0:.*]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, %[[SHAPE]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor, tensor<2xi32>) -> tensor - // CHECK-NEXT: %[[BROADCAST1:.*]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[SHAPE]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<2xi32>) -> tensor - // CHECK-NEXT: xla_hlo.add %[[BROADCAST0]], %[[BROADCAST1]] : tensor - %0 = "xla_hlo.add"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor) -> tensor - return %0 : tensor -} - -// ----- - -// CHECK-LABEL: @dynamicBroadcastAddScalar -func @dynamicBroadcastAddScalar(%arg0: tensor, %arg1: tensor) -> tensor { - // CHECK-NEXT: %[[DIM0:.*]] = dim %arg0, 0 : tensor - // CHECK-NEXT: %[[DIM0C:.*]] = index_cast %[[DIM0]] : index to i32 - // CHECK-NEXT: %[[DIM1:.*]] = dim %arg0, 1 : tensor - // CHECK-NEXT: %[[DIM1C:.*]] = index_cast %[[DIM1]] : index to i32 - // CHECK-NEXT: %[[SHAPE:.*]] = "xla_hlo.scalars_to_dimension_tensor"(%[[DIM0C]], %[[DIM1C]]) : (i32, i32) -> tensor<2xi32> - // CHECK-NEXT: %[[BROADCAST0:.*]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, %[[SHAPE]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor, tensor<2xi32>) -> tensor - // CHECK-NEXT: %[[BROADCAST1:.*]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[SHAPE]]) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor, tensor<2xi32>) -> tensor - // CHECK-NEXT: xla_hlo.add %[[BROADCAST0]], %[[BROADCAST1]] : tensor - %0 = "xla_hlo.add"(%arg0, %arg1) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor, tensor) -> tensor - return %0 : tensor -} diff --git a/tensorflow/compiler/mlir/xla/tests/mlir_hlo_builder_test.cc b/tensorflow/compiler/mlir/xla/tests/mlir_hlo_builder_test.cc new file mode 100644 index 00000000000..54791e15cf4 --- /dev/null +++ b/tensorflow/compiler/mlir/xla/tests/mlir_hlo_builder_test.cc @@ -0,0 +1,179 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.h" + +#include <string> + +#include "llvm/Support/raw_ostream.h" +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/Dialect.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Module.h" // from @llvm-project +#include "tensorflow/compiler/mlir/xla/ir/hlo_ops.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/test.h" + +namespace xla { + +namespace { + +static void ExpectHasSubstr(absl::string_view s, absl::string_view expected) { + EXPECT_TRUE(absl::StrContains(s, expected)) + << s << " does not contain " << expected; +} + +class XlaBuilderTest : public ::testing::Test { + protected: + XlaBuilderTest() + : name_(SetupTest()), + context_(), + module_(mlir::ModuleOp::create(mlir::UnknownLoc::get(&context_))), + builder_(&module_->getBodyRegion()), + xla_builder_(name_, builder_, module_->getLoc()) {} + + string SetupTest() { + mlir::registerDialect<mlir::xla_hlo::XlaHloDialect>(); + return ::testing::UnitTest::GetInstance()->current_test_info()->name(); + } + + // Returns the MLIR op string representation of the given XlaOp. + string GetMlirOpString(XlaOp xla_op) { + string str; + llvm::raw_string_ostream ostream{str}; + xla_builder_.GetValue(xla_op).print(ostream); + ostream.flush(); + return str; + } + + string name_; + mlir::MLIRContext context_; + mlir::OwningModuleRef module_; + mlir::OpBuilder builder_; + MlirHloBuilder xla_builder_; +}; + +TEST_F(XlaBuilderTest, CreateToken) { + auto token = CreateToken(&xla_builder_); + auto str = GetMlirOpString(token); + + TF_ASSERT_OK(xla_builder_.GetCurrentStatus()); + + ExpectHasSubstr(GetMlirOpString(token), + R"("xla_hlo.create_token"() : () -> !xla_hlo.token)"); +} + +TEST_F(XlaBuilderTest, Infeed) { + auto token = CreateToken(&xla_builder_); + auto infeed = InfeedWithToken(token, ShapeUtil::MakeShape(F32, {4, 8}), ""); + + TF_ASSERT_OK(xla_builder_.GetCurrentStatus()); + ExpectHasSubstr( + GetMlirOpString(infeed), + R"("xla_hlo.infeed"(%0) {infeed_config = ""} : (!xla_hlo.token) -> tuple<tensor<4x8xf32>, !xla_hlo.token>)"); +} + +TEST_F(XlaBuilderTest, Outfeed) { + auto outfeed_shape = ShapeUtil::MakeShape(F32, {4, 8}); + auto data = ConstantLiteral( + &xla_builder_, + LiteralUtil::CreateFromDimensions(F32, outfeed_shape.dimensions())); + auto token = CreateToken(&xla_builder_); + auto outfeed = OutfeedWithToken(data, token, outfeed_shape, ""); + + TF_ASSERT_OK(xla_builder_.GetCurrentStatus()); + ExpectHasSubstr( + GetMlirOpString(outfeed), + R"("xla_hlo.outfeed"(%0, %1) {outfeed_config = ""} : (tensor<4x8xf32>, !xla_hlo.token) -> !xla_hlo.token)"); +} + +TEST_F(XlaBuilderTest, ConcatInDim) { + auto data0 = ConstantLiteral( + &xla_builder_, LiteralUtil::CreateFromDimensions(F32, {2, 4, 5})); + auto data1 = ConstantLiteral( + &xla_builder_, LiteralUtil::CreateFromDimensions(F32, {2, 6, 5})); + auto concat = ConcatInDim(&xla_builder_, {data0, data1}, 1); + + TF_ASSERT_OK(xla_builder_.GetCurrentStatus()); + ExpectHasSubstr( + GetMlirOpString(concat), + R"("xla_hlo.concatenate"(%0, %1) {dimension = 1 : i64} : (tensor<2x4x5xf32>, tensor<2x6x5xf32>) -> tensor<2x10x5xf32>)"); +} + +TEST_F(XlaBuilderTest, Tuple) { + auto data0 = ConstantLiteral(&xla_builder_,
LiteralUtil::CreateFromDimensions(F32, {3, 7})); + auto data1 = ConstantLiteral(&xla_builder_, + LiteralUtil::CreateFromDimensions(F32, {})); + auto tuple = Tuple(&xla_builder_, {data0, data1}); + + TF_ASSERT_OK(xla_builder_.GetCurrentStatus()); + ExpectHasSubstr( + GetMlirOpString(tuple), + R"("xla_hlo.tuple"(%0, %1) : (tensor<3x7xf32>, tensor<f32>) -> tuple<tensor<3x7xf32>, tensor<f32>>)"); +} + +TEST_F(XlaBuilderTest, GetTupleElement) { + auto data0 = ConstantLiteral(&xla_builder_, + LiteralUtil::CreateFromDimensions(F32, {3, 7})); + auto data1 = ConstantLiteral(&xla_builder_, + LiteralUtil::CreateFromDimensions(F32, {})); + auto tuple_data = Tuple(&xla_builder_, {data0, data1}); + auto gte = GetTupleElement(tuple_data, 1); + + TF_ASSERT_OK(xla_builder_.GetCurrentStatus()); + ExpectHasSubstr( + GetMlirOpString(gte), + R"("xla_hlo.get_tuple_element"(%2) {index = 1 : i32} : (tuple<tensor<3x7xf32>, tensor<f32>>) -> tensor<f32>)"); +} + +TEST_F(XlaBuilderTest, Slice) { + auto data = ConstantLiteral(&xla_builder_, + LiteralUtil::CreateFromDimensions(F32, {3, 7})); + auto slice = Slice(data, {0, 1}, {2, 5}, {1, 1}); + + TF_ASSERT_OK(xla_builder_.GetCurrentStatus()); + ExpectHasSubstr( + GetMlirOpString(slice), + R"("xla_hlo.slice"(%0) {limit_indices = dense<[2, 5]> : tensor<2xi64>, start_indices = dense<[0, 1]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<3x7xf32>) -> tensor<2x4xf32>)"); +} + +TEST_F(XlaBuilderTest, Pad) { + auto data = ConstantLiteral(&xla_builder_, + LiteralUtil::CreateFromDimensions(F32, {3, 7})); + auto zero = ConstantLiteral(&xla_builder_, LiteralUtil::Zero(F32)); + + PaddingConfig padding_config; + auto* dims0 = padding_config.add_dimensions(); + dims0->set_edge_padding_low(1); + dims0->set_interior_padding(0); + dims0->set_edge_padding_high(2); + auto* dims1 = padding_config.add_dimensions(); + dims1->set_edge_padding_low(3); + dims1->set_interior_padding(1); + dims1->set_edge_padding_high(0); + auto pad = Pad(data, zero, padding_config); + + TF_ASSERT_OK(xla_builder_.GetCurrentStatus()); + ExpectHasSubstr( + GetMlirOpString(pad), + R"("xla_hlo.pad"(%0, %1) {edge_padding_high = dense<[2, 0]> : tensor<2xi64>, edge_padding_low = dense<[1, 3]> : tensor<2xi64>, interior_padding = dense<[0, 1]> : tensor<2xi64>} : (tensor<3x7xf32>, tensor<f32>) -> tensor<6x16xf32>)"); +} + +} // namespace +} // namespace xla diff --git a/tensorflow/compiler/mlir/xla/tests/ops.mlir b/tensorflow/compiler/mlir/xla/tests/ops.mlir index aa38ccd3c30..f09ec62c8dc 100644 --- a/tensorflow/compiler/mlir/xla/tests/ops.mlir +++ b/tensorflow/compiler/mlir/xla/tests/ops.mlir @@ -446,7 +446,7 @@ func @recv_non_token_second_result(%token: !xla_hlo.token) -> tuple<tensor<3x4xi32>, !xla_hlo.token> func @rng_uniform_invalid_type(%mu: tensor<complex<f32>>, %sigma: tensor<f32>) -> tensor<2x3x5xf32> { %shape = xla_hlo.constant dense<[2, 3, 5]> : tensor<3xi64> - // expected-error@+1 {{must be tensor of pred (AKA boolean or 1-bit integer) or 8/16/32/64-bit signless integer or floating-point values, but got 'tensor<complex<f32>>'}} + // expected-error@+1 {{but got 'tensor<complex<f32>>'}} %0 = "xla_hlo.rng_uniform"(%mu, %sigma, %shape) : (tensor<complex<f32>>, tensor<f32>, tensor<3xi64>) -> tensor<2x3x5xf32> return %0 : tensor<2x3x5xf32> } @@ -461,6 +461,14 @@ func @scalars_to_dimension_tensor(%arg0: i32, %arg1: i32) -> tensor<2xi32> { // ----- +// CHECK-LABEL: @scalars_to_dimension_tensor_index +func @scalars_to_dimension_tensor_index(%arg0: index, %arg1: index) -> tensor<2xindex> { + %0 = "xla_hlo.scalars_to_dimension_tensor"(%arg0, %arg1) : (index, index) -> tensor<2xindex> + return %0 : tensor<2xindex> +} + +// ----- + // CHECK-LABEL: func @select func @select(%arg0: tensor<2x3xi1>,
%arg1: tensor<2x3xi32>, %arg2: tensor<2x3xi32>) -> tensor<2x3xi32> { %0 = "xla_hlo.select"(%arg0, %arg1, %arg2) : (tensor<2x3xi1>, tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> @@ -551,37 +559,61 @@ func @slice_operand_result_mismatch(%arg0: tensor<3x4xi32>) -> tensor<1x4xf32> { // ----- // CHECK-LABEL: func @dynamic_slice -func @dynamic_slice(%arg0: tensor<3x4xi32>, %arg1: tensor<2xi64>) -> tensor<1x4xi32> { - %0 = "xla_hlo.dynamic-slice"(%arg0, %arg1) {slice_sizes = dense<[1, 4]> : tensor<2xi64>} : (tensor<3x4xi32>, tensor<2xi64>) -> tensor<1x4xi32> +func @dynamic_slice(%arg0: tensor<3x4xi32>, %arg1: tensor, %arg2: tensor) -> tensor<1x4xi32> { + %0 = "xla_hlo.dynamic-slice"(%arg0, %arg1, %arg2) {slice_sizes = dense<[1, 4]> : tensor<2xi64>} : (tensor<3x4xi32>, tensor, tensor) -> tensor<1x4xi32> return %0 : tensor<1x4xi32> } // ----- -func @dynamic_slice_mismatch_indices(%arg0: tensor<3x4xi32>, %arg1: tensor<2xi64>) -> tensor<1x4xi32> { - // expected-error@+1 {{failed to verify that all of {start_indices, slice_sizes} have same shape}} - %0 = "xla_hlo.dynamic-slice"(%arg0, %arg1) {slice_sizes = dense<[4]> : tensor<1xi64>} : (tensor<3x4xi32>, tensor<2xi64>) -> tensor<1x4xi32> +func @dynamic_slice_mismatch_indices(%arg0: tensor<3x4xi32>, %arg1: tensor, %arg2: tensor) -> tensor<1x4xi32> { + // expected-error@+1 {{has mismatched number of slice sizes (1) and number of start indices (2)}} + %0 = "xla_hlo.dynamic-slice"(%arg0, %arg1, %arg2) {slice_sizes = dense<[4]> : tensor<1xi64>} : (tensor<3x4xi32>, tensor, tensor) -> tensor<1x4xi32> return %0 : tensor<1x4xi32> } // ----- // CHECK-LABEL: @dynamic_slice_different_indice_element_type -func @dynamic_slice_different_indice_element_type(%arg0: tensor<3x4xi32>, %arg1: tensor<1xi32>) -> tensor<1x4xi32> { - %0 = "xla_hlo.dynamic-slice"(%arg0, %arg1) {slice_sizes = dense<[4]> : tensor<1xi64>} : (tensor<3x4xi32>, tensor<1xi32>) -> tensor<1x4xi32> +func @dynamic_slice_different_indice_element_type(%arg0: tensor<3x4xi32>, %arg1: tensor) -> tensor<1x4xi32> { + %0 = "xla_hlo.dynamic-slice"(%arg0, %arg1) {slice_sizes = dense<[4]> : tensor<1xi64>} : (tensor<3x4xi32>, tensor) -> tensor<1x4xi32> return %0 : tensor<1x4xi32> } // ----- -func @dynamic_slice_mismatch_element_types(%arg0: tensor<3x4xi32>, %arg1: tensor<2xi64>) -> tensor<1x4xf32> { +func @dynamic_slice_mismatch_element_types(%arg0: tensor<3x4xi32>, %arg1: tensor, %arg2: tensor) -> tensor<1x4xf32> { // expected-error@+1 {{failed to verify that all of {operand, result} have same element type}} - %0 = "xla_hlo.dynamic-slice"(%arg0, %arg1) {slice_sizes = dense<[1, 4]> : tensor<2xi64>} : (tensor<3x4xi32>, tensor<2xi64>) -> tensor<1x4xf32> + %0 = "xla_hlo.dynamic-slice"(%arg0, %arg1, %arg2) {slice_sizes = dense<[1, 4]> : tensor<2xi64>} : (tensor<3x4xi32>, tensor, tensor) -> tensor<1x4xf32> return %0 : tensor<1x4xf32> } // ----- +func @dynamic_slice_invalid_start(%arg0: tensor<3x4xi32>, %arg1: tensor<2xi64>) -> tensor<1x4xi32> { + // expected-error@+1 {{operand #1 must be a 0-dim integer tensor of 8/16/32/64-bit signless integer or 8/16/32/64-bit unsigned integer values, but got 'tensor<2xi64>'}} + %0 = "xla_hlo.dynamic-slice"(%arg0, %arg1) {slice_sizes = dense<[1, 4]> : tensor<2xi64>} : (tensor<3x4xi32>, tensor<2xi64>) -> tensor<1x4xi32> + return %0 : tensor<1x4xi32> +} + +// ----- + +// CHECK-LABEL: @dynamic_update_slice +func @dynamic_update_slice(%input: tensor<3x4xi64>, %update: tensor<2xi64>, %start1: tensor, %start2: tensor) -> tensor<3x4xi64> { + %0 = "xla_hlo.dynamic-update-slice"(%input, 
%update, %start1, %start2) : (tensor<3x4xi64>, tensor<2xi64>, tensor, tensor) -> tensor<3x4xi64> + return %0 : tensor<3x4xi64> +} + +// ----- + +func @dynamic_update_slice_invalid_start(%input: tensor<3x4xi64>, %update: tensor<2xi64>, %start: tensor<2xi64>) -> tensor<3x4xi64> { + // expected-error@+1 {{operand #2 must be a 0-dim integer tensor of 8/16/32/64-bit signless integer or 8/16/32/64-bit unsigned integer values, but got 'tensor<2xi64>'}} + %0 = "xla_hlo.dynamic-update-slice"(%input, %update, %start) : (tensor<3x4xi64>, tensor<2xi64>, tensor<2xi64>) -> tensor<3x4xi64> + return %0 : tensor<3x4xi64> +} + +// ----- + // CHECK-LABEL: func @transpose func @transpose(%arg0: tensor<1x2x3x4xi32>) -> tensor<2x1x4x3xi32> { %0 = "xla_hlo.transpose"(%arg0) {permutation = dense<[1, 0, 3, 2]> : tensor<4xi64>} : (tensor<1x2x3x4xi32>) -> tensor<2x1x4x3xi32> @@ -754,7 +786,7 @@ func @or_i1_type(%arg0: tensor<4xi1>, %arg1: tensor<4xi1>) -> tensor<4xi1> { // ----- func @or_invalid_f32_type(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { - // expected-error@+1 {{must be tensor of pred (AKA boolean or 1-bit integer) or 8/16/32/64-bit signless integer values, but got 'tensor<4xf32>'}} + // expected-error@+1 {{but got 'tensor<4xf32>'}} %0 = "xla_hlo.or"(%arg0, %arg1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> return %0 : tensor<4xf32> } @@ -777,12 +809,14 @@ func @constants() -> () { // CHECK: xla_hlo.constant {extra_attr = 3 : i32} dense<0> : tensor %1 = "xla_hlo.constant"() {extra_attr = 3 : i32, value = dense<0> : tensor} : () -> (tensor) + return +} - // CHECK: xla_hlo.constant {value = dense<0> : tensor} : tensor<*xi32> - %2 = "xla_hlo.constant"() {value = dense<0> : tensor} : () -> (tensor<*xi32>) +// ----- - // CHECK: xla_hlo.constant {extra_attr = 3 : i32, value = dense<0> : tensor} : tensor<*xi32> - %3 = "xla_hlo.constant"() {extra_attr = 3 : i32, value = dense<0> : tensor} : () -> (tensor<*xi32>) +func @constant_invalid() -> () { + // expected-error@+1 {{op failed to verify that all of {value, output} have same type}} + %0 = "xla_hlo.constant"() {value = dense<0> : tensor} : () -> (tensor<*xi32>) return } @@ -958,3 +992,18 @@ func @dot_general(%arg0: tensor, %arg1: tensor) { }} : (tensor, tensor) -> tensor return } + +// ----- + +func @compatible_shapes(%arg0: tensor, %shape: tensor<2xindex>) -> tensor { + %0 = "xla_hlo.dynamic_reshape"(%arg0, %shape) : (tensor, tensor<2xindex>) -> tensor + return %0 : tensor +} + +// ----- + +func @incompatible_shapes(%arg0: tensor, %shape: tensor<2xindex>) -> tensor { + // expected-error @+1 {{output should have a rank equal to the number of elements in output_shape}} + %0 = "xla_hlo.dynamic_reshape"(%arg0, %shape) : (tensor, tensor<2xindex>) -> tensor + return %0 : tensor +} diff --git a/tensorflow/compiler/mlir/xla/tests/sink-constants-to-control-flow.mlir b/tensorflow/compiler/mlir/xla/tests/sink-constants-to-control-flow.mlir new file mode 100644 index 00000000000..9f54e40dcaa --- /dev/null +++ b/tensorflow/compiler/mlir/xla/tests/sink-constants-to-control-flow.mlir @@ -0,0 +1,60 @@ +// RUN: xla-opt %s -xla-hlo-sink-constants-to-control-flow | FileCheck %s --dump-input=fail + +// Tests sinking constants to a while loop. 
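// The pass is expected (per the CHECK lines below) to clone constants defined above a control-flow op, such as %c0 and %c1 here, into every region that uses them, so the condition and body regions stop capturing values defined outside themselves.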
+ +// CHECK-LABEL: func @sink_const_to_while +func @sink_const_to_while(%arg0: tensor) -> tensor { + // CHECK-NEXT: xla_hlo.while + %c0 = xla_hlo.constant dense<1> : tensor + %c1 = xla_hlo.constant dense<2> : tensor + %0 = "xla_hlo.while"(%arg0) ( { + ^bb0(%arg1: tensor): + // CHECK: %[[ARG1A:.+]]: tensor + // CHECK: %[[C0:.+]] = xla_hlo.constant dense<1> : tensor + // CHECK: "xla_hlo.compare"(%[[C0]], %[[ARG1A]]) + %1 = "xla_hlo.compare"(%c0, %arg1) {comparison_direction = "LT"} : (tensor, tensor) -> tensor + "xla_hlo.return"(%1) : (tensor) -> () + }, { + ^bb0(%arg1: tensor): + // CHECK: %[[ARG1B:.+]]: tensor + // CHECK-DAG: %[[C1:.+]] = xla_hlo.constant dense<2> : tensor + // CHECK-DAG: %[[ADD0:.+]] = xla_hlo.add %[[ARG1B]], %[[ARG1B]] + %2 = xla_hlo.add %arg1, %arg1 : tensor + // CHECK: %[[ADD1:.+]] = xla_hlo.add %[[C1]], %[[ADD0]] + %3 = xla_hlo.add %c1, %2 : tensor + // CHECK: %[[ADD2:.+]] = xla_hlo.add %[[C1]], %[[ADD1]] + %4 = xla_hlo.add %c1, %3 : tensor + "xla_hlo.return"(%4) : (tensor) -> () + }) : (tensor) -> tensor + return %0 : tensor +} + +// Tests sinking constants to a conditional op. + +// CHECK-LABEL: func @sink_const_to_conditional +func @sink_const_to_conditional(%arg0: tensor) -> tensor { + %c0 = xla_hlo.constant dense<1> : tensor + %c1 = xla_hlo.constant dense<2> : tensor + %0 = "xla_hlo.compare"(%arg0, %c0) {comparison_direction = "LT"} : (tensor, tensor) -> tensor + %1 = "xla_hlo.tuple"(%arg0) : (tensor) -> tuple> + // CHECK: xla_hlo.if + %2 = "xla_hlo.if"(%0, %1, %1) ( { + ^bb0(%arg1: tuple>): + // CHECK: %[[C0:.+]] = xla_hlo.constant dense<1> : tensor + %3 = "xla_hlo.get_tuple_element"(%arg1) {index = 0 : i32} : (tuple>) -> tensor + // CHECK: %[[ADD0:.+]] = xla_hlo.add %[[C0]], + %4 = xla_hlo.add %c0, %3 : tensor + %5 = "xla_hlo.tuple"(%4) : (tensor) -> tuple> + "xla_hlo.return"(%5) : (tuple>) -> () + }, { + ^bb0(%arg1: tuple>): + // CHECK: %[[C1:.+]] = xla_hlo.constant dense<2> : tensor + %6 = "xla_hlo.get_tuple_element"(%arg1) {index = 0 : i32} : (tuple>) -> tensor + // CHECK: %[[ADD1:.+]] = xla_hlo.add %[[C1]], + %7 = xla_hlo.add %c1, %6 : tensor + %8 = "xla_hlo.tuple"(%7) : (tensor) -> tuple> + "xla_hlo.return"(%8) : (tuple>) -> () + }) : (tensor, tuple>, tuple>) -> tuple> + %9 = "xla_hlo.get_tuple_element"(%2) {index = 0 : i32} : (tuple>) -> tensor + return %9 : tensor +} diff --git a/tensorflow/compiler/mlir/xla/tests/translate/export.mlir b/tensorflow/compiler/mlir/xla/tests/translate/export.mlir index 8953516c5fc..20b43e8633d 100644 --- a/tensorflow/compiler/mlir/xla/tests/translate/export.mlir +++ b/tensorflow/compiler/mlir/xla/tests/translate/export.mlir @@ -1,4 +1,4 @@ -// RUN: tf-mlir-translate -split-input-file -mlir-hlo-to-hlo-text %s | FileCheck %s +// RUN: tf-mlir-translate -split-input-file -mlir-hlo-to-hlo-text %s | FileCheck %s --dump-input-on-failure // CHECK: HloModule func @main(%arg0: !xla_hlo.token, %arg1: !xla_hlo.token) -> !xla_hlo.token { @@ -96,34 +96,6 @@ func @main(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> (tensor<4xi32>, tensor // ----- -// CHECK: HloModule -func @main(%arg0: tensor<1x4xi32>, %arg1: tensor<2x4xi32>, %arg2: tensor<2x3x4xi32>) -> tensor<2x3x4xi32> { - // Same rank degenerate broadcast - // CHECK: [[ARG_0:%.*]] = s32[1,4] parameter(0) - // CHECK-NEXT: [[RESHAPE_1:%.*]] = s32[4] reshape(s32[1,4] [[ARG_0]]) - // CHECK-NEXT: [[BROADCAST_1:%.*]] = s32[2,4] broadcast(s32[4] [[RESHAPE_1]]) - // CHECK-NEXT: [[ARG_1:%.*]] = s32[2,4] parameter(1) - // CHECK-NEXT: s32[2,4] add(s32[2,4] [[BROADCAST_1]], s32[2,4] [[ARG_1]]) - 
%0 = "xla_hlo.add"(%arg0, %arg1) : (tensor<1x4xi32>, tensor<2x4xi32>) -> tensor<2x4xi32> - - // Broadcast up rank - // CHECK-NEXT: [[BROADCAST_2:%.*]] = s32[2,3,4] broadcast(s32[2,4] [[ARG_1]]), dimensions={0,2} - // CHECK-NEXT: [[ARG_2:%.*]] = s32[2,3,4] parameter(2) - // CHECK-NEXT: s32[2,3,4] add(s32[2,3,4] [[BROADCAST_2]], s32[2,3,4] [[ARG_2]]) - %1 = "xla_hlo.add"(%arg1, %arg2) {broadcast_dimensions = dense<[0,2]> : tensor<2xi64>} : (tensor<2x4xi32>, tensor<2x3x4xi32>) -> tensor<2x3x4xi32> - - // Broadcast up rank + degenerate broadcast - // CHECK-NEXT: [[BROADCAST_3:%.*]] = s32[2,1,4] broadcast(s32[1,4] [[ARG_0]]), dimensions={1,2} - // CHECK-NEXT: [[RESHAPE_2:%.*]] = s32[2,4] reshape(s32[2,1,4] [[BROADCAST_3]]) - // CHECK-NEXT: [[BROADCAST_4:%.*]] = s32[2,3,4] broadcast(s32[2,4] [[RESHAPE_2]]), dimensions={0,2} - // CHECK: ROOT - // CHECK-SAME: s32[2,3,4] add(s32[2,3,4] [[BROADCAST_4]], s32[2,3,4] [[ARG_2]]) - %2 = "xla_hlo.add"(%arg0, %arg2) {broadcast_dimensions = dense<[1,2]> : tensor<2xi64>} : (tensor<1x4xi32>, tensor<2x3x4xi32>) -> tensor<2x3x4xi32> - return %2 : tensor<2x3x4xi32> -} - -// ----- - // CHECK: HloModule func @main(%arg0: tensor<2xi32>) -> tensor<2xf32> { %0 = "xla_hlo.bitcast_convert"(%arg0) : (tensor<2xi32>) -> tensor<2xf32> @@ -260,7 +232,7 @@ func @main(%arg0 : tensor<5x2xf32>, // ----- // CHECK: HloModule -func @main() -> tensor<2x2x1x1xf32> { +func @main() { // CHECK: constant.{{.*}} = s64[] constant(1) %cst = constant dense<1> : tensor // CHECK: constant.{{.*}} = f32[2,2,1,1] @@ -285,10 +257,22 @@ func @main() -> tensor<2x2x1x1xf32> { // CHECK: s32[2,2] constant({ { 3, 2 }, { 1, 4 } }) %cst_5 = constant dense<[[3, 2], [1, 4]]> : tensor<2x2xi32> - // CHECK: bf16[4] constant({1, 2, 3, 4}) - %cst_6 = constant dense<[1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00]> : tensor<4xbf16> + // CHECK: u32[2,2] constant({ { 1, 2 }, { 4, 8 } }) + %cst_6 = constant dense<[[1, 2], [4, 8]]> : tensor<2x2xui32> - return %cst_0 : tensor<2x2x1x1xf32> + // CHECK: bf16[4] constant({1, 2, 3, 4}) + %cst_7 = constant dense<[1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00]> : tensor<4xbf16> + + // CHECK: f16[4] constant({1, -4, -65504, 0.015625} + %cst_8 = constant dense<[1.0e+00, -4.0e+00, -65504.0e+00, 1.5625e-02]> : tensor<4xf16> + + // CHECK: c64[] constant((1, 0)) + %cst_9 = constant dense<(1.000000e+00,0.000000e+00)> : tensor> + + // CHECK: c128[] constant((1, 0)) + %cst_10 = constant dense<(1.000000e+00,0.000000e+00)> : tensor> + + return } // ----- @@ -460,14 +444,18 @@ func @main(%arg0: tensor<200x100x300xf32>, %arg1: tensor<10x2xi32>) -> tensor<10 // ----- // CHECK: HloModule -func @main(%arg: tensor<4x2xf32>) -> tensor { - %0 = "xla_hlo.get_dimension_size"(%arg) {dimension = 1 : i32} : (tensor<4x2xf32>) -> tensor - return %0 : tensor +func @main(%arg: tensor<4x2xf32>, %size: tensor) -> tensor { + %0 = "xla_hlo.set_dimension_size"(%arg, %size) {dimension = 1 : i32} : (tensor<4x2xf32>, tensor) -> tensor<4x2xf32> + %1 = "xla_hlo.get_dimension_size"(%0) {dimension = 1 : i32} : (tensor<4x2xf32>) -> tensor + return %1 : tensor } // CHECK: ENTRY // CHECK: [[ARG:%.*]] = f32[4,2] parameter(0) -// CHECK: s32[] get-dimension-size(f32[4,2] [[ARG]]), dimensions={1} +// CHECK: [[SIZE:%.*]] = s32[] parameter(1) +// CHECK: [[DYNAMIC:%.*]] = f32[4,<=2] set-dimension-size(f32[4,2] [[ARG]], s32[] [[SIZE]]), dimensions={1} +// CHECK: ROOT %[[RESULT:.*]] = s32[] get-dimension-size(f32[4,<=2] [[DYNAMIC]]), dimensions={1} + // ----- @@ -860,6 +848,21 @@ func @main(%arg: 
tensor<3x4xi32>) -> tensor<1x2xi32> { // ----- +// CHECK: HloModule +func @main(%arg: tensor<3x4xi32>, %start1: tensor, %start2: tensor) -> tensor<1x4xi32> { + %0 = "xla_hlo.dynamic-slice"(%arg, %start1, %start2) {slice_sizes = dense<[1, 4]> : tensor<2xi64>} : (tensor<3x4xi32>, tensor, tensor) -> tensor<1x4xi32> + return %0 : tensor<1x4xi32> +} + +// CHECK: ENTRY +// CHECK: %[[ARG:.*]] = s32[3,4] parameter(0) +// CHECK: %[[ARG1:.*]] = s64[] parameter(1) +// CHECK: %[[ARG2:.*]] = s64[] parameter(2) +// CHECK: ROOT +// CHECK-SAME: s32[1,4] dynamic-slice(s32[3,4] %[[ARG]], s64[] %[[ARG1]], s64[] %[[ARG2]]), dynamic_slice_sizes={1,4} + +// ----- + // CHECK: HloModule func @main(%arg0: tensor<2xi32>) -> tensor<2xi32> { "xla_hlo.trace"(%arg0) {tag = "This is a random test"} : (tensor<2xi32>) -> () @@ -1001,3 +1004,28 @@ func @main(%arg0: tensor<2xcomplex>, %arg1: tensor<2xcomplex>) -> (ten // CHECK: %[[ARG1:.*]] = c128[2] parameter(1) // CHECK: %[[ABS1:.*]] = f64[2] abs(c128[2] %[[ARG1]]) // CHECK: ROOT %[[RESULT:.*]] = (f32[2], f64[2]) tuple(f32[2] %[[ABS0]], f64[2] %[[ABS1]]) + +// ----- + +// CHECK: HloModule +func @main(%arg0: tensor<4xui8>) -> (tensor<4xui8>) { + %0 = "xla_hlo.not"(%arg0) : (tensor<4xui8>) -> tensor<4xui8> + return %0 : tensor<4xui8> +} + +// CHECK: ENTRY +// CHECK: %[[ARG0:.*]] = u8[4] parameter(0) +// ROOT %[[RESULT:.*]] = u8[4] not(u8[4] %[[ARG0]]) + +// ----- + +// CHECK: HloModule +func @main(%arg0: tensor<4xi32>) -> (tensor<*xi32>) { + %0 = "xla_hlo.not"(%arg0) : (tensor<4xi32>) -> tensor<4xi32> + %1 = tensor_cast %0 : tensor<4xi32> to tensor<*xi32> + return %1 : tensor<*xi32> +} + +// CHECK: ENTRY +// CHECK: %[[ARG0:.*]] = s32[4] parameter(0) +// ROOT %[[RESULT:.*]] = s32[4] not(s32[4] %[[ARG0]]) diff --git a/tensorflow/compiler/mlir/xla/tests/translate/export_errors.mlir b/tensorflow/compiler/mlir/xla/tests/translate/export_errors.mlir new file mode 100644 index 00000000000..97c53cb5f9f --- /dev/null +++ b/tensorflow/compiler/mlir/xla/tests/translate/export_errors.mlir @@ -0,0 +1,7 @@ +// RUN: not tf-mlir-translate -split-input-file -mlir-hlo-to-hlo-text %s 2>&1 | FileCheck %s + +// CHECK: Opaque elements attr not supported +func @main() { + %0 = "tf.Const"() {value = opaque<"tf", "0x0123456789ABCDEF"> : tensor<4xf32>} : () -> tensor<4xf32> + return +} diff --git a/tensorflow/compiler/mlir/xla/tests/translate/conditional.mlir b/tensorflow/compiler/mlir/xla/tests/translate/if.mlir similarity index 98% rename from tensorflow/compiler/mlir/xla/tests/translate/conditional.mlir rename to tensorflow/compiler/mlir/xla/tests/translate/if.mlir index e510a2aa35f..6542966fc7c 100644 --- a/tensorflow/compiler/mlir/xla/tests/translate/conditional.mlir +++ b/tensorflow/compiler/mlir/xla/tests/translate/if.mlir @@ -41,7 +41,7 @@ func @main(%arg0: tensor) -> tuple> { %1 = "xla_hlo.tuple"(%arg0) : (tensor) -> tuple> // CHECK: %[[VAL3:.+]] = (f32[]) conditional(pred[] %[[VAL1]], (f32[]) %[[VAL2]], (f32[]) %[[VAL2]]), true_computation=[[R0]], false_computation=[[R1]] - %2 = "xla_hlo.conditional"(%0, %1, %1) ( { + %2 = "xla_hlo.if"(%0, %1, %1) ( { ^bb0(%arg1: tuple>): %6 = "xla_hlo.get_tuple_element"(%arg1) {index = 0 : i32} : (tuple>) -> tensor %7 = "xla_hlo.log"(%6) : (tensor) -> tensor diff --git a/tensorflow/compiler/mlir/xla/tests/translate/conditional.hlotxt b/tensorflow/compiler/mlir/xla/tests/translate/if_conditional.hlotxt similarity index 97% rename from tensorflow/compiler/mlir/xla/tests/translate/conditional.hlotxt rename to 
tensorflow/compiler/mlir/xla/tests/translate/if_conditional.hlotxt index 00f6ec2d308..d2c6e669e9b 100644 --- a/tensorflow/compiler/mlir/xla/tests/translate/conditional.hlotxt +++ b/tensorflow/compiler/mlir/xla/tests/translate/if_conditional.hlotxt @@ -29,7 +29,7 @@ ENTRY %tfcompile.20 { // CHECK: [[R2:%.+]] = "xla_hlo.tuple"([[A0]]) %tuple.5 = (f32[]) tuple(%arg0.1), metadata={op_type="If" op_name="cond/Merge_if"} - // CHECK: [[R3:%.+]] = "xla_hlo.conditional"([[R1]], [[R2]], [[R2]]) ( { + // CHECK: [[R3:%.+]] = "xla_hlo.if"([[R1]], [[R2]], [[R2]]) ( { // CHECK: ^bb0([[A1:%.+]]: tuple>): // CHECK: [[R7:%.+]] = "xla_hlo.get_tuple_element"([[A1]]) // CHECK: [[R8:%.+]] = "xla_hlo.log"([[R7]]) diff --git a/tensorflow/compiler/mlir/xla/tests/translate/import.hlotxt b/tensorflow/compiler/mlir/xla/tests/translate/import.hlotxt index 89a34dfa68a..af45f84b34d 100644 --- a/tensorflow/compiler/mlir/xla/tests/translate/import.hlotxt +++ b/tensorflow/compiler/mlir/xla/tests/translate/import.hlotxt @@ -1,4 +1,4 @@ -// RUN: tf-mlir-translate -hlo-text-to-mlir-hlo %s -o - | FileCheck %s +// RUN: tf-mlir-translate -hlo-text-to-mlir-hlo %s -o - | FileCheck %s --dump-input-on-failure HloModule main @@ -20,29 +20,6 @@ ENTRY %dummy_main (Arg_0.1: f32[]) -> f32[] { ROOT %dot.4 = f32[] dot(f32[4]{0} %add.3, f32[4]{0} %Arg_1.2), lhs_contracting_dims={0}, rhs_contracting_dims={0} } -// This test is more thorough than those of the the other binary ops to test -// their shared functionality. - -// CHECK-LABEL: func @test_add -%test_add (Arg_0.1: f32[4], Arg_1.2: f32[4], Arg_2.3: f32[], Arg_3.4: f32[]) -> f32[4] { - %Arg_0.1 = f32[4] parameter(0) - %Arg_1.2 = f32[4] parameter(1) - %Arg_2.3 = f32[] parameter(2) - %Arg_3.4 = f32[] parameter(3) - - // Add two tensors - // CHECK-NEXT: xla_hlo.add %arg0, %arg1 {name = "{{.*}}"} - %add.3 = f32[4] add(f32[4] %Arg_0.1, f32[4] %Arg_1.2) - - // Add two scalars - // CHECK-NEXT: xla_hlo.add %arg2, %arg3 - %add.4 = f32[] add(f32[] %Arg_2.3, f32[] %Arg_3.4) - - // Add a tensor and scalar - // CHECK-NEXT: "xla_hlo.add"(%0, %1) - ROOT %add.5 = f32[4] add(f32[4] %add.3, f32[] %add.4) -} - // CHECK-LABEL: func @test_after_all // CHECK-SAME: ([[VAL_0:%.*]]: !xla_hlo.token, [[VAL_1:%.*]]: !xla_hlo.token) -> !xla_hlo.token %test_after_all (token0: token[], token1: token[] ) -> token[] { @@ -159,11 +136,11 @@ add { } -// CHECK-LABEL: func @test_compare(%arg0: tensor<3xf32>, %arg1: tensor<3xf32>, %arg2: tensor<1xf32>) -> tensor<3xi1> { -%test_compare (Arg_0.1: f32[3], Arg_1.2: f32[3], Arg_2.3: f32[1]) -> pred[3] { +// CHECK-LABEL: func @test_compare(%arg0: tensor<3xf32>, %arg1: tensor<3xf32>, %arg2: tensor<3xf32>) -> tensor<3xi1> { +%test_compare (Arg_0.1: f32[3], Arg_1.2: f32[3], Arg_2.3: f32[3]) -> pred[3] { %Arg_0.1 = f32[3] parameter(0) %Arg_1.2 = f32[3] parameter(1) - %Arg_2.3 = f32[1] parameter(2) + %Arg_2.3 = f32[3] parameter(2) // CHECK-NEXT: "xla_hlo.compare"(%arg0, %arg1) {comparison_direction = "EQ", name = "{{.*}}"} : (tensor<3xf32>, tensor<3xf32>) -> tensor<3xi1> %compare.4 = pred[3] compare(Arg_0.1, Arg_1.2), direction=EQ @@ -172,7 +149,7 @@ add { %compare.5 = pred[3] compare(Arg_0.1, Arg_1.2), direction=LE // Requires broadcast of compatible tensors. 
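// (Arg_2.3 is now declared as f32[3], matching the other operands, so the GT compare below no longer depends on an implicit degenerate broadcast from f32[1]; the updated CHECK line reflects the same-shaped operand types.)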
- // CHECK-NEXT: "xla_hlo.compare"(%arg0, %arg2) {comparison_direction = "GT", name = "{{.*}}"} : (tensor<3xf32>, tensor<1xf32>) -> tensor<3xi1> + // CHECK-NEXT: "xla_hlo.compare"(%arg0, %arg2) {comparison_direction = "GT", name = "{{.*}}"} : (tensor<3xf32>, tensor<3xf32>) -> tensor<3xi1> ROOT %compare.6 = pred[3] compare(Arg_0.1, Arg_2.3), direction=GT } @@ -204,7 +181,22 @@ add { // Note that double brackets "[[" have to be escaped as they denote variables // in FileCheck. The only way to do so is to drop into regex with "{{" // CHECK-NEXT: constant {name = "{{.*}}"} dense<{{\[\[\[\[}}1.000000e+00]], {{\[\[}}2.000000e+00]]], {{\[\[\[}}3.000000e+00]], {{\[\[}}4.000000e+00]]]]> : tensor<2x2x1x1xf32> - ROOT %constant.1 = f32[2,2,1,1]{3,2,1,0} constant({{{{1.0}},{{2.0}}},{{{3.0}},{{4.0}}}}), metadata={op_type="Conv2D" op_name="embedded_inference/conv_model/conv_0/Conv2D"} + %constant.1 = f32[2,2,1,1]{3,2,1,0} constant({{{{1.0}},{{2.0}}},{{{3.0}},{{4.0}}}}), metadata={op_type="Conv2D" op_name="embedded_inference/conv_model/conv_0/Conv2D"} + + // CHECK: dense<[1, 2, 4, 8]> : tensor<4xui64> + %constant.2 = u64[4] constant({ 1, 2, 4, 8 }) + + // CHECK: dense<[1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00]> : tensor<4xbf16> + %constant.3 = bf16[4] constant({1, 2, 3, 4}) + + // CHECK: dense<(1.000000e+00,0.000000e+00)> : tensor> + %constant.4 = c64[] constant((1, 0)) + + // CHECK: dense<(1.000000e+00,0.000000e+00)> : tensor> + %constant.5 = c128[] constant((1, 0)) + + // CHECK: dense<[1.000000e+00, -4.000000e+00, -6.550400e+04, 1.562500e-02]> : tensor<4xf16> + ROOT %constant.6 = f16[4] constant({1, -4, -65504, 0.015625}) } // TODO(b/129422361) Potentially update when copy, reshape, and conv have actual @@ -233,8 +225,8 @@ add { // CHECK-SAME: kernel_input_feature_dimension = 2 : i64 // CHECK-SAME: kernel_output_feature_dimension = 3 : i64 // CHECK-SAME: kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64> - // CHECK-SAME: output_batch_dimension = 0 : i64 - // CHECK-SAME: output_feature_dimension = 3 : i64 + // CHECK-SAME: output_batch_dimension = 3 : i64 + // CHECK-SAME: output_feature_dimension = 0 : i64 // CHECK-SAME: output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64> // CHECK-SAME: } // CHECK-SAME: feature_group_count = 1 : i64 @@ -244,11 +236,11 @@ add { // CHECK-SAME: rhs_dilations = dense<[2, 3]> : tensor<2xi64> // CHECK-SAME: window_strides = dense<[4, 5]> : tensor<2xi64> // CHECK-SAME: } - // CHECK-SAME: (tensor<256x32x32x6xf32>, tensor<2x2x1x1xf32>) -> tensor<256x30x30x16xf32> + // CHECK-SAME: (tensor<256x32x32x6xf32>, tensor<2x2x1x1xf32>) -> tensor<16x30x30x256xf32> - %convolution.4 = f32[256,30,30,16]{2,1,3,0} convolution(%reshape.2, %constant.3), window={size=3x3 stride=4x5 pad=44_45x60_60 rhs_dilate=2x3}, dim_labels=b01f_01io->b01f, metadata={op_type="Conv2D" op_name="embedded_inference/conv_model/conv_0/Conv2D"} + %convolution.4 = f32[16,30,30,256]{2,1,3,0} convolution(%reshape.2, %constant.3), window={size=3x3 stride=4x5 pad=44_45x60_60 rhs_dilate=2x3}, dim_labels=b01f_01io->f01b, metadata={op_type="Conv2D" op_name="embedded_inference/conv_model/conv_0/Conv2D"} - // CHECK-NEXT: %3 = "xla_hlo.reshape"(%2) {name = "{{.*}}"} : (tensor<256x30x30x16xf32>) -> tensor<256x30x30x16xf32> + // CHECK-NEXT: %3 = "xla_hlo.reshape"(%2) {name = "{{.*}}"} : (tensor<16x30x30x256xf32>) -> tensor<256x30x30x16xf32> %reshape.5 = f32[256,30,30,16]{3,2,1,0} reshape(%convolution.4), metadata={op_name="HLO_Retvals"} // CHECK-NEXT: "xla_hlo.tuple"(%3) {name = "{{.*}}"} : 
(tensor<256x30x30x16xf32>) -> tuple> @@ -265,19 +257,19 @@ add { ROOT %convolution = f32[1,5,1] convolution(f32[1,2,1] %input, f32[1,1,1] %filter), feature_group_count=1, dim_labels=b0f_0io->b0f, window={pad=1_2 size=1} } -// CHECK-LABEL: func @test_convert(%arg0: tensor<4xf32>, %arg1: tensor) -> tensor<4xf64> { -%test_convert (Arg_0.1: f32[4], Arg_1.2: f32[]) -> f64[4] { +// CHECK-LABEL: func @test_convert(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf64> { +%test_convert (Arg_0.1: f32[4], Arg_1.2: f32[4]) -> f64[4] { %Arg_0.1 = f32[4] parameter(0) - %Arg_1.2 = f32[] parameter(1) + %Arg_1.2 = f32[4] parameter(1) // CHECK-NEXT: %0 = "xla_hlo.convert"(%arg0) {name = "{{.*}}"} : (tensor<4xf32>) -> tensor<4xf64> %convert.3 = f64[4] convert(f32[4] %Arg_0.1) - // CHECK-NEXT: %1 = "xla_hlo.convert"(%arg1) {name = "{{.*}}"} : (tensor) -> tensor - %convert.4 = f64[] convert(f32[] %Arg_1.2) + // CHECK-NEXT: %1 = "xla_hlo.convert"(%arg1) {name = "{{.*}}"} : (tensor<4xf32>) -> tensor<4xf64> + %convert.4 = f64[4] convert(f32[4] %Arg_1.2) - // CHECK-NEXT: "xla_hlo.add"(%0, %1) - ROOT %add.5 = f64[4] add(f64[4] %convert.3, f64[] %convert.4) + // CHECK-NEXT: xla_hlo.add %0, %1 + ROOT %add.5 = f64[4] add(f64[4] %convert.3, f64[4] %convert.4) } // CHECK-LABEL: func @test_cosine(%arg0: tensor<1x16x16x3xf32>) -> tensor<1x16x16x3xf32> { @@ -347,33 +339,35 @@ add { } // CHECK-LABEL: func @test_dynamic_slice -// CHECK-SAME: [[OPERAND:%.*]]: tensor<2x2x258xi32>, [[START_INDICES:%.*]]: tensor<3xi32> +// CHECK-SAME: [[OPERAND:%.*]]: tensor<2x2x258xi32>, [[START_IDX_1:%.*]]: tensor, [[START_IDX_2:%.*]]: tensor, [[START_IDX_3:%.*]]: tensor %test_dynamic_slice (operand: s32[2,2,258], start_indices: s32[3]) -> s32[1,1,32] { %operand = s32[2,2,258] parameter(0) - %start_indices = s32[3] parameter(1) - // CHECK: "xla_hlo.dynamic-slice"([[OPERAND]], [[START_INDICES]]) + %start_idx_1 = s32[] parameter(1) + %start_idx_2 = s32[] parameter(2) + %start_idx_3 = s32[] parameter(3) + // CHECK: "xla_hlo.dynamic-slice"([[OPERAND]], [[START_IDX_1]], [[START_IDX_2]], [[START_IDX_3]]) // CHECK-SAME: slice_sizes = dense<[1, 1, 32]> : tensor<3xi64> - ROOT %dynamic-slice = s32[1,1,32] dynamic-slice(s32[2,2,258] %operand, s32[3] %start_indices), dynamic_slice_sizes={1,1,32} + ROOT %dynamic-slice = s32[1,1,32] dynamic-slice(s32[2,2,258] %operand, s32[] %start_idx_1, s32[] %start_idx_2, s32[] %start_idx_3), dynamic_slice_sizes={1,1,32} } -// CHECK-LABEL: func @test_dynamic_update_slice_1(%arg0: tensor<4x4xf32>, %arg1: tensor<1x4xf32>, %arg2: tensor, %arg3: tensor) -> tensor<4x4xf32> { +// CHECK-LABEL: func @test_dynamic_update_slice_1(%arg0: tensor<4x4xf32>, %arg1: tensor<1x4xf32>, %arg2: tensor, %arg3: tensor) -> tensor<4x4xf32> { %test_dynamic_update_slice_1 (Arg_0.1: f32[4, 4], Arg_1.2: f32[1, 4], Arg_2.3: f32[], Arg_3.4: f32[]) -> f32[4, 4] { %Arg_0.1 = f32[4, 4] parameter(0) %Arg_1.2 = f32[1, 4] parameter(1) - %Arg_2.3 = f32[] parameter(2) - %Arg_3.4 = f32[] parameter(3) + %Arg_2.3 = s32[] parameter(2) + %Arg_3.4 = s32[] parameter(3) - // CHECK-NEXT: "xla_hlo.dynamic-update-slice"(%arg0, %arg1, %arg2, %arg3) : (tensor<4x4xf32>, tensor<1x4xf32>, tensor, tensor) -> tensor<4x4xf32> + // CHECK-NEXT: "xla_hlo.dynamic-update-slice"(%arg0, %arg1, %arg2, %arg3) : (tensor<4x4xf32>, tensor<1x4xf32>, tensor, tensor) -> tensor<4x4xf32> ROOT %dynamic-update-slice.5 = f32[4, 4] dynamic-update-slice(%Arg_0.1, %Arg_1.2, %Arg_2.3, %Arg_3.4) } -// CHECK-LABEL: func @test_dynamic_update_slice_2(%arg0: tensor<4xf32>, %arg1: tensor<2xf32>, 
%arg2: tensor) -> tensor<4xf32> +// CHECK-LABEL: func @test_dynamic_update_slice_2(%arg0: tensor<4xf32>, %arg1: tensor<2xf32>, %arg2: tensor) -> tensor<4xf32> %test_dynamic_update_slice_2 (Arg_0.1: f32[4], Arg_1.2: f32[2], Arg_2.3: f32[]) -> f32[4] { %Arg_0.1 = f32[4] parameter(0) %Arg_1.2 = f32[2] parameter(1) - %Arg_2.3 = f32[] parameter(2) + %Arg_2.3 = s32[] parameter(2) - // CHECK-NEXT: "xla_hlo.dynamic-update-slice"(%arg0, %arg1, %arg2) : (tensor<4xf32>, tensor<2xf32>, tensor) -> tensor<4xf32> + // CHECK-NEXT: "xla_hlo.dynamic-update-slice"(%arg0, %arg1, %arg2) : (tensor<4xf32>, tensor<2xf32>, tensor) -> tensor<4xf32> ROOT %dynamic-update-slice.5 = f32[4] dynamic-update-slice(%Arg_0.1, %Arg_1.2, %Arg_2.3) } @@ -1001,3 +995,12 @@ add { // CHECK: "xla_hlo.abs"(%[[ARG1]]) {name = "{{.*}}"} : (tensor<2xcomplex>) -> tensor<2xf64> ROOT %tuple.5 = (f32[2], f64[2]) tuple(f32[2] %abs.3, f64[2] %abs.4) } + +// CHECK-LABEL: func @unsigned_int +// CHECK-SAME: (%[[ARG0:.*]]: tensor<4xui16>) +%unsigned_int(Arg_0.1: u16[4]) -> u16[4] { + %Arg_0.1 = u16[4] parameter(0) + + // CHECK: "xla_hlo.not"(%[[ARG0]]) {name = "{{.*}}"} : (tensor<4xui16>) -> tensor<4xui16> + ROOT %not.2 = u16[4] not(u16[4] %Arg_0.1) +} diff --git a/tensorflow/compiler/mlir/xla/tests/unfuse_batch_norm.mlir b/tensorflow/compiler/mlir/xla/tests/unfuse_batch_norm.mlir index 9778772e250..7a54de73db7 100644 --- a/tensorflow/compiler/mlir/xla/tests/unfuse_batch_norm.mlir +++ b/tensorflow/compiler/mlir/xla/tests/unfuse_batch_norm.mlir @@ -106,24 +106,19 @@ func @batchNormInference_dynamic_shape( -> tensor { // CHECK-DAG: %[[EPS:.+]] = xla_hlo.constant dense<1.000000e-03> : tensor // CHECK-DAG: %[[DIM:.+]] = dim %[[VARIANCE]], 0 : tensor - // CHECK-DAG: %[[INDEX_CAST:.+]] = index_cast %[[DIM]] : index to i32 - // CHECK-DAG: %[[TO_DIM_TENSOR:.+]] = "xla_hlo.scalars_to_dimension_tensor"(%[[INDEX_CAST]]) : (i32) -> tensor<1xi32> - // CHECK-DAG: %[[EPS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[EPS]], %[[TO_DIM_TENSOR]]) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor, tensor<1xi32>) -> tensor + // CHECK-DAG: %[[TO_DIM_TENSOR:.+]] = "xla_hlo.scalars_to_dimension_tensor"(%[[DIM]]) : (index) -> tensor<1xindex> + // CHECK-DAG: %[[EPS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[EPS]], %[[TO_DIM_TENSOR]]) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor, tensor<1xindex>) -> tensor // CHECK-DAG: %[[VARIANCE_EPS:.+]] = xla_hlo.add %[[VARIANCE]], %[[EPS_BCAST]] : tensor // CHECK-DAG: %[[STDDEV:.+]] = "xla_hlo.sqrt"(%[[VARIANCE_EPS]]) : (tensor) -> tensor // CHECK-DAG: %[[INPUT_DIM_0:.+]] = dim %[[X]], 0 : tensor - // CHECK-DAG: %[[INPUT_INDEX_CAST_0:.+]] = index_cast %[[INPUT_DIM_0]] : index to i32 // CHECK-DAG: %[[INPUT_DIM_1:.+]] = dim %[[X]], 1 : tensor - // CHECK-DAG: %[[INPUT_INDEX_CAST_1:.+]] = index_cast %[[INPUT_DIM_1]] : index to i32 // CHECK-DAG: %[[INPUT_DIM_2:.+]] = dim %[[X]], 2 : tensor - // CHECK-DAG: %[[INPUT_INDEX_CAST_2:.+]] = index_cast %[[INPUT_DIM_2]] : index to i32 // CHECK-DAG: %[[INPUT_DIM_3:.+]] = dim %[[X]], 3 : tensor - // CHECK-DAG: %[[INPUT_INDEX_CAST_3:.+]] = index_cast %[[INPUT_DIM_3]] : index to i32 - // CHECK-DAG: %[[TO_INPUT_DIM_TENSOR:.+]] = "xla_hlo.scalars_to_dimension_tensor"(%[[INPUT_INDEX_CAST_0]], %[[INPUT_INDEX_CAST_1]], %[[INPUT_INDEX_CAST_2]], %[[INPUT_INDEX_CAST_3]]) : (i32, i32, i32, i32) -> tensor<4xi32> - // CHECK-DAG: %[[STDDEV_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[STDDEV]], %[[TO_INPUT_DIM_TENSOR]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} 
: (tensor, tensor<4xi32>) -> tensor - // CHECK-DAG: %[[SCALE_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[SCALE]], %[[TO_INPUT_DIM_TENSOR]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<4xi32>) -> tensor - // CHECK-DAG: %[[OFFSET_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[OFFSET]], %[[TO_INPUT_DIM_TENSOR]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<4xi32>) -> tensor - // CHECK-DAG: %[[MEAN_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[MEAN]], %[[TO_INPUT_DIM_TENSOR]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<4xi32>) -> tensor + // CHECK-DAG: %[[TO_INPUT_DIM_TENSOR:.+]] = "xla_hlo.scalars_to_dimension_tensor"(%[[INPUT_DIM_0]], %[[INPUT_DIM_1]], %[[INPUT_DIM_2]], %[[INPUT_DIM_3]]) : (index, index, index, index) -> tensor<4xindex> + // CHECK-DAG: %[[STDDEV_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[STDDEV]], %[[TO_INPUT_DIM_TENSOR]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<4xindex>) -> tensor + // CHECK-DAG: %[[SCALE_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[SCALE]], %[[TO_INPUT_DIM_TENSOR]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<4xindex>) -> tensor + // CHECK-DAG: %[[OFFSET_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[OFFSET]], %[[TO_INPUT_DIM_TENSOR]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<4xindex>) -> tensor + // CHECK-DAG: %[[MEAN_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[MEAN]], %[[TO_INPUT_DIM_TENSOR]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<4xindex>) -> tensor // CHECK-DAG: %[[X_CENTER:.+]] = xla_hlo.subtract %[[X]], %[[MEAN_BCAST]] : tensor // CHECK-DAG: %[[X_SCALED:.+]] = xla_hlo.multiply %[[X_CENTER]], %[[SCALE_BCAST]] : tensor // CHECK-DAG: %[[X_NORMED:.+]] = xla_hlo.divide %[[X_SCALED]], %[[STDDEV_BCAST]] : tensor diff --git a/tensorflow/compiler/mlir/xla/transforms/buffer_assignment.cc b/tensorflow/compiler/mlir/xla/transforms/buffer_assignment.cc index 540c9ab486d..640b9b84622 100644 --- a/tensorflow/compiler/mlir/xla/transforms/buffer_assignment.cc +++ b/tensorflow/compiler/mlir/xla/transforms/buffer_assignment.cc @@ -371,8 +371,7 @@ struct BufferAssignmentPass // If there is an existing dealloc, move it to the right place. if (deallocs.size()) { Operation* nextOp = positions.getDeallocPosition()->getNextNode(); - if (!nextOp) - nextOp = &positions.getDeallocPosition()->getBlock()->back(); + assert(nextOp && "Invalid Dealloc operation position"); (*deallocs.begin())->moveBefore(nextOp); } else { // If there is no dealloc node, insert one in the right place. @@ -397,18 +396,8 @@ BufferAssignmentPlacer::BufferAssignmentPlacer(Operation* op) /// Computes the actual position to place allocs for the given value. OpBuilder::InsertPoint BufferAssignmentPlacer::computeAllocPosition( Value value) { - Operation* insertOp; - if (auto arg = value.dyn_cast()) { - // This is a block argument which has to be allocated in the scope - // of its associated terminator. 
- auto domNode = dominators.getNode(arg.getOwner()); - assert(domNode != nullptr && "Cannot find dominator info"); - auto idomNode = domNode->getIDom(); - assert(idomNode != nullptr && "There is no parent dominator"); - insertOp = idomNode->getBlock()->getTerminator(); - } else { - insertOp = value.getDefiningOp(); - } + Operation* insertOp = value.getDefiningOp(); + assert(insertOp && "There is not a defining operation for the input value"); OpBuilder opBuilder(insertOp); return opBuilder.saveInsertionPoint(); } @@ -457,14 +446,25 @@ LogicalResult FunctionAndBlockSignatureConverter::matchAndRewrite( return success(); } -// Adding functions whose arguments are memref type to the set of legal -// operations. +/// A helper method to make the functions, whose all block argument types are +/// Memref or non-shaped type, legal. BufferAssignmentPlacer expects all +/// function and block argument types are in Memref or non-shaped type. Using +/// this helper method and additionally, FunctionAndBlockSignatureConverter as a +/// pattern conversion make sure that the type of block arguments are compatible +/// with using BufferAssignmentPlacer. void FunctionAndBlockSignatureConverter::addDynamicallyLegalFuncOp( ConversionTarget& target) { - target.addDynamicallyLegalOp([&](FuncOp op) { - auto inputs = op.getType().getInputs(); - return std::all_of(inputs.begin(), inputs.end(), - [](Type input) { return input.isa(); }); + auto isLegalBlockArg = [](BlockArgument arg) -> bool { + auto type = arg.getType(); + return type.isa() || !type.isa(); + }; + target.addDynamicallyLegalOp([&](FuncOp funcOp) { + bool legality = true; + for (auto& block2 : funcOp.getBlocks()) { + legality &= llvm::all_of(block2.getArguments(), isLegalBlockArg); + if (!legality) break; + } + return legality; }); } @@ -481,23 +481,5 @@ static PassRegistration buffer_assignment_pass( "Executes buffer assignment pass to automatically move alloc and dealloc " "operations into their proper positions"); -/// A simple pass to print debug/test information for the buffer assignment -/// analysis. -struct BufferAssignmentTestPass - : mlir::PassWrapper { - void runOnFunction() override { - llvm::outs() << "Testing : " << getFunction().getName() << "\n"; - getAnalysis().print(llvm::outs()); - }; -}; - -std::unique_ptr> createBufferAssignmentTestPass() { - return absl::make_unique(); -} - -static PassRegistration buffer_assignment_test_pass( - "test-buffer-assignment", - "Outputs debug test information for the buffer assignment analysis"); - } // namespace xla } // namespace mlir diff --git a/tensorflow/compiler/mlir/xla/transforms/buffer_assignment.h b/tensorflow/compiler/mlir/xla/transforms/buffer_assignment.h index d8b4c2554bb..ced5769b44c 100644 --- a/tensorflow/compiler/mlir/xla/transforms/buffer_assignment.h +++ b/tensorflow/compiler/mlir/xla/transforms/buffer_assignment.h @@ -16,9 +16,9 @@ limitations under the License. 
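Aside, as a reading aid for the addDynamicallyLegalFuncOp change above: a FuncOp is treated as legal only when every block argument in every block is either a memref or a non-shaped type. The following standalone sketch models that predicate with an invented Kind enum and helper names; it is only an illustration, not the MLIR API.

#include <algorithm>
#include <cassert>
#include <vector>

// Toy stand-ins for MLIR types: memrefs are shaped and legal, scalars are
// non-shaped and legal, tensors are shaped but not yet bufferized -> illegal.
enum class Kind { MemRef, Tensor, Scalar };

// Mirrors the shape of the predicate in addDynamicallyLegalFuncOp:
// "is a memref, or is not a shaped type at all".
bool IsLegalBlockArg(Kind k) { return k == Kind::MemRef || k != Kind::Tensor; }

// A function is legal when every block argument of every block is legal.
bool IsLegalFunc(const std::vector<std::vector<Kind>>& blocks) {
  return std::all_of(blocks.begin(), blocks.end(),
                     [](const std::vector<Kind>& args) {
                       return std::all_of(args.begin(), args.end(),
                                          IsLegalBlockArg);
                     });
}

int main() {
  assert(IsLegalFunc({{Kind::MemRef, Kind::Scalar}}));
  assert(!IsLegalFunc({{Kind::MemRef}, {Kind::Tensor}}));
  return 0;
}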
#ifndef TENSORFLOW_COMPILER_MLIR_XLA_TRANSFORMS_BUFFER_ASSIGNMENT_H_ #define TENSORFLOW_COMPILER_MLIR_XLA_TRANSFORMS_BUFFER_ASSIGNMENT_H_ -#include "mlir/Analysis/Dominance.h" #include "mlir/Analysis/Liveness.h" -#include "mlir/IR/Builders.h" // TF:llvm-project +#include "mlir/IR/Builders.h" // TF:llvm-project +#include "mlir/IR/Dominance.h" #include "mlir/IR/Operation.h" // TF:llvm-project #include "mlir/Support/LLVM.h" #include "mlir/Transforms/DialectConversion.h" // TF:llvm-project diff --git a/tensorflow/compiler/mlir/xla/transforms/buffer_assignment_test.cc b/tensorflow/compiler/mlir/xla/transforms/buffer_assignment_test.cc new file mode 100644 index 00000000000..5a0d791079c --- /dev/null +++ b/tensorflow/compiler/mlir/xla/transforms/buffer_assignment_test.cc @@ -0,0 +1,170 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file implements logic for testing buffer assignment including its +// utility converters. + +#include "tensorflow/compiler/mlir/xla/transforms/buffer_assignment.h" + +#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project +#include "mlir/IR/Function.h" // TF:llvm-project +#include "mlir/IR/Operation.h" // TF:llvm-project +#include "mlir/Pass/Pass.h" // TF:llvm-project +#include "mlir/Pass/PassManager.h" // TF:llvm-project +#include "absl/memory/memory.h" +#include "tensorflow/compiler/mlir/xla/transforms/passes.h" + +namespace mlir { +namespace xla { +namespace { +/// This pass tests two provided operation converters, +/// FunctionAndBlockSignatureConverter and NonVoidToVoidReturnOpConverter, for +/// Buffer Assignment. +struct BufferAssignmentPreparationTestPass + : mlir::PassWrapper { + /// This dialect independent unary operation has been defined only for testing + /// buffer assignment. + class BufferAssignmentTestUnaryOp + : public Op { + public: + using Op::Op; + static StringRef getOperationName() { + return "buffer_assignment_test.unary"; + } + static void build(OpBuilder& b, OperationState& state, Value source) { + state.addOperands(source); + } + }; + + /// This dialect independent lowered unary operation has been defined only for + /// testing buffer assignment. 
+ class BufferAssignmentTestUnaryLoweredOp + : public Op::Impl> { + public: + using Op::Op; + static StringRef getOperationName() { + return "buffer_assignment_test.unary_lowered"; + } + static void build(OpBuilder& b, OperationState& state, Value source, + Value target) { + state.addOperands(source); + state.addOperands(target); + } + }; + + /// This dialect independent copy operation has been defined only for testing + /// NonVoidToVoidReturnOpConverter + class BufferAssignmentTestCopyOp + : public Op::Impl> { + public: + using Op::Op; + static StringRef getOperationName() { + return "buffer_assignment_test.copy"; + } + static void build(OpBuilder& b, OperationState& state, Value from, + Value to) { + state.addOperands(from); + state.addOperands(to); + } + }; + + /// A simple converter that legalizes a BufferAssignmentTestUnaryOp to a + /// BufferAssignmentTestUnaryLoweredOp and creates buffer allocation for + /// the result of the computation. + class TestUnaryOpConverter : public BufferAssignmentOpConversionPattern< + BufferAssignmentTestUnaryOp> { + public: + using BufferAssignmentOpConversionPattern< + BufferAssignmentTestUnaryOp>::BufferAssignmentOpConversionPattern; + + // Performs the actual legalization conversion step. + LogicalResult matchAndRewrite( + BufferAssignmentTestUnaryOp op, ArrayRef operands, + ConversionPatternRewriter& rewriter) const final { + // Create a new buffer allocation using the current BufferAssignmentPlacer + // instance. + auto result = op.getResult(); + auto result_type = result.getType().dyn_cast(); + auto memref_type = + MemRefType::get(result_type.getShape(), result_type.getElementType()); + rewriter.restoreInsertionPoint( + bufferAssignment->computeAllocPosition(result)); + auto alloc = rewriter.create(op.getLoc(), memref_type); + + // Create the lowered operation and replace the old operation with a + // reference to the allocated buffer. + rewriter.create(op.getLoc(), + operands[0], alloc); + rewriter.replaceOp(op, {alloc}); + return success(); + } + }; + + void runOnFunction() override { + OwningRewritePatternList patterns; + auto funcOp = getOperation(); + auto context = funcOp.getContext(); + ConversionTarget target(*context); + BufferAssignmentPlacer bufferAssignmentPlacer(funcOp); + + // Specifying the legal and illegal operations. + context->allowUnregisteredDialects(true); + target.addIllegalOp(); + target.addLegalOp(); + target.addLegalOp(); + target.addLegalOp(); + target.addLegalOp(); + // TODO(dfki): ReturnOp can also be changed to TestReturnOp like + // BufferAssignmentTestCopyOp. + target.addDynamicallyLegalOp( + [](ReturnOp returnOp) { return returnOp.getNumOperands() == 0; }); + FunctionAndBlockSignatureConverter::addDynamicallyLegalFuncOp(target); + + // Adding patterns for testing this pass. + // clang-format off + patterns.insert< + FunctionAndBlockSignatureConverter, + TestUnaryOpConverter, + NonVoidToVoidReturnOpConverter + + >(context, &bufferAssignmentPlacer); + // clang-format on + + if (failed(applyPartialConversion(funcOp, target, patterns, nullptr))) { + funcOp.emitOpError() + << "Failed to apply buffer assignment preparation steps"; + } + }; +}; +} // namespace + +/// This pass tests helper methods such as computeAllocPosition, +/// FunctionAndBlockSignatureConverter, NonVoidToVoidReturnOpConverter +/// conversion patterns. Furthermore, it checks buffer-assignment pass that +/// moves existing Alloc and Dealloc operations to their proper positions, and +/// insert missing Dealloc operations. 
+static PassPipelineRegistration<> buffer_assignment_test_pass( + "test-buffer-assignment", + "Tests buffer assignment helper methods and buffer assignment pass.", + [](mlir::OpPassManager& pm) { + pm.addPass(absl::make_unique()); + pm.addPass(createBufferAssignmentPass()); + }); + +} // namespace xla +} // namespace mlir diff --git a/tensorflow/compiler/mlir/xla/transforms/canonicalize.td b/tensorflow/compiler/mlir/xla/transforms/canonicalize.td index 65f81aae9f2..b788cb80380 100644 --- a/tensorflow/compiler/mlir/xla/transforms/canonicalize.td +++ b/tensorflow/compiler/mlir/xla/transforms/canonicalize.td @@ -19,25 +19,6 @@ include "mlir/IR/OpBase.td" include "tensorflow/compiler/mlir/xla/ir/hlo_ops.td" include "tensorflow/compiler/mlir/xla/ir/hlo_utils.td" -//===----------------------------------------------------------------------===// -// DynamicSlice op patterns. -//===----------------------------------------------------------------------===// - -def BuildSliceLimits : NativeCodeCall< - "BuildSliceLimits($0.cast()," - "$1.cast(), &$_builder)">; - -def BuildSliceStrides : NativeCodeCall< - "GetI64ElementsAttr(SmallVector(" - "$0.getType().cast().getRank(), 1), &$_builder)">; - -def DynamicSliceToSlice: Pat<(HLO_DynamicSliceOp HLO_Tensor:$input, - (HLO_ConstOp I64ElementsAttr:$starting_indices), - I64ElementsAttr:$slice_sizes), - (HLO_SliceOp $input, (CastIntElementsAttr $starting_indices), - (BuildSliceLimits $starting_indices, $slice_sizes), - (BuildSliceStrides $input))>; - def UnaryToBinaryEinsumEq : NativeCodeCall< "$_builder.getStringAttr(\",\" + $0.getValue().str())">; diff --git a/tensorflow/compiler/mlir/xla/transforms/chlo_legalize_to_hlo.cc b/tensorflow/compiler/mlir/xla/transforms/chlo_legalize_to_hlo.cc new file mode 100644 index 00000000000..e5a79616d5b --- /dev/null +++ b/tensorflow/compiler/mlir/xla/transforms/chlo_legalize_to_hlo.cc @@ -0,0 +1,228 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "mlir/Dialect/Shape/IR/Shape.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OperationSupport.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project +#include "tensorflow/compiler/mlir/xla/ir/broadcast_utils.h" +#include "tensorflow/compiler/mlir/xla/ir/chlo_ops.h" +#include "tensorflow/compiler/mlir/xla/ir/hlo_ops.h" +#include "tensorflow/compiler/mlir/xla/transforms/rewriters.h" + +namespace mlir { +namespace xla_chlo { + +namespace { + +// Converts binary ops that statically are determined to not broadcast directly +// to the corresponding xla_hlo non-broadcasting op. 
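The ConvertTrivialNonBroadcastBinaryOp pattern defined next only fires when broadcasting is provably unnecessary: both operands are ranked, have the same rank, are fully static, and agree on every extent. A small self-contained sketch of that shape test, using -1 for a dynamic extent (a convention assumed here purely for illustration):

#include <cassert>
#include <cstdint>
#include <vector>

// Returns true only when the two shapes are statically known to be identical,
// i.e. no broadcasting (and no further dynamic-shape analysis) is needed.
// A dynamic extent is represented as -1 in this sketch.
bool StaticallyNonBroadcasting(const std::vector<int64_t>& lhs,
                               const std::vector<int64_t>& rhs) {
  if (lhs.size() != rhs.size()) return false;  // Rank broadcast required.
  for (size_t i = 0; i < lhs.size(); ++i) {
    if (lhs[i] < 0 || rhs[i] < 0) return false;  // Dynamic dim: needs analysis.
    if (lhs[i] != rhs[i]) return false;          // Extent mismatch.
  }
  return true;
}

int main() {
  assert(StaticallyNonBroadcasting({4, 3}, {4, 3}));
  assert(!StaticallyNonBroadcasting({4, 3}, {1, 3}));   // Degenerate broadcast.
  assert(!StaticallyNonBroadcasting({4, -1}, {4, 3}));  // Dynamic extent.
  return 0;
}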
+template +struct ConvertTrivialNonBroadcastBinaryOp : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + LogicalResult matchAndRewrite(ChloOpTy op, + PatternRewriter &rewriter) const override { + // Only rewrite for statically determinable non-broadcasting cases. + auto lhs_type = op.lhs().getType().template dyn_cast(); + auto rhs_type = op.rhs().getType().template dyn_cast(); + if (!lhs_type || !rhs_type) return failure(); + + // Requires rank broadcast. + if (lhs_type.getRank() != rhs_type.getRank()) return failure(); + // Any dynamic dimension may require broadcasting and requires more + // analysis. + if (!lhs_type.hasStaticShape() || !rhs_type.hasStaticShape()) + return failure(); + + for (auto extents : llvm::zip(lhs_type.getShape(), rhs_type.getShape())) { + auto lhs_extent = std::get<0>(extents); + auto rhs_extent = std::get<1>(extents); + if (lhs_extent != rhs_extent) { + return failure(); + } + } + + rewriter.replaceOp(op, {Adaptor::CreateOp(op, op.getResult().getType(), + op.lhs(), op.rhs(), rewriter)}); + return success(); + } +}; + +// Converts a binary op with ranked broadcasting operands to explicitly +// broadcast and invoke the corresponding xla_hlo non-broadcasting op. +// Note that dynamic broadcasting supported by this pattern is only valid for +// "numpy" broadcasting semantics as defined here: +// https://docs.scipy.org/doc/numpy/reference/ufuncs.html +// Specifically, this includes the following cases: +// - Same rank broadcast (operands have the same static rank). +// - Different-rank broadcast, either without a broadcast_dims attribte or +// with the broadcast_dims attribute set to map to a prefix padding. +// - Legal combinations of degenerate (1-dim) implicit broadcasting. +// The restriction on broadcast_dims derives from the definition of the +// `shape.broadcast` op, which only supports prefix-padding. +// +// It may be possible to expand this pattern to operate on unranked tensors in +// the future by emitting more code to dynamically differentiate based on rank. +// Whether that is of any practical benefit remains to be seen. +template +struct ConvertRankedDynamicBroadcastBinaryOp + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + LogicalResult matchAndRewrite(ChloOpTy op, + PatternRewriter &rewriter) const override { + // Only support ranked operands. + Value lhs = op.lhs(); + Value rhs = op.rhs(); + auto lhs_type = lhs.getType().dyn_cast(); + auto rhs_type = rhs.getType().dyn_cast(); + auto result_type = + op.getResult().getType().template dyn_cast(); + if (!lhs_type || !rhs_type || !result_type) return failure(); + + // Check for "numpy"-style rank broadcast. + auto broadcast_dimensions = op.broadcast_dimensions(); + if (broadcast_dimensions && + !xla::IsLegalNumpyRankedBroadcast(lhs, rhs, *broadcast_dimensions)) { + // Note: It is unclear whether the general specification of explicit + // broadcast_dimensions on binary ops is a feature we want to carry + // forward. While it can technically be implemented for ranked-dynamic, + // it is incompatible with unranked inputs. If this warning is emitted + // in real programs, it is an indication that the feature should be + // implemented versus just falling back on the more standard definition + // of numpy-like prefix-padding. + op.emitWarning() << "unsupported non prefix-padded dynamic rank " + << "broadcast_dimensions = " << *broadcast_dimensions; + return failure(); + } + + // Compute result shape. 
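As a reading aid for the extent computation that follows: under numpy-style semantics the result rank is the larger operand rank, shapes align on trailing dimensions, and the lower-ranked operand broadcasts along a prefix of the result dimensions, so its broadcast_dimensions are simply [result_rank - operand_rank, result_rank). A minimal standalone sketch of that arithmetic, with -1 standing for a dynamic extent and all names invented for illustration:

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <numeric>
#include <utility>
#include <vector>

// Computes the numpy-style broadcasted result shape of two ranked shapes,
// aligning them on trailing dimensions. -1 stands for a dynamic extent.
std::vector<int64_t> BroadcastResultShape(std::vector<int64_t> lhs,
                                          std::vector<int64_t> rhs) {
  if (lhs.size() < rhs.size()) std::swap(lhs, rhs);
  std::vector<int64_t> result = lhs;
  size_t offset = lhs.size() - rhs.size();
  for (size_t i = 0; i < rhs.size(); ++i) {
    int64_t a = lhs[offset + i], b = rhs[i];
    if (a == -1 || b == -1)
      result[offset + i] = -1;              // Unknown until runtime.
    else
      result[offset + i] = std::max(a, b);  // One of them may be 1.
  }
  return result;
}

// The broadcast_dimensions for an operand of rank `operand_rank` under
// prefix-padding: the contiguous range [result_rank - operand_rank, result_rank).
std::vector<int64_t> PrefixPaddedBroadcastDims(int64_t operand_rank,
                                               int64_t result_rank) {
  std::vector<int64_t> dims(operand_rank);
  std::iota(dims.begin(), dims.end(), result_rank - operand_rank);
  return dims;
}

int main() {
  assert((BroadcastResultShape({4, 3, 256}, {3, 256}) ==
          std::vector<int64_t>{4, 3, 256}));
  assert((BroadcastResultShape({-1, 3}, {1, 3}) == std::vector<int64_t>{-1, 3}));
  assert((PrefixPaddedBroadcastDims(2, 3) == std::vector<int64_t>{1, 2}));
  return 0;
}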
+ auto loc = op.getLoc(); + int64_t result_rank = std::max(lhs_type.getRank(), rhs_type.getRank()); + Value result_extents = + xla::ComputeBinaryElementwiseBroadcastingResultExtents(loc, lhs, rhs, + rewriter); + + // Note that we unconditionally emit DynamicBroadcastInDim ops and let + // downstream canonicalizations fold them away if possible. This is + // because, in the dynamic case, there are many corner cases regarding + // when it is safe to omit, and some of them require analysis to prove + // properly. + auto lhs_broadcast_dimensions = llvm::to_vector<4>( + llvm::seq(result_rank - lhs_type.getRank(), result_rank)); + Value broadcasted_lhs = rewriter.create( + loc, + RankedTensorType::get(result_type.getShape(), + lhs_type.getElementType()), + lhs, result_extents, + rewriter.getI64TensorAttr(lhs_broadcast_dimensions)); + auto rhs_broadcast_dimensions = llvm::to_vector<4>( + llvm::seq(result_rank - rhs_type.getRank(), result_rank)); + Value broadcasted_rhs = rewriter.create( + loc, + RankedTensorType::get(result_type.getShape(), + rhs_type.getElementType()), + rhs, result_extents, + rewriter.getI64TensorAttr(rhs_broadcast_dimensions)); + + // And generate the final non-broadcasted binary op. + rewriter.replaceOp(op, {Adaptor::CreateOp(op, result_type, broadcasted_lhs, + broadcasted_rhs, rewriter)}); + return success(); + } +}; + +template +void PopulateForBinaryOp(MLIRContext *context, + OwningRewritePatternList *patterns) { + patterns + ->insert>( + context, 10); + patterns->insert< + ConvertRankedDynamicBroadcastBinaryOp>( + context, 5); +} + +template +struct HloBinaryElementwiseAdaptor { + static ToOpTy CreateOp(FromOpTy from_op, Type result_type, + Value broadcasted_lhs, Value broadcasted_rhs, + OpBuilder &builder) { + return builder.create(from_op.getLoc(), result_type, + broadcasted_lhs, broadcasted_rhs); + } +}; + +struct HloComplexAdaptor { + static xla_hlo::ComplexOp CreateOp(BroadcastComplexOp from_op, + Type result_type, Value broadcasted_lhs, + Value broadcasted_rhs, + OpBuilder &builder) { + return builder.create(from_op.getLoc(), result_type, + broadcasted_lhs, broadcasted_rhs); + } +}; + +struct HloCompareAdaptor { + static xla_hlo::CompareOp CreateOp(BroadcastCompareOp from_op, + Type result_type, Value broadcasted_lhs, + Value broadcasted_rhs, + OpBuilder &builder) { + return builder.create(from_op.getLoc(), result_type, + broadcasted_lhs, broadcasted_rhs, + from_op.comparison_direction()); + } +}; + +} // namespace + +void PopulateLegalizeChloToHloPatterns(MLIRContext *context, + OwningRewritePatternList *patterns) { + // Instantiate conversion templates for conforming binary elementwise ops + // that do not have different dtypes between operands and results and do + // not have special attributes that need to be preserved. 
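The registration code below leans on a small adaptor idiom: one generic broadcast-lowering pattern is instantiated per op pair, and a per-op adaptor with a static CreateOp hook supplies the only part that differs (ComplexOp and CompareOp need their own because their builders take extra arguments). A stripped-down, self-contained sketch of that idiom using invented toy types, not the MLIR API:

#include <iostream>
#include <string>

// Toy "ops": each adaptor knows how to build the target op from two operands.
struct AddAdaptor {
  static std::string CreateOp(const std::string& lhs, const std::string& rhs) {
    return "add(" + lhs + ", " + rhs + ")";
  }
};

struct CompareAdaptor {
  // Compare carries extra state (the comparison direction), so it gets its
  // own adaptor instead of the shared one.
  static std::string CreateOp(const std::string& lhs, const std::string& rhs) {
    return "compare<GT>(" + lhs + ", " + rhs + ")";
  }
};

// One generic driver, parameterized by the adaptor; this is the analogue of
// instantiating the broadcast-lowering pattern once per (chlo, hlo) op pair.
template <typename Adaptor>
void EmitBroadcastedBinaryOp(const std::string& lhs, const std::string& rhs) {
  std::cout << Adaptor::CreateOp("broadcast(" + lhs + ")",
                                 "broadcast(" + rhs + ")")
            << "\n";
}

int main() {
  EmitBroadcastedBinaryOp<AddAdaptor>("%a", "%b");
  EmitBroadcastedBinaryOp<CompareAdaptor>("%a", "%b");
  return 0;
}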
+#define POPULATE_BCAST(ChloOp, HloOp) \ + PopulateForBinaryOp>(context, \ + patterns); + + POPULATE_BCAST(BroadcastAddOp, xla_hlo::AddOp); + POPULATE_BCAST(BroadcastAndOp, xla_hlo::AndOp); + POPULATE_BCAST(BroadcastAtan2Op, xla_hlo::Atan2Op); + POPULATE_BCAST(BroadcastDivOp, xla_hlo::DivOp); + POPULATE_BCAST(BroadcastMaxOp, xla_hlo::MaxOp); + POPULATE_BCAST(BroadcastMinOp, xla_hlo::MinOp); + POPULATE_BCAST(BroadcastMulOp, xla_hlo::MulOp); + POPULATE_BCAST(BroadcastOrOp, xla_hlo::OrOp); + POPULATE_BCAST(BroadcastPowOp, xla_hlo::PowOp); + POPULATE_BCAST(BroadcastRemOp, xla_hlo::RemOp); + POPULATE_BCAST(BroadcastShiftLeftOp, xla_hlo::ShiftLeftOp); + POPULATE_BCAST(BroadcastShiftRightArithmeticOp, + xla_hlo::ShiftRightArithmeticOp); + POPULATE_BCAST(BroadcastShiftRightLogicalOp, xla_hlo::ShiftRightLogicalOp); + POPULATE_BCAST(BroadcastSubOp, xla_hlo::SubOp); + POPULATE_BCAST(BroadcastXorOp, xla_hlo::XorOp); + + // Broadcasting ops requiring special construction. + PopulateForBinaryOp(context, patterns); + PopulateForBinaryOp(context, patterns); +} + +} // namespace xla_chlo +} // namespace mlir diff --git a/tensorflow/compiler/mlir/xla/transforms/chlo_legalize_to_hlo_pass.cc b/tensorflow/compiler/mlir/xla/transforms/chlo_legalize_to_hlo_pass.cc new file mode 100644 index 00000000000..a4d0918bfb1 --- /dev/null +++ b/tensorflow/compiler/mlir/xla/transforms/chlo_legalize_to_hlo_pass.cc @@ -0,0 +1,57 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "mlir/Dialect/Shape/IR/Shape.h" // from @llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "tensorflow/compiler/mlir/xla/ir/chlo_ops.h" +#include "tensorflow/compiler/mlir/xla/ir/hlo_ops.h" +#include "tensorflow/compiler/mlir/xla/transforms/rewriters.h" + +namespace mlir { +namespace xla_chlo { + +namespace { + +struct TestChloLegalizeToHloPass + : public PassWrapper { + void runOnFunction() override { + ConversionTarget conversionTarget(getContext()); + OwningRewritePatternList conversionPatterns; + + conversionTarget.addIllegalDialect(); + // Consider the xla_hlo dialect legal for tests. + conversionTarget.addLegalDialect(); + // The conversion uses helpers from the Standard dialect. 
+ conversionTarget.addLegalDialect(); + conversionTarget.addLegalDialect(); + + PopulateLegalizeChloToHloPatterns(&getContext(), &conversionPatterns); + + if (failed(applyPartialConversion(getFunction(), conversionTarget, + conversionPatterns))) { + return signalPassFailure(); + } + } +}; + +} // namespace + +} // namespace xla_chlo +} // namespace mlir + +static mlir::PassRegistration pass( + "test-xla-chlo-legalize-to-hlo", + "Test pass for applying chlo -> hlo legalization patterns"); diff --git a/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc b/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc index d3fb832d542..11b2ae65d8e 100644 --- a/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc +++ b/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc @@ -27,6 +27,7 @@ limitations under the License. #include "mlir/IR/PatternMatch.h" // from @llvm-project #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Transforms/BufferPlacement.h" // from @llvm-project #include "mlir/Transforms/DialectConversion.h" // from @llvm-project #include "tensorflow/compiler/mlir/xla/ir/hlo_ops.h" #include "tensorflow/compiler/mlir/xla/ir/lhlo_ops.h" @@ -39,16 +40,11 @@ namespace xla_hlo { namespace { constexpr StringRef kTempBufferAttr = "temp"; - -/// Returns DeallocOp to ensure that CopyOp is not inserted after dealloc. -Operation* FindInsertionPointForCopy(Value value) { - for (const auto& user : value.getUsers()) { - if (auto dealloc = dyn_cast(user)) { - return user; - } - } - return nullptr; -} +template +using BaseOpConversion = BufferAssignmentOpConversionPattern; +using StdReturnOpConverter = + NonVoidToVoidReturnOpConverter; Value InsertDynamicAllocAndDealloc(Location loc, Value result, Value shape_operand, @@ -92,8 +88,9 @@ Value InsertDynamicAllocAndDealloc(Location loc, Value result, return alloc; } -Value InsertAllocAndDealloc(Location loc, Value result, - ConversionPatternRewriter* rewriter) { +Value InsertAlloc(Location loc, OpResult result, + BufferAssignmentPlacer* bufferAssignment, + ConversionPatternRewriter* rewriter) { auto result_type = result.getType().dyn_cast(); if (!result_type || !result_type.hasStaticShape()) { result.getDefiningOp()->emitOpError() @@ -101,31 +98,21 @@ Value InsertAllocAndDealloc(Location loc, Value result, } auto memref_type = MemRefType::get(result_type.getShape(), result_type.getElementType()); - - Operation* op = result.getDefiningOp(); - auto block = op->getBlock(); - - OpBuilder allocBuilder(op); - allocBuilder.setInsertionPointToStart(block); // Inserting at the beginning - auto alloc = allocBuilder.create(loc, memref_type); - - alloc.setAttr(kTempBufferAttr, rewriter->getBoolAttr(true)); - - allocBuilder.setInsertionPoint(block, std::prev(block->end())); - allocBuilder.create(loc, alloc); - + OpBuilder::InsertionGuard guard(*rewriter); + rewriter->restoreInsertionPoint( + bufferAssignment->computeAllocPosition(result)); + auto alloc = rewriter->create(loc, memref_type); return alloc; } template -class HloToLhloOpConverter : public ConversionPattern { +class HloToLhloOpConverter : public BaseOpConversion { public: - explicit HloToLhloOpConverter(MLIRContext* context) - : ConversionPattern(HloOpTy::getOperationName(), 1, context) {} - + using BaseOpConversion::BaseOpConversion; LogicalResult matchAndRewrite( - Operation* op, ArrayRef operands, + HloOpTy hloOp, ArrayRef operands, ConversionPatternRewriter& rewriter) const final { + 
Operation* op = hloOp.getOperation(); const auto& original_results = op->getResults(); SmallVector buffer_args(operands.begin(), operands.end()); for (auto result : llvm::enumerate(original_results)) { @@ -135,8 +122,8 @@ class HloToLhloOpConverter : public ConversionPattern { return failure(); } if (resultType.hasStaticShape()) { - buffer_args.push_back( - InsertAllocAndDealloc(op->getLoc(), result.value(), &rewriter)); + buffer_args.push_back(InsertAlloc(op->getLoc(), result.value(), + this->bufferAssignment, &rewriter)); } else { SmallVector results_shape; auto shape_type_op = dyn_cast(op); @@ -156,9 +143,9 @@ class HloToLhloOpConverter : public ConversionPattern { }; struct HloToLhloDynamicBroadcastInDimOpConverter - : public OpConversionPattern { + : public BaseOpConversion { public: - using OpConversionPattern::OpConversionPattern; + using BaseOpConversion::BaseOpConversion; LogicalResult matchAndRewrite( xla_hlo::DynamicBroadcastInDimOp op, ArrayRef operands, @@ -175,10 +162,9 @@ struct HloToLhloDynamicBroadcastInDimOpConverter } }; -struct HloToLhloReduceOpConverter - : public OpConversionPattern { +struct HloToLhloReduceOpConverter : public BaseOpConversion { public: - using OpConversionPattern::OpConversionPattern; + using BaseOpConversion::BaseOpConversion; LogicalResult matchAndRewrite( xla_hlo::ReduceOp op, ArrayRef operands, @@ -194,7 +180,8 @@ struct HloToLhloReduceOpConverter const auto& original_results = op.getResults(); SmallVector buffer_args(operands.begin(), operands.end()); for (auto result : original_results) { - buffer_args.push_back(InsertAllocAndDealloc(loc, result, &rewriter)); + buffer_args.push_back( + InsertAlloc(loc, result, this->bufferAssignment, &rewriter)); } auto new_op = rewriter.create( loc, llvm::None, buffer_args, op.getAttrs()); @@ -230,12 +217,12 @@ struct HloToLhloReduceOpConverter } }; -class HloToLhloTensorLoadOpConverter : public ConversionPattern { +class HloToLhloTensorLoadOpConverter + : public BaseOpConversion { public: - explicit HloToLhloTensorLoadOpConverter(MLIRContext* context) - : ConversionPattern(TensorLoadOp::getOperationName(), 1, context) {} + using BaseOpConversion::BaseOpConversion; LogicalResult matchAndRewrite( - Operation* op, ArrayRef operands, + mlir::TensorLoadOp op, ArrayRef operands, ConversionPatternRewriter& rewriter) const final { rewriter.replaceOp(op, operands); return success(); @@ -243,13 +230,13 @@ class HloToLhloTensorLoadOpConverter : public ConversionPattern { }; // TODO(b/137624192): Rewrite into a copy and elide copy if possible. 
-class HloToLhloTensorStoreOpConverter : public ConversionPattern { +class HloToLhloTensorStoreOpConverter + : public BaseOpConversion { public: - explicit HloToLhloTensorStoreOpConverter(MLIRContext* context) - : ConversionPattern(TensorStoreOp::getOperationName(), 1, context) {} + using BaseOpConversion::BaseOpConversion; LogicalResult matchAndRewrite( - Operation* op, ArrayRef operands, + mlir::TensorStoreOp op, ArrayRef operands, ConversionPatternRewriter& rewriter) const final { rewriter.replaceOpWithNewOp( op, llvm::None, operands.front(), operands.back()); @@ -291,7 +278,6 @@ class HloToLhloTensorStoreOpConverter : public ConversionPattern { // (memref<2x2xf32>, memref<2x2xf32>, memref<2x2xf32>) -> () // "xla_lhlo.multiply"(%0, %arg0, %arg3) : // (memref<2x2xf32>, memref<2x2xf32>, memref<2x2xf32>) -> () -// dealloc %0 : memref<2x2xf32> // "xla_lhlo.terminator"() : () -> () // }) : () -> () // return @@ -313,14 +299,13 @@ class HloToLhloTensorStoreOpConverter : public ConversionPattern { // %arg1: memref<4xf32>, // %arg2: memref<4xf32>) { // %0 = alloc() : memref<4xf32> -// %1 = alloc() : memref<4xf32> + // "xla_lhlo.maximum"(%arg0, %arg1, %0) : // (memref<4xf32>, memref<4xf32>, memref<4xf32>) -> () +// %1 = alloc() : memref<4xf32> // "xla_lhlo.add"(%arg0, %0, %1) : // (memref<4xf32>, memref<4xf32>, memref<4xf32>) -> () // "xla_lhlo.copy"(%1, %arg2) : (memref<4xf32>, memref<4xf32>) -> () -// dealloc %0 : memref<4xf32> -// dealloc %1 : memref<4xf32> // "xla_lhlo.terminator"() : () -> () // } @@ -346,119 +331,47 @@ struct HloLegalizeToLhlo }); auto module = getOperation(); - populateHLOToLHLOConversionPattern(module.getContext(), &patterns); - - // Do partial conversion so we can have unknown ops in tests. - if (failed(applyPartialConversion(module, target, patterns, nullptr))) { - signalPassFailure(); - } + BufferAssignmentTypeConverter converter; + module.walk([&](FuncOp func) { + BufferAssignmentPlacer bufferAssignment(func); + OwningRewritePatternList patterns; + populateHLOToLHLOConversionPattern(func.getContext(), &bufferAssignment, + &converter, &patterns); + return WalkResult( + applyPartialConversion(func, target, patterns, &converter)); + }); } }; - -Type ConvertType(Type t) { - if (auto tensorType = t.dyn_cast()) { - return MemRefType::get(tensorType.getShape(), tensorType.getElementType()); - } - return t; -} - } // namespace -/// Transforms FuncOp arguments and results from tensors to buffers. Tensor -/// results are converted to memrefs and appended to the argument list. 
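Both the removed HloToLhloFuncOpConverter below and the FunctionAndBlockSignatureConverter that replaces it aim at the same signature shape visible in the example IR above: tensor operands become memrefs and tensor results become additional memref arguments, so the function ends up returning nothing. A standalone sketch of that type rewrite over plain strings; the struct and helper names are invented for illustration.

#include <iostream>
#include <string>
#include <vector>

// A toy function type: tensor inputs and tensor results, printed as strings.
struct FuncType {
  std::vector<std::string> inputs;
  std::vector<std::string> results;
};

// Maps "tensor<...>" to "memref<...>"; other types are left untouched.
std::string TensorToMemref(const std::string& type) {
  if (type.rfind("tensor<", 0) == 0) return "memref<" + type.substr(7);
  return type;
}

// Rewrites (tensor inputs) -> (tensor results) into
// (memref inputs ++ memref result buffers) -> (): callers now pass output
// buffers instead of receiving values.
FuncType ConvertToBufferSignature(const FuncType& f) {
  FuncType converted;
  for (const auto& t : f.inputs) converted.inputs.push_back(TensorToMemref(t));
  for (const auto& t : f.results) converted.inputs.push_back(TensorToMemref(t));
  return converted;  // results stay empty: the function now returns nothing.
}

int main() {
  FuncType f{{"tensor<2x2xf32>", "tensor<2x2xf32>"}, {"tensor<2x2xf32>"}};
  FuncType g = ConvertToBufferSignature(f);
  for (const auto& t : g.inputs) std::cout << t << " ";
  std::cout << "-> ()\n";  // Prints three memref<2x2xf32> arguments.
  return 0;
}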
-class HloToLhloFuncOpConverter : public OpConversionPattern { - public: - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite( - FuncOp funcOp, ArrayRef operands, - ConversionPatternRewriter& rewriter) const final { - if (funcOp.getBody().getBlocks().size() > 1) { - funcOp.emitOpError() << "tensor to buffer conversion expects a single " - "block in the region containing the operation"; - return failure(); - } - - auto funcType = funcOp.getType(); - - TypeConverter::SignatureConversion conversion(funcType.getNumInputs()); - for (auto argType : llvm::enumerate(funcType.getInputs())) { - conversion.addInputs(argType.index(), ConvertType(argType.value())); - } - for (auto resType : funcType.getResults()) { - conversion.addInputs(ConvertType(resType)); - } - rewriter.updateRootInPlace(funcOp, [&] { - funcOp.setType( - rewriter.getFunctionType(conversion.getConvertedTypes(), llvm::None)); - rewriter.applySignatureConversion(&funcOp.getBody(), conversion); - }); - return success(); - } -}; - -/// Transforms ReturnOp to LhloTerminator. CopyOp is inserted to copy each -/// result to the corresponding buffer argument. -class StdToLhloReturnOpConverter : public OpConversionPattern { - public: - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite( - mlir::ReturnOp returnOp, ArrayRef operands, - ConversionPatternRewriter& rewriter) const final { - auto numReturnValues = returnOp.getNumOperands(); - auto funcOp = returnOp.getParentOfType(); - auto numFuncArgs = funcOp.getNumArguments(); - auto loc = returnOp.getLoc(); - - for (auto operand : llvm::enumerate(operands)) { - auto returnArgNumber = numFuncArgs - numReturnValues + operand.index(); - auto dstBuffer = funcOp.getArgument(returnArgNumber); - if (dstBuffer == operand.value()) { - continue; - } - - auto dealloc = FindInsertionPointForCopy(operand.value()); - - if (dealloc == nullptr) { - returnOp.emitOpError() - << "Missing dealloc for operand " << operand.index(); - return failure(); - } - OpBuilder::InsertionGuard guard(rewriter); - rewriter.setInsertionPoint(dealloc); - rewriter.create(loc, llvm::None, operand.value(), - funcOp.getArgument(returnArgNumber)); - } - rewriter.replaceOpWithNewOp(returnOp); - return success(); - } -}; - -void populateHLOToLHLOConversionPattern(MLIRContext* context, - OwningRewritePatternList* patterns) { +void populateHLOToLHLOConversionPattern( + MLIRContext* context, BufferAssignmentPlacer* bufferAssignment, + TypeConverter* converter, OwningRewritePatternList* patterns) { // clang-format off patterns->insert< HloToLhloDynamicBroadcastInDimOpConverter, - HloToLhloFuncOpConverter, HloToLhloOpConverter, HloToLhloOpConverter, HloToLhloOpConverter, HloToLhloOpConverter, HloToLhloOpConverter, HloToLhloOpConverter, + HloToLhloOpConverter, HloToLhloOpConverter, HloToLhloOpConverter, HloToLhloOpConverter, HloToLhloOpConverter, HloToLhloOpConverter, + HloToLhloOpConverter, HloToLhloOpConverter, + HloToLhloOpConverter, HloToLhloOpConverter, HloToLhloOpConverter, HloToLhloOpConverter, HloToLhloOpConverter, HloToLhloOpConverter, HloToLhloOpConverter, + HloToLhloOpConverter, HloToLhloOpConverter, HloToLhloOpConverter, HloToLhloOpConverter, @@ -469,8 +382,9 @@ void populateHLOToLHLOConversionPattern(MLIRContext* context, HloToLhloReduceOpConverter, HloToLhloTensorLoadOpConverter, HloToLhloTensorStoreOpConverter, - StdToLhloReturnOpConverter - >(context); + FunctionAndBlockSignatureConverter, + StdReturnOpConverter + >(context, bufferAssignment, converter); // 
clang-format on } diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_control_flow.cc b/tensorflow/compiler/mlir/xla/transforms/legalize_control_flow.cc index 129a24600a2..bb1169a57d6 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_control_flow.cc +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_control_flow.cc @@ -61,47 +61,46 @@ LogicalResult ReplaceTerminators(Region* region, Block* target_block, return success(); } -LogicalResult LowerConditionalOp(mlir::xla_hlo::ConditionalOp conditional_op) { - Operation* op_inst = conditional_op.getOperation(); - mlir::OpBuilder builder(conditional_op); +LogicalResult LowerIfOp(mlir::xla_hlo::IfOp if_op) { + Operation* op_inst = if_op.getOperation(); + mlir::OpBuilder builder(if_op); auto orig_block = op_inst->getBlock(); auto* tail_block = orig_block->splitBlock(op_inst); - auto loc = conditional_op.getLoc(); + auto loc = if_op.getLoc(); // Duplicate the true and false regions in the block between the sections // before and after the conditional. BlockAndValueMapping mapper; - conditional_op.true_branch().cloneInto(orig_block->getParent(), - Region::iterator(tail_block), mapper); - conditional_op.false_branch().cloneInto(orig_block->getParent(), - Region::iterator(tail_block), mapper); + if_op.true_branch().cloneInto(orig_block->getParent(), + Region::iterator(tail_block), mapper); + if_op.false_branch().cloneInto(orig_block->getParent(), + Region::iterator(tail_block), mapper); // Determine the blocks for the start of the true and false regions. - Block* true_block = mapper.lookup(&conditional_op.true_branch().front()); - Block* false_block = mapper.lookup(&conditional_op.false_branch().front()); + Block* true_block = mapper.lookup(&if_op.true_branch().front()); + Block* false_block = mapper.lookup(&if_op.false_branch().front()); // Perform the conditional branch into the true/false cases. builder.setInsertionPointToEnd(orig_block); // Extract the predicate for checking branching, then branch to the true and // false regions appropriately. - auto cond_value = - builder.create(loc, conditional_op.pred()); + auto cond_value = builder.create(loc, if_op.pred()); builder.create(loc, cond_value, true_block, - conditional_op.true_arg(), false_block, - conditional_op.false_arg()); + if_op.true_arg(), false_block, + if_op.false_arg()); // Replace the true case's return operations with a branch to the tail of // the condition. 
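As a loose analogy for the control-flow shape this lowering produces (and only an analogy, not MLIR code): the predicate picks one of two blocks, each block computes its value and branches to a shared tail block, and the tail block receives that value through its block argument, modelled below by a local variable.

#include <iostream>

// Structured form: result = pred ? true_branch(x) : false_branch(x).
// Lowered form: explicit blocks, a conditional branch, and a tail block whose
// "block argument" is stood in for by tail_arg.
int LoweredIf(bool pred, int x) {
  int tail_arg;  // Stand-in for the tail block's argument.
  if (pred) goto true_block;
  goto false_block;

true_block:
  tail_arg = x + 1;  // Body of the true region.
  goto tail_block;

false_block:
  tail_arg = x - 1;  // Body of the false region.
  goto tail_block;

tail_block:
  return tail_arg;  // Uses of the old if-result now read the tail argument.
}

int main() {
  std::cout << LoweredIf(true, 41) << " " << LoweredIf(false, 41) << "\n";
  return 0;
}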
- if (failed(ReplaceTerminators(&conditional_op.true_branch(), tail_block, loc, - mapper, &builder))) + if (failed(ReplaceTerminators(&if_op.true_branch(), tail_block, loc, mapper, + &builder))) return failure(); - if (failed(ReplaceTerminators(&conditional_op.false_branch(), tail_block, loc, - mapper, &builder))) + if (failed(ReplaceTerminators(&if_op.false_branch(), tail_block, loc, mapper, + &builder))) return failure(); - tail_block->addArguments(conditional_op.getResult().getType()); - conditional_op.getResult().replaceAllUsesWith(tail_block->getArgument(0)); + tail_block->addArguments(if_op.getResult().getType()); + if_op.getResult().replaceAllUsesWith(tail_block->getArgument(0)); op_inst->erase(); return success(); @@ -210,11 +209,11 @@ LogicalResult LowerWhileOp(mlir::xla_hlo::WhileOp while_op) { void LegalizeControlFlow::runOnFunction() { auto func = getFunction(); - llvm::SmallVector conditional_ops; - func.walk([&](ConditionalOp op) { conditional_ops.push_back(op); }); + llvm::SmallVector if_ops; + func.walk([&](IfOp op) { if_ops.push_back(op); }); - for (auto& op : conditional_ops) { - if (failed(LowerConditionalOp(op))) return signalPassFailure(); + for (auto& op : if_ops) { + if (failed(LowerIfOp(op))) return signalPassFailure(); } llvm::SmallVector while_ops; diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc b/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc index 50536e6a124..8675d6c8a4b 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc @@ -23,7 +23,10 @@ limitations under the License. #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Sequence.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/Support/FormatVariadic.h" +#include "mlir/Dialect/Shape/IR/Shape.h" // from @llvm-project #include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project #include "mlir/Dialect/Traits.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project @@ -41,10 +44,13 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.h" #include "tensorflow/compiler/mlir/xla/convert_op_folder.h" +#include "tensorflow/compiler/mlir/xla/ir/chlo_ops.h" #include "tensorflow/compiler/mlir/xla/ir/hlo_ops.h" #include "tensorflow/compiler/mlir/xla/ir/hlo_utils.h" #include "tensorflow/compiler/mlir/xla/transforms/passes.h" +#include "tensorflow/compiler/mlir/xla/transforms/rewriters.h" #include "tensorflow/compiler/xla/client/padding.h" +#include "tensorflow/compiler/xla/client/sharding_builder.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/framework/kernel_shape_util.h" #include "tensorflow/core/kernels/conv_grad_shape_utils.h" @@ -55,12 +61,15 @@ namespace mlir { namespace xla_hlo { namespace { +constexpr char kShardingAttr[] = "xla_hlo.sharding"; + class LegalizeTF : public PassWrapper { public: LegalizeTF() = default; LegalizeTF(const LegalizeTF &) {} - explicit LegalizeTF(bool allow_partial_conversion) { + explicit LegalizeTF(bool allow_partial_conversion, bool legalize_chlo) { allow_partial_conversion_ = allow_partial_conversion; + legalize_chlo_ = legalize_chlo; } /// Performs the lowering to XLA dialect. 
@@ -71,6 +80,11 @@ class LegalizeTF : public PassWrapper { *this, "allow-partial-conversion", llvm::cl::desc("Allow operations that can't be legalized."), llvm::cl::init(false)}; + Option legalize_chlo_{ + *this, "legalize-chlo", + llvm::cl::desc( + "Also legalizes intermediate chlo ops to hlo (default true)"), + llvm::cl::init(true)}; }; /// Returns if the given TF data format string is the default format. @@ -114,6 +128,28 @@ static DenseIntElementsAttr GetI32ElementsAttr(ArrayRef values, return DenseIntElementsAttr::get(ty, values); } +// Returns a 1-d i64 elements attribute populated with numbers from start to +// end, excluding. +static DenseIntElementsAttr GetI64ElementsAttrForSeq(int start, int end, + Builder *builder) { + int size = end - start; + + SmallVector vals; + vals.resize(size); + std::iota(vals.begin(), vals.end(), start); + + TensorType ty = RankedTensorType::get({size}, builder->getIntegerType(64)); + return DenseIntElementsAttr::get(ty, vals); +} + +// Returns a 1-d i64 elements attribute populated with `val` repeated `size` +// times. +static DenseIntElementsAttr GetI64ElementsAttrForValue(int size, int64_t val, + Builder *builder) { + TensorType ty = RankedTensorType::get({size}, builder->getIntegerType(64)); + return DenseIntElementsAttr::get(ty, val); +} + // Returns the corresponding type that should be used for performing sum // accumulation over the given input type. Type GetSumAccumulationType(Type input_type) { @@ -168,6 +204,20 @@ static ConvertOp CastValueToI64(Location loc, Value value, return rewriter->create(loc, value, rewriter->getIntegerType(64)); } +// Creates an unpack op along the 0th dimension of the tensor. The `value` input +// must be a ranked tensor. +static TF::UnpackOp UnpackTensorAlongZeroDim(Location loc, Value value, + PatternRewriter *rewriter) { + auto indices_type = value.getType().cast(); + int num_outputs = indices_type.getShape().front(); + SmallVector unpacked_indices_type( + num_outputs, RankedTensorType::get({}, indices_type.getElementType())); + auto unpacked_indices = rewriter->create( + loc, unpacked_indices_type, value, + IntegerAttr::get(rewriter->getIntegerType(64), 0)); + return unpacked_indices; +} + // Returns size of dimension at the specified index, if ranked tensor. // Otherwise, returns -1. // @@ -179,10 +229,17 @@ int64_t GetDimSize(Type ty, int64_t index) { return ranked_ty.getDimSize(index); } -template +template tensorflow::TensorShape ToTensorShape(llvm::ArrayRef sizes) { - return tensorflow::TensorShape( - llvm::SmallVector(sizes.begin(), sizes.end())); + return tensorflow::TensorShape(llvm::SmallVector( + sizes.begin(), sizes.end())); +} + +template +tensorflow::TensorShape ToTensorShape( + llvm::iterator_range> sizes) { + return tensorflow::TensorShape(llvm::SmallVector( + sizes.begin(), sizes.end())); } // Returns minimal value for the given int or float element type. @@ -224,8 +281,270 @@ static ConstOp GetMaxValueForType(Type ty, Location loc, // Returns int or float scalar DenseElementsAttr attribute with the given // element type and the value. static ConstOp GetScalarConstOfType(Type ty, Location loc, int64_t raw_value, - PatternRewriter *rewriter) { - return rewriter->create(loc, xla::GetScalarOfType(ty, raw_value)); + OpBuilder *builder) { + return builder->create(loc, xla::GetScalarOfType(ty, raw_value)); +} + +// Creates an xla_hlo::SliceOp where the major dimensions have full size, and +// the minor dimensions have the provided offsets and sizes. 
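The SliceInMinorDims helper introduced by the comment above builds a full-rank slice from minor-dimension bounds: major dimensions keep their whole extent, every stride is 1, and only the trailing dimensions take the caller's starts and limits. A standalone sketch of that index arithmetic with invented names:

#include <cassert>
#include <cstdint>
#include <vector>

struct SliceSpec {
  std::vector<int64_t> starts, limits, strides;
};

// Expands minor-dimension starts/limits into a full-rank slice: major
// dimensions are taken in full, every stride is 1.
SliceSpec SliceInMinorDimsSpec(const std::vector<int64_t>& shape,
                               const std::vector<int64_t>& minor_starts,
                               const std::vector<int64_t>& minor_limits) {
  size_t rank = shape.size();
  size_t major = rank - minor_starts.size();
  SliceSpec s;
  s.starts.assign(rank, 0);
  s.limits = shape;  // Full extent for the major dimensions.
  s.strides.assign(rank, 1);
  for (size_t i = 0; i < minor_starts.size(); ++i) {
    s.starts[major + i] = minor_starts[i];
    s.limits[major + i] = minor_limits[i];
  }
  return s;
}

int main() {
  // Slice the last two dims of an [8, 16, 16] value to [0,4) x [2,10).
  SliceSpec s = SliceInMinorDimsSpec({8, 16, 16}, {0, 2}, {4, 10});
  assert((s.starts == std::vector<int64_t>{0, 0, 2}));
  assert((s.limits == std::vector<int64_t>{8, 4, 10}));
  assert((s.strides == std::vector<int64_t>{1, 1, 1}));
  return 0;
}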
+static Value SliceInMinorDims(Location loc, Value v, + ArrayRef minor_starts, + ArrayRef minor_limits, + OpBuilder *builder) { + auto type = v.getType().cast(); + llvm::SmallVector slice_starts(type.getRank(), 0); + int64_t major_dims = type.getRank() - minor_starts.size(); + std::copy(minor_starts.begin(), minor_starts.end(), + slice_starts.begin() + major_dims); + auto slice_limits = llvm::to_vector<4>(type.getShape()); + std::copy(minor_limits.begin(), minor_limits.end(), + slice_limits.begin() + major_dims); + llvm::SmallVector slice_strides(type.getRank(), 1); + return builder->create(loc, v, + GetI64ElementsAttr(slice_starts, builder), + GetI64ElementsAttr(slice_limits, builder), + GetI64ElementsAttr(slice_strides, builder)); +} + +// Creates a vector of index values: +// [0, 0, ..., minor_indices[0], minor_indices[1], ... minor_indices[-1]] +// with length `rank`. +static llvm::SmallVector CreateFullIndexVectorFromMinorIndices( + Location loc, ArrayRef minor_indices, int64_t rank, + OpBuilder *builder) { + auto zero = + GetScalarConstOfType(getElementTypeOrSelf(minor_indices[0].getType()), + loc, 0, builder) + .output(); + llvm::SmallVector indices(rank, zero); + std::copy(minor_indices.begin(), minor_indices.end(), + indices.begin() + (rank - minor_indices.size())); + return indices; +} + +// Creates an xla_hlo::DynamicSliceOp where the major dimensions have full size, +// and the minor dimensions have the provided offsets and sizes. +static Value DynamicSliceInMinorDims(Location loc, Value v, + ArrayRef minor_starts, + ArrayRef minor_sizes, + OpBuilder *builder) { + if (minor_starts.empty()) return v; + auto type = v.getType().cast(); + auto slice_starts = CreateFullIndexVectorFromMinorIndices( + loc, minor_starts, type.getRank(), builder); + int64_t major_dims = type.getRank() - minor_starts.size(); + auto slice_sizes = llvm::to_vector<4>(type.getShape()); + std::copy(minor_sizes.begin(), minor_sizes.end(), + slice_sizes.begin() + major_dims); + auto slice_type = RankedTensorType::get(slice_sizes, type.getElementType()); + return builder->create( + loc, slice_type, v, slice_starts, + GetI64ElementsAttr(slice_sizes, builder)); +} + +// Creates an xla_hlo::DynamicUpdateSliceOp where the major dimensions have zero +// offsets, and the minor dimensions have the provided offsets. +static Value DynamicUpdateSliceInMinorDims(Location loc, Value v, Value update, + ArrayRef minor_starts, + OpBuilder *builder) { + if (minor_starts.empty()) return v; + auto type = v.getType().cast(); + auto dus_starts = CreateFullIndexVectorFromMinorIndices( + loc, minor_starts, type.getRank(), builder); + return builder->create(loc, type, v, update, + llvm::makeArrayRef(dus_starts)); +} + +// Creates an xla_hlo::DynamicUpdateSliceOp where the major dimensions have zero +// offsets, and the minor dimensions have the provided static offsets. +static Value UpdateSliceInMinorDims(Location loc, Value v, Value update, + ArrayRef minor_starts, + OpBuilder *builder) { + llvm::SmallVector dus_starts(minor_starts.size()); + for (int64_t i = 0; i < minor_starts.size(); ++i) { + dus_starts[i] = GetScalarConstOfType(builder->getIntegerType(32), loc, + minor_starts[i], builder); + } + return DynamicUpdateSliceInMinorDims(loc, v, update, dus_starts, builder); +} + +// Deprecated: This is maintained to aid in porting old code that is not yet +// dynamic shape aware and uses broadcasting modes that CHLO does not support. +// Gets the resulting type from a broadcast between two types for statically +// shaped types. 
This is to be used for legacy lowerings that both use non +// left-padded broadcasting and static shapes. Its use should not be permitted +// in new code. +// May return nullptr on invalid static broadcast dimensions. +// ABSL_DEPRECATED() +static RankedTensorType GetStaticBroadcastType( + RankedTensorType x, RankedTensorType y, + DenseIntElementsAttr broadcast_dimensions_attr) { + auto element_type = x.getElementType(); + auto shape_x = x.getShape(); + auto shape_y = y.getShape(); + + if (shape_x.size() == shape_y.size()) { + llvm::SmallVector out_shape(shape_x.size()); + for (int i = 0; i < shape_x.size(); i++) { + auto x_val = shape_x[i]; + auto y_val = shape_y[i]; + out_shape[i] = std::max(x_val, y_val); + } + return RankedTensorType::get(out_shape, element_type); + } + + auto shape_large = shape_x.size() > shape_y.size() ? shape_x : shape_y; + auto shape_small = shape_x.size() <= shape_y.size() ? shape_x : shape_y; + + llvm::SmallVector broadcast_dimensions; + // Explicit broadcast dimensions. + for (const APInt &int_value : broadcast_dimensions_attr) { + broadcast_dimensions.push_back(int_value.getSExtValue()); + } + if (broadcast_dimensions.size() != shape_small.size()) { + return nullptr; + } + llvm::SmallVector out_shape(shape_large.begin(), + shape_large.end()); + + // Update according to the broadcast dimensions. + for (auto index_pair : llvm::enumerate(broadcast_dimensions)) { + auto old_value = out_shape[index_pair.value()]; + auto new_value = shape_small[index_pair.index()]; + out_shape[index_pair.value()] = std::max(old_value, new_value); + } + return RankedTensorType::get(out_shape, element_type); +} + +// Deprecated: This is maintained to aid in porting old code that is not yet +// dynamic shape aware and uses broadcasting modes that CHLO does not support. +// Applies static binary broadcasting to a binary elementwise op. +// This is a legacy helper to provide general broadcasting support in legacy, +// static shaped code that relies on non-left-padded broadcasting semantics. +template +static Value StaticBinaryBroadcast(Location loc, Value x, Value y, + DenseIntElementsAttr broadcast_dims, + OpBuilder &builder) { + auto x_type = x.getType().cast(); + auto y_type = y.getType().cast(); + auto result_type = GetStaticBroadcastType(x_type, y_type, broadcast_dims); + if (!result_type) { + emitError(loc) << "could not binary broadcast " << x_type << ", " << y_type + << " with broadcast_dims = " << broadcast_dims; + return nullptr; + } + auto larger_broadcast_dims = + GetI64ElementsAttrForSeq(0, result_type.getRank(), &builder); + if (x_type.getRank() < y_type.getRank()) { + if (x_type != result_type) { + x = builder.create(loc, result_type, x, broadcast_dims); + } + if (y_type != result_type) { + y = builder.create(loc, result_type, y, + larger_broadcast_dims); + } + } else { + if (x_type != result_type) { + x = builder.create(loc, result_type, x, + larger_broadcast_dims); + } + if (y_type != result_type) { + y = builder.create(loc, result_type, y, broadcast_dims); + } + } + return builder.create(loc, x, y); +} + +// Gets a 1D tensor type suitable for expressing extents of the given tensor +// value type. If the value type is ranked, the result will be statically +// shaped. Otherwise, it will have a dynamic dimension. +static RankedTensorType GetExtentsTensorTypeFor(TensorType value_type) { + Builder b(value_type.getContext()); + int64_t dim = value_type.hasRank() ? 
value_type.getRank() : -1; + return RankedTensorType::get({dim}, b.getIndexType()); +} + +// Broadcasts a 'lower_rank_value' to the shape of a 'higher_rank_value' +// by assuming that the shape of the lower ranked is a broadcast compatible +// prefix of the higher ranked. +// Values must be RankedTensorType (this restriction derives from the +// broadcast_dimensions attribute on DynamicBroadcastInDim). +// +// Example: +// CommonPrefixBroadcast(tensor<4x3x256>, tensor<4, 3>) will broadcast the +// lower rank value to [4, 3, 256] (i.e. the opposite of numpy-style +// implicit broadcasting). +static Value CommonPrefixBroadcast(Location loc, Value higher_rank_value, + Value lower_rank_value, OpBuilder &builder) { + Value higher_rank_shape = + builder.create(loc, higher_rank_value); + auto result_extents_type = + GetExtentsTensorTypeFor(higher_rank_value.getType().cast()); + Value result_extents = builder.create( + loc, result_extents_type, higher_rank_shape); + + auto lower_rank_type = lower_rank_value.getType().cast(); + auto lower_rank = lower_rank_type.getRank(); + auto prefix_dims = GetI64ElementsAttrForSeq(0, lower_rank, &builder); + return builder.create( + loc, higher_rank_value.getType(), lower_rank_value, result_extents, + prefix_dims); +} + +// Given a value (broadcast_to) and a feature dimension, broadcasts a 1D +// value (broadcast_from) along that feature dimension. This is a shortcut +// for the cases where a 1D tensor must be broadcast along a specific feature +// dimension, which can vary based on data layout, etc. +// +// The extent of `broadcast_from` dim0 must be equal to the extent of the +// feature_dim of `broadcast_to`. +// +// Example: +// [1x2x3x4], [2], 1 -> [1x2x3x4] +// TODO(laurenzo): Swap the order of broadcast_to and broadcast_from for +// consistency. Possibly also rename for clarity. +static Value Broadcast1DToFeatureDim(Location loc, Value broadcast_to, + Value broadcast_from, int64_t feature_dim, + OpBuilder &builder) { + auto broadcast_dims = GetI64ElementsAttr({feature_dim}, &builder); + auto to_type = broadcast_to.getType().cast(); + auto result_shape = builder.create(loc, broadcast_to); + auto result_extents_type = GetExtentsTensorTypeFor(to_type); + auto result_extents = builder.create( + loc, result_extents_type, result_shape); + return builder.create( + loc, to_type, broadcast_from, result_extents, broadcast_dims); +} + +// Creates a batch dot using xla_hlo::DotGeneralOp. +Value BatchDot(Location loc, Value lhs, bool transpose_lhs, Value rhs, + bool transpose_rhs, int64_t num_batch_dims, + ArrayAttr precision_config, OpBuilder *builder) { + auto batch_dimensions = GetI64ElementsAttr( + llvm::to_vector<4>(llvm::seq(0, num_batch_dims)), builder); + auto lhs_contracting_dimensions = GetI64ElementsAttr( + llvm::makeArrayRef({transpose_lhs ? num_batch_dims : num_batch_dims + 1}), + builder); + auto rhs_contracting_dimensions = GetI64ElementsAttr( + llvm::makeArrayRef({transpose_rhs ? num_batch_dims + 1 : num_batch_dims}), + builder); + auto dimension_numbers = DotDimensionNumbers::get( + /*lhs_batching_dimensions=*/batch_dimensions, + /*rhs_batching_dimensions=*/batch_dimensions, + /*lhs_contracting_dimensions=*/lhs_contracting_dimensions, + /*rhs_contracting_dimensions=*/rhs_contracting_dimensions, + builder->getContext()); + auto lhs_shape = lhs.getType().cast().getShape(); + auto rhs_shape = rhs.getType().cast().getShape(); + auto shape = llvm::to_vector<4>(lhs_shape); + shape[shape.size() - 2] = + transpose_lhs ? 
lhs_shape.back() : lhs_shape[lhs_shape.size() - 2]; + shape[shape.size() - 1] = + transpose_rhs ? rhs_shape[rhs_shape.size() - 2] : rhs_shape.back(); + Type element_type = getElementTypeOrSelf(lhs.getType()); + return builder->create( + loc, RankedTensorType::get(shape, element_type), lhs, rhs, + dimension_numbers, precision_config); } // Builds body for reduce op by using the using the template binary op as the @@ -242,8 +561,7 @@ static void BuildReduceBody(Type element_type, Region *body, Location loc = body->getLoc(); auto reducer = - builder->create(loc, block->getArgument(0), block->getArgument(1), - /*broadcast_dimensions=*/nullptr); + builder->create(loc, block->getArgument(0), block->getArgument(1)); builder->create(loc, reducer.getResult()); } @@ -343,8 +661,7 @@ static void CreateWhile32(Location loc, int num_iterations, loc, builder->getI32IntegerAttr(num_iterations)); StringAttr compare_direction = StringAttr::get("LT", builder->getContext()); Value compare = builder->create( - loc, loop_iv, upper_limit, - /*broadcast_dimensions=*/nullptr, compare_direction); + loc, loop_iv, upper_limit, compare_direction); builder->create(loc, compare); } @@ -374,9 +691,9 @@ static void CreateWhile32(Location loc, int num_iterations, // Increment the loop induction variable by one. auto one = builder->create(loc, builder->getI32IntegerAttr(1)); - auto no_broadcast_dims = GetI64ElementsAttr({}, builder); - auto plus_one = builder->create(loc, old_values[0], one, - no_broadcast_dims); + auto scalar_broadcast_dims = GetI64ElementsAttr({}, builder); + auto plus_one = builder->create( + loc, old_values[0], one, scalar_broadcast_dims); // Prepend with the updated loop induction variable. new_values.insert(new_values.begin(), plus_one); @@ -401,21 +718,6 @@ static IntegerAttr getFeatureDimensionAttr(Builder &b, StringAttr format, GetFeatureDimension(format, input.getType().cast())); } -//===----------------------------------------------------------------------===// -// Bias op utilities. -//===----------------------------------------------------------------------===// - -// Return a 1D DenseIntElementsAttr for the feature dimension of a BiasAdd. -// Requires input to have ranked tensor. -static DenseIntElementsAttr getBiasFeatureDimension(Builder &b, - StringAttr format, - Value input) { - auto inputType = input.getType().cast(); - size_t featureDim = GetFeatureDimension(format, inputType); - RankedTensorType type = RankedTensorType::get(1, b.getIntegerType(64)); - return DenseIntElementsAttr::get(type, featureDim); -} - //===----------------------------------------------------------------------===// // MatMul op utilities. //===----------------------------------------------------------------------===// @@ -552,20 +854,6 @@ static Type ChangeTensorElementType(Builder *b, Type tensor_type, // Softmax op utilities. //===----------------------------------------------------------------------===// -// Returns a 1-d i64 elements attribute populated with numbers from start to -// end, excluding. -static DenseIntElementsAttr GetI64ElementsAttrForSeq(int start, int end, - Builder *builder) { - int size = end - start; - - SmallVector vals; - vals.resize(size); - std::iota(vals.begin(), vals.end(), start); - - TensorType ty = RankedTensorType::get({size}, builder->getIntegerType(64)); - return DenseIntElementsAttr::get(ty, vals); -} - // Returns the type to use for accumulating the given type. 
static Type GetAccumulationType(Type ty) { // Upcast 16 bit sum reductions to 32 bit to reduce the precision loss from @@ -592,8 +880,7 @@ static void BuildArgMinMaxReductionBody(Type input_element_type, StringAttr compare_direction = StringAttr::get(direction, builder->getContext()); Value compare = builder->create( - loc, block->getArgument(0), block->getArgument(2), - /*broadcast_dimensions=*/nullptr, compare_direction); + loc, block->getArgument(0), block->getArgument(2), compare_direction); Value selected_input = builder->create( loc, input_type, compare, block->getArgument(0), block->getArgument(2)); @@ -709,8 +996,7 @@ static void BuildSortComparisonBody(llvm::ArrayRef element_types, StringAttr compare_direction = StringAttr::get(direction, builder->getContext()); Value compare = builder->create( - loc, block->getArgument(0), block->getArgument(1), - /*broadcast_dimensions=*/nullptr, compare_direction); + loc, block->getArgument(0), block->getArgument(1), compare_direction); builder->create(loc, compare); } @@ -749,6 +1035,27 @@ NamedAttribute GetConvDimensionNumbersAttr( feature_dim, spatial_dims, builder->getContext())); } +// Converts a TF::BiasAddOp to HLO. +// This differs from a normal TF::AddOp with respect to how the data_format +// is handled, which can optionally require a general broadcast of the +// 'bias' term in a way that is not compatible with the standard left-padded +// broadcast semantics (i.e. NCHW will broadcast into dimension 1). +// The correct 'bias' broadcast will be synthesized manually. +class ConvertBiasAddOp : public OpRewritePattern { + public: + using OpRewritePattern::OpRewritePattern; + LogicalResult matchAndRewrite(TF::BiasAddOp op, + PatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + auto feature_dim = GetFeatureDimension( + op.data_formatAttr(), op.value().getType().cast()); + auto bias_broadcast = Broadcast1DToFeatureDim(loc, op.value(), op.bias(), + feature_dim, rewriter); + rewriter.replaceOpWithNewOp(op, op.value(), bias_broadcast); + return success(); + } +}; + // Converts the TensorFlow conv op in template to the generic HLO conv op by // converting TensorFlow op attributes to HLO op attributes. // @@ -764,16 +1071,20 @@ NamedAttribute GetConvDimensionNumbersAttr( // the paddings attribute anyway requires multiple source op attributes and // result op attributes. Defining it as declarative rewrite rule will introduce // some duplication in the C++ helper methods. -template -class ConvertConv : public OpRewritePattern { +template +class ConvertConvOp : public OpRewritePattern { public: - using OpRewritePattern::OpRewritePattern; + using OpRewritePattern::OpRewritePattern; - LogicalResult matchAndRewrite(OpT op, + LogicalResult matchAndRewrite(OpTy op, PatternRewriter &rewriter) const override { - tensorflow::TensorFormat format; - std::string data_format = op.data_format().str(); - if (!FormatFromString(data_format, &format)) return failure(); + tensorflow::TensorFormat data_format; + if (!FormatFromString(op.data_format().str(), &data_format)) + return failure(); + + tensorflow::Padding padding; + if (!GetPaddingFromString(op.padding().str(), &padding).ok()) + return failure(); auto input_ty = op.input().getType().template dyn_cast(); auto filter_ty = @@ -782,23 +1093,8 @@ class ConvertConv : public OpRewritePattern { // Input, filter and the result needs to have static shape for calculation // of HLO paddings and feature group count attributes. 
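The convolution pattern below first checks for static shapes and then walks the spatial dimensions in data_format order to collect strides, dilations, and paddings. A standalone sketch of the index arithmetic that walk relies on; the helpers here are illustrative stand-ins, not the tensorflow::GetTensor*DimIndex utilities the pass actually calls:

#include <cassert>
#include <cstdio>

// For an N-D tensor (num_dims = num_spatial_dims + 2): NHWC keeps the feature
// dimension last and spatial dimensions at [1, num_dims - 2]; NCHW keeps the
// feature dimension at index 1 and spatial dimensions at [2, num_dims - 1].
enum class Format { NHWC, NCHW };

int FeatureDimIndex(int num_dims, Format f) {
  return f == Format::NHWC ? num_dims - 1 : 1;
}

int SpatialDimIndex(int num_dims, Format f, int spatial_index) {
  return f == Format::NHWC ? 1 + spatial_index : 2 + spatial_index;
}

int main() {
  constexpr int num_dims = 4;  // 2-D convolution: batch + 2 spatial + feature.
  for (int i = 0; i < 2; ++i) {
    std::printf("NHWC spatial %d -> dim %d, NCHW spatial %d -> dim %d\n", i,
                SpatialDimIndex(num_dims, Format::NHWC, i), i,
                SpatialDimIndex(num_dims, Format::NCHW, i));
  }
  assert(FeatureDimIndex(num_dims, Format::NHWC) == 3);
  assert(FeatureDimIndex(num_dims, Format::NCHW) == 1);
  return 0;
}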
- for (RankedTensorType ty : {input_ty, filter_ty, result_ty}) { + for (RankedTensorType ty : {input_ty, filter_ty, result_ty}) if (!ty || !ty.hasStaticShape()) return failure(); - } - - int num_dims = num_spatial_dims + 2; - tensorflow::Padding padding; - if (!GetPaddingFromString(op.padding().str(), &padding).ok()) - return failure(); - - auto get_int = [](Attribute attr) { - return attr.template cast().getInt(); - }; - - SmallVector spatial_dim_indices; - SmallVector rhs_dilations; - SmallVector window_strides; - SmallVector paddings; ArrayRef dilations = op.dilations().getValue(); ArrayRef strides = op.strides().getValue(); @@ -811,14 +1107,24 @@ class ConvertConv : public OpRewritePattern { op.template getAttrOfType("explicit_paddings").getValue(); } - for (int i = 0; i < num_spatial_dims; ++i) { - int64_t dim = GetTensorSpatialDimIndex(num_dims, format, i); + SmallVector spatial_dim_indices; + SmallVector rhs_dilations; + SmallVector window_strides; + SmallVector paddings; + + auto get_int = [](Attribute attr) { + return attr.template cast().getInt(); + }; + + constexpr int num_dims = num_spatial_dims + 2; + for (auto i : llvm::seq(0, num_spatial_dims)) { + const int64_t dim = GetTensorSpatialDimIndex(num_dims, data_format, i); spatial_dim_indices.push_back(dim); - int64_t stride = get_int(strides[dim]); - int64_t dilation = get_int(dilations[dim]); - window_strides.push_back(stride); + const int64_t dilation = get_int(dilations[dim]); rhs_dilations.push_back(dilation); + const int64_t stride = get_int(strides[dim]); + window_strides.push_back(stride); int64_t pad_low, pad_high; if (padding == tensorflow::Padding::EXPLICIT) { @@ -845,19 +1151,19 @@ class ConvertConv : public OpRewritePattern { auto window_strides_attr = rewriter.getNamedAttr( "window_strides", GetI64ElementsAttr(window_strides, &rewriter)); - auto dimension_numbers_attr = - GetConvDimensionNumbersAttr(spatial_dim_indices, format, &rewriter); + auto dimension_numbers_attr = GetConvDimensionNumbersAttr( + spatial_dim_indices, data_format, &rewriter); - int64_t input_channels = - GetDimSize(input_ty, GetTensorFeatureDimIndex(num_dims, format)); + const int64_t input_channels = + GetDimSize(input_ty, GetTensorFeatureDimIndex(num_dims, data_format)); // Filters data_format is always HWIO so input channels dimension is after // all spatial dimensions. - int64_t filter_channels = GetDimSize(filter_ty, num_spatial_dims); + const int64_t filter_channels = GetDimSize(filter_ty, num_spatial_dims); // TensorFlow convolution op verifies that the number of input channels is // divisible by the number of filter channels. // For depthwise convolution the feature_group_count argument would be set // to the input feature dimension. - int64_t feature_group_count = + const int64_t feature_group_count = depthwise_conv ? 
input_channels : input_channels / filter_channels; auto feature_group_count_attr = rewriter.getNamedAttr( "feature_group_count", rewriter.getI64IntegerAttr(feature_group_count)); @@ -874,14 +1180,12 @@ class ConvertConv : public OpRewritePattern { // Reshape the filter to {spatial_dims...., 1,in_channels * // channel_multiplier} if (depthwise_conv) { - auto filter_shape = filter_ty.getShape(); - llvm::SmallVector new_shape(filter_shape.size()); - for (int i = 0; i < num_spatial_dims; ++i) { - new_shape[i] = filter_shape[i]; - } - new_shape[num_spatial_dims] = 1; - new_shape[num_spatial_dims + 1] = - filter_shape[num_spatial_dims] * filter_shape[num_spatial_dims + 1]; + ArrayRef filter_shape = filter_ty.getShape(); + llvm::SmallVector new_shape( + filter_shape.begin(), filter_shape.begin() + num_spatial_dims); + new_shape.push_back(1); + new_shape.push_back(filter_shape[num_spatial_dims] * + filter_shape[num_spatial_dims + 1]); operands[1] = rewriter.create( op.getLoc(), RankedTensorType::get(new_shape, filter_ty.getElementType()), @@ -896,10 +1200,12 @@ class ConvertConv : public OpRewritePattern { } }; -using ConvertConv2D = ConvertConv; -using ConvertDepthConv2D = - ConvertConv; +using ConvertConv2DOp = ConvertConvOp; +using ConvertConv3DOp = ConvertConvOp; +using ConvertDepthConv2DOp = + ConvertConvOp; + // Converts BF16 FloorDiv op to have casting operators on either end as BF16 // division can result in strange behavior. // @@ -1011,7 +1317,6 @@ class ConvertDiagPartOp : public OpRewritePattern { rewriter.getI64IntegerAttr(1)); Value compare = rewriter.create( op.getLoc(), iota0, iota1, - /*broadcast_dimensions=*/nullptr, StringAttr::get("EQ", rewriter.getContext())); Value zero = GetScalarConstOfType(input_type.getElementType(), op.getLoc(), 0, &rewriter); @@ -1124,33 +1429,35 @@ class ConvertFusedBatchNormGradBase non_feature_dims.push_back(i); } auto reduce_dims = GetI64ElementsAttr(non_feature_dims, &rewriter); - auto broadcast_dims = GetI64ElementsAttr({feature_dim}, &rewriter); - auto no_broadcast_dims = GetI64ElementsAttr({}, &rewriter); + auto scalar_broadcast_dims = GetI64ElementsAttr({}, &rewriter); // scratch1 = rsqrt(var + epsilon) RankedTensorType scalar_float = RankedTensorType::get({}, kernel_type); auto epsilon = rewriter.create( loc, DenseFPElementsAttr::get(scalar_float, {op.epsilon()})); - auto add_op = rewriter.create(loc, var, epsilon.getResult(), - no_broadcast_dims); + auto add_op = rewriter.create( + loc, var, epsilon.getResult(), scalar_broadcast_dims); + Value scratch1 = rewriter.create(loc, add_op); // scratch2 = sum(y_backprop * (x - mean)) - auto sub_op = rewriter.create(loc, act, mean, broadcast_dims); - auto weighted_grad = - rewriter.create(loc, grad, sub_op, no_broadcast_dims); + auto sub_op = rewriter.create( + loc, act, + Broadcast1DToFeatureDim(loc, act, mean, feature_dim, rewriter)); + auto weighted_grad = rewriter.create(loc, grad, sub_op); Value scratch2 = ApplyReduction(loc, weighted_grad, reduce_dims, &rewriter); // x_backprop = y_backprop * (scale * scratch1) auto scaled_grad = - rewriter.create(loc, op.scale(), scratch1, no_broadcast_dims); - x_backprop = - rewriter.create(loc, grad, scaled_grad, broadcast_dims); + rewriter.create(loc, op.scale(), scratch1); + x_backprop = rewriter.create( + loc, grad, + Broadcast1DToFeatureDim(loc, act, scaled_grad, feature_dim, + rewriter)); // scale_backprop = scratch2 * scratch1 - scale_backprop = - rewriter.create(loc, scratch1, scratch2, no_broadcast_dims); + scale_backprop = rewriter.create(loc, 
scratch1, scratch2); // offset_backprop = sum(y_backprop) offset_backprop = ApplyReduction(loc, grad, reduce_dims, &rewriter); @@ -1186,16 +1493,19 @@ class ConvertFusedBatchNormV3Op auto feature_dim = getFeatureDimensionAttr(rewriter, op.data_formatAttr(), op.x()); - auto input_type_tensor = op.x().getType().dyn_cast(); + auto input_type_tensor = op.x().getType().cast(); auto input_element_type = input_type_tensor.getElementType(); - auto scale_type_tensor = op.scale().getType().dyn_cast(); + auto scale_type_tensor = op.scale().getType().cast(); auto scale_element_type = scale_type_tensor.getElementType(); + + auto mean_type_tensor = op.mean().getType().cast(); + auto mean_element_type = mean_type_tensor.getElementType(); // In the training case, dimensions of input tensors must be static. - if (op.is_training() && ((!input_type_tensor.hasStaticShape()) || - (!scale_type_tensor.hasStaticShape()))) { + if (op.is_training() && (!input_type_tensor.hasStaticShape() || + !scale_type_tensor.hasStaticShape() || + !mean_type_tensor.hasStaticShape())) return failure(); - } // TODO(b/69928690): Support mixed precision in the XLA batch // normalization operators. As a workaround, create a new x with the same @@ -1229,6 +1539,7 @@ class ConvertFusedBatchNormV3Op op.getLoc(), bn_train_op_result, 0); Value batch_mean = rewriter.create( op.getLoc(), bn_train_op_result, 1); + Value reserve_space_1 = batch_mean; Value batch_variance = rewriter.create( op.getLoc(), bn_train_op_result, 2); @@ -1242,15 +1553,50 @@ class ConvertFusedBatchNormV3Op auto factor_const_op = rewriter.create( op.getLoc(), rewriter.getFloatAttr(scale_element_type, factor)); - auto corrected_variance = rewriter.create( + Value corrected_variance = rewriter.create( op.getLoc(), batch_variance.getType(), batch_variance, - factor_const_op, /*DenseIntElementsAttr=*/DenseIntElementsAttr()); + factor_const_op, /*broadcast_dimensions=*/DenseIntElementsAttr()); // Convert back to input type to stay aligned with expected output type // for TF op. y_out = rewriter.create(op.getLoc(), y_out, input_element_type); + float exponential_avg_factor = + op.exponential_avg_factor().convertToFloat(); + if (exponential_avg_factor != 1.0f) { + auto alpha = rewriter.create( + op.getLoc(), rewriter.getFloatAttr(mean_element_type, + 1.0f - exponential_avg_factor)); + auto beta = rewriter.create( + op.getLoc(), + rewriter.getFloatAttr(mean_element_type, exponential_avg_factor)); + + // new_running_mean = alpha * old_mean + beta * batch_mean. + auto alpha_mul_old_mean = rewriter.create( + op.getLoc(), op.mean().getType(), alpha, op.mean(), + /*broadcast_dimensions=*/DenseIntElementsAttr()); + auto beta_mul_batch_mean = rewriter.create( + op.getLoc(), batch_mean.getType(), beta, batch_mean, + /*broadcast_dimensions=*/DenseIntElementsAttr()); + batch_mean = rewriter.create( + op.getLoc(), alpha_mul_old_mean, beta_mul_batch_mean, + /*broadcast_dimensions=*/DenseIntElementsAttr()); + + // new_running_variance = alpha * old_variance + beta * batch_variance. 
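The new code just above updates the running mean, and the lines that follow do the same for the variance, per the two formula comments. A plain-arithmetic sketch of that update, detached from the HLO ops used to express it (names here are illustrative):

#include <cstdio>

// Running-statistics update used when exponential_avg_factor != 1:
//   new_running_stat = (1 - factor) * old_stat + factor * batch_stat
struct RunningStats {
  float mean;
  float variance;
};

RunningStats UpdateRunningStats(RunningStats old_stats, RunningStats batch,
                                float exponential_avg_factor) {
  const float alpha = 1.0f - exponential_avg_factor;
  const float beta = exponential_avg_factor;
  return {alpha * old_stats.mean + beta * batch.mean,
          alpha * old_stats.variance + beta * batch.variance};
}

int main() {
  RunningStats old_stats{0.0f, 1.0f};
  RunningStats batch{4.0f, 2.0f};
  // With factor = 0.1 the running statistics move 10% of the way to the batch.
  RunningStats updated = UpdateRunningStats(old_stats, batch, 0.1f);
  std::printf("mean=%.2f variance=%.2f\n", updated.mean, updated.variance);
  return 0;
}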
+ auto alpha_mul_old_variance = rewriter.create( + op.getLoc(), op.variance().getType(), alpha, op.variance(), + /*broadcast_dimensions=*/DenseIntElementsAttr()); + auto beta_mul_batch_variance = + rewriter.create( + op.getLoc(), corrected_variance.getType(), beta, + corrected_variance, + /*broadcast_dimensions=*/DenseIntElementsAttr()); + corrected_variance = rewriter.create( + op.getLoc(), alpha_mul_old_variance, beta_mul_batch_variance, + /*broadcast_dimensions=*/DenseIntElementsAttr()); + } + // TF FusedBatchNormV3 op expects 5 outputs. Outputs 3 and 4 are // currently marked as "reserved spaces 1 and 2". They are used to // pass the per-batch mean and variance to the gradiant. Here we @@ -1259,8 +1605,8 @@ class ConvertFusedBatchNormV3Op // matter what we pass there. rewriter.replaceOp(op, {y_out, /*batch_mean=*/batch_mean, /*batch_variance=*/corrected_variance, - /*reserve_space_1=*/batch_mean, - /*reserve_space_2=*/corrected_variance, + /*reserve_space_1=*/reserve_space_1, + /*reserve_space_2=*/batch_variance, /*reserve_space_3=*/op.x()}); } else { // Inference case. auto bn_train_op = rewriter.create( @@ -1276,11 +1622,28 @@ class ConvertFusedBatchNormV3Op // The mean, variance, and reserved space outputs of the batch norm op are // not used for inference. It doesn't matter what values we provide for - // the last 5 results. - rewriter.replaceOp( - op, {/*y=*/y_out, /*batch_mean=*/op.x(), - /*batch_variance=*/op.x(), /*reserve_space_1=*/op.x(), - /*reserve_space_2=*/op.x(), /*reserve_space_3=*/op.x()}); + // the last 5 results as long as they are of the same type. Forward + // input mean and variance to output mean, variance, reserved_space_1 and + // reserver_space_2. Create a constant tensor to forward to last + // reserve_space_3 output. + auto reserve_space_3_type = op.getResult(5).getType().cast(); + int num_elements = reserve_space_3_type.hasStaticShape() + ? reserve_space_3_type.getNumElements() + : 0; + auto const_attr_type = RankedTensorType::get( + {num_elements}, getElementTypeOrSelf(reserve_space_3_type)); + + Value dummy_const = rewriter.create( + op.getLoc(), DenseElementsAttr::get(const_attr_type, 0.0)); + if (const_attr_type != reserve_space_3_type) + dummy_const = rewriter.create( + op.getLoc(), reserve_space_3_type, dummy_const); + rewriter.replaceOp(op, {/*y=*/y_out, + /*batch_mean=*/op.mean(), + /*batch_variance=*/op.variance(), + /*reserve_space_1=*/op.mean(), + /*reserve_space_2=*/op.variance(), + /*reserve_space_3=*/dummy_const}); } return success(); } @@ -1290,13 +1653,15 @@ class ConvertFusedBatchNormV3Op // // Requires padding to be either 'SAME' or 'VALID' and the number of input // dimensions to be equal to the size of window dimensions and window strides. 
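GetReduceWindowPadding, declared next, delegates the 'SAME' case to ::xla::MakePadding and then flattens the per-dimension (low, high) pairs into a rank x 2 attribute. A self-contained sketch of the standard SAME-padding arithmetic it relies on; this is the textbook formula, not the XLA helper itself:

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>

// SAME padding: the output size is ceil(input / stride), and the total padding
// needed to realize it is split between the low and high sides (extra goes to
// the high side).
std::vector<std::pair<int64_t, int64_t>> MakeSamePadding(
    const std::vector<int64_t>& input, const std::vector<int64_t>& window,
    const std::vector<int64_t>& strides) {
  std::vector<std::pair<int64_t, int64_t>> padding;
  for (size_t i = 0; i < input.size(); ++i) {
    int64_t output = (input[i] + strides[i] - 1) / strides[i];
    int64_t total =
        std::max<int64_t>((output - 1) * strides[i] + window[i] - input[i], 0);
    padding.push_back({total / 2, total - total / 2});
  }
  return padding;
}

int main() {
  // 2-D pooling with a 3x3 window and stride 2 over a 5x6 spatial shape.
  auto pads = MakeSamePadding({5, 6}, {3, 3}, {2, 2});
  for (auto [lo, hi] : pads)
    std::printf("low=%lld high=%lld\n", static_cast<long long>(lo),
                static_cast<long long>(hi));
  return 0;
}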
+template static DenseIntElementsAttr GetReduceWindowPadding( llvm::ArrayRef input_dims, ArrayAttr window_dims, ArrayAttr window_strides, StringRef padding, Builder *builder) { if (padding == "VALID") return {}; DCHECK_EQ(padding.str(), "SAME"); - llvm::SmallVector input_shape, window_shape, strides; + llvm::SmallVector input_shape, window_shape, + strides; input_shape.reserve(input_dims.size()); window_shape.reserve(window_shape.size()); strides.reserve(window_strides.size()); @@ -1311,7 +1676,7 @@ static DenseIntElementsAttr GetReduceWindowPadding( ::xla::MakePadding(input_shape, window_shape, strides, ::xla::Padding::kSame); int64_t rank = paddings.size(); - llvm::SmallVector flatten_paddings(rank * 2); + llvm::SmallVector flatten_paddings(rank * 2); for (int i = 0; i < rank; i++) { flatten_paddings[2 * i] = paddings[i].first; flatten_paddings[2 * i + 1] = paddings[i].second; @@ -1321,7 +1686,7 @@ static DenseIntElementsAttr GetReduceWindowPadding( flatten_paddings); } -// Converts MaxPool op to HLO ReduceWindow op by setting appropriate window +// Converts AvgPool op to HLO ReduceWindow op by setting appropriate window // dimensions with add as the reduction function. The reduction result is // then divided by the number of elements in the window. class ConvertAvgPoolOp : public OpRewritePattern { @@ -1361,8 +1726,8 @@ class ConvertAvgPoolOp : public OpRewritePattern { Value init = GetScalarConstOfType(sum_element_type, op.getLoc(), 0, &rewriter); DenseIntElementsAttr paddings_attr = - GetReduceWindowPadding(input_type.getShape(), op.ksize(), op.strides(), - op.padding(), &rewriter); + GetReduceWindowPadding<4>(input_type.getShape(), op.ksize(), + op.strides(), op.padding(), &rewriter); auto reduce = rewriter.create( op.getLoc(), result_type, input_value, init, GetI64ElementsAttr(op.ksize()), GetI64ElementsAttr(op.strides()), @@ -1380,10 +1745,9 @@ class ConvertAvgPoolOp : public OpRewritePattern { // Divide by the number of elements in the window. Value divisor = GetScalarConstOfType(sum_element_type, op.getLoc(), count, &rewriter); - auto batch_dims = - GetI64ElementsAttrForSeq(0, input_type.getRank(), &rewriter); - Value result = rewriter.create(op.getLoc(), result_type, reduce, - divisor, batch_dims); + auto scalar_broadcast_dims = GetI64ElementsAttr({}, &rewriter); + Value result = rewriter.create( + op.getLoc(), result_type, reduce, divisor, scalar_broadcast_dims); // Convert back if we enlarged the element type's bitwidth. if (input_element_type != sum_element_type) @@ -1404,21 +1768,22 @@ class ConvertAvgPoolOp : public OpRewritePattern { // %max_pool = "xla_hlo.reduce"(%inp, %init) ["xla_hlo.maximum"] // {window_dimensions = ..., window_strides = ... 
} // -class ConvertMaxPoolOp : public OpRewritePattern { +template +class ConvertMaxPoolOp : public OpRewritePattern { public: - using OpRewritePattern::OpRewritePattern; + using OpRewritePattern::OpRewritePattern; - LogicalResult matchAndRewrite(TF::MaxPoolOp op, + LogicalResult matchAndRewrite(OpTy op, PatternRewriter &rewriter) const override { Type element_type = - op.input().getType().cast().getElementType(); + op.input().getType().template cast().getElementType(); if (!element_type.isSignlessIntOrFloat()) return failure(); Location loc = op.getLoc(); ConstOp init = GetMinValueForType(element_type, loc, &rewriter); - auto input_ty = op.input().getType().dyn_cast(); + auto input_ty = op.input().getType().template dyn_cast(); if (!input_ty) return failure(); - DenseIntElementsAttr paddings_attr = GetReduceWindowPadding( + DenseIntElementsAttr paddings_attr = GetReduceWindowPadding( input_ty.getShape(), op.ksize(), op.strides(), op.padding(), &rewriter); auto reduce = rewriter.create( loc, op.getType(), op.input(), init.getResult(), @@ -1432,6 +1797,9 @@ class ConvertMaxPoolOp : public OpRewritePattern { } }; +using ConvertMaxPool2DOp = ConvertMaxPoolOp; +using ConvertMaxPool3DOp = ConvertMaxPoolOp; + // Converts SelectV2 to HLO Select op and necessary BroadcastInDim ops on // operands. // @@ -1542,24 +1910,21 @@ class ConvertSigmoidOp : public OpRewritePattern { op.getLoc(), rewriter.getFloatAttr(getElementTypeOrSelf(operand.getType()), 0.5)); - auto shaped_type = operand.getType().cast(); + auto type = operand.getType().dyn_cast(); + if (!type) + return rewriter.notifyMatchFailure(op, "requires ranked tensor type"); auto constant_ones = rewriter.create( - op.getLoc(), shaped_type, scalar_one, - DenseIntElementsAttr::get( - RankedTensorType::get({shaped_type.getRank()}, - rewriter.getIntegerType(64)), - shaped_type.getShape())); + op.getLoc(), type, scalar_one, + GetI64ElementsAttr(type.getShape(), &rewriter)); - auto scaled_input = rewriter.create( - op.getLoc(), operand, constant_ones, DenseIntElementsAttr()); + auto scaled_input = + rewriter.create(op.getLoc(), operand, constant_ones); auto tanh_op = rewriter.create(op.getLoc(), operand.getType(), scaled_input); auto mul_op = - rewriter.create(op.getLoc(), tanh_op, constant_ones, - /*DenseIntElementsAttr=*/DenseIntElementsAttr()); + rewriter.create(op.getLoc(), tanh_op, constant_ones); auto add_op = - rewriter.create(op.getLoc(), mul_op, constant_ones, - /*DenseIntElementsAttr=*/DenseIntElementsAttr()); + rewriter.create(op.getLoc(), mul_op, constant_ones); rewriter.replaceOp(op, add_op.getResult()); return success(); @@ -1598,20 +1963,18 @@ class ConvertSoftmaxOp : public OpRewritePattern { LogicalResult matchAndRewrite(OpTy op, PatternRewriter &rewriter) const override { - Value logits = op.logits(); - // Softmax converter requires ranked type because the XLA reduce ops used // while lowering requires dimensions attribute to reduce along. + // Note that the input and output shape is equivalent, so we use 'logits' + // and its type for shape calculations. + Value logits = op.logits(); RankedTensorType type = logits.getType().dyn_cast(); if (!type) return failure(); - auto loc = op.getLoc(); int rank = type.getRank(); // Note that the TensorFlow Softmax op verifies that the input rank is - // greater than or equal to one so both of the following sequences are - // valid. - auto batch_dims = GetI64ElementsAttrForSeq(0, rank - 1, &rewriter); + // greater than or equal to one so the following sequence is valid. 
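The sequence that follows is the usual numerically stable softmax: subtract the row-wise max, exponentiate, and normalize by the row sum (or subtract the log-sum for the log variant). A scalar sketch of that arithmetic, independent of the HLO ops emitted here:

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// Numerically stable softmax along a single dimension:
//   softmax(x)_i = exp(x_i - max(x)) / sum_j exp(x_j - max(x))
std::vector<float> Softmax(const std::vector<float>& logits) {
  float max_logit = *std::max_element(logits.begin(), logits.end());
  std::vector<float> result(logits.size());
  float sum = 0.0f;
  for (size_t i = 0; i < logits.size(); ++i) {
    result[i] = std::exp(logits[i] - max_logit);
    sum += result[i];
  }
  for (float& value : result) value /= sum;
  return result;
}

int main() {
  for (float p : Softmax({1.0f, 2.0f, 3.0f})) std::printf("%.4f ", p);
  std::printf("\n");  // ~0.0900 0.2447 0.6652
  return 0;
}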
auto reduce_dim = rewriter.create( loc, GetI64ElementsAttr({rank - 1}, &rewriter)); @@ -1624,8 +1987,10 @@ class ConvertSoftmaxOp : public OpRewritePattern { auto max_logits = rewriter.create(loc, logits, reduce_dim, /*keep_dims=*/rewriter.getBoolAttr(false)); - auto shifted_logits = - rewriter.create(loc, type, logits, max_logits, batch_dims); + auto max_logits_broadcast = + CommonPrefixBroadcast(loc, logits, max_logits, rewriter); + auto shifted_logits = rewriter.create(loc, type, logits, + max_logits_broadcast); // Exponentiate the inputs. Value exp = rewriter.create(loc, type, shifted_logits); @@ -1638,9 +2003,12 @@ class ConvertSoftmaxOp : public OpRewritePattern { if (use_log) { Value log = rewriter.create(loc, sum); - rewriter.replaceOpWithNewOp(op, shifted_logits, log, batch_dims); + auto log_broadcast = CommonPrefixBroadcast(loc, logits, log, rewriter); + rewriter.replaceOpWithNewOp(op, shifted_logits, + log_broadcast); } else { - rewriter.replaceOpWithNewOp(op, exp, sum, batch_dims); + auto sum_broadcast = CommonPrefixBroadcast(loc, logits, sum, rewriter); + rewriter.replaceOpWithNewOp(op, exp, sum_broadcast); } return success(); } @@ -1687,7 +2055,7 @@ class ConvertSizeOp : public OpRewritePattern { auto dim = rewriter.create( op.getLoc(), result_type, input, rewriter.getIntegerAttr(rewriter.getIntegerType(32), i)); - size = rewriter.create( + size = rewriter.create( op.getLoc(), size->getResult(0), dim.getResult(), /*DenseIntElementsAttr=*/DenseIntElementsAttr()); } @@ -1700,29 +2068,63 @@ class ConvertSizeOp : public OpRewritePattern { static void BroadcastBatchMatMulV2Operands(Value lhs, Value rhs, Location loc, Value *out_lhs, Value *out_rhs, PatternRewriter *rewriter) { + // The dimension structure of the relevant operands to a tf.BatchMatMulV2 is: + // - lhs: [LHSBATCHDIMS..., LHSROWS, LHSCOLS] + // - rhs: [RHSBATCHDIMS..., RHSROWS, RHSCOLS] + // - result: [broadcast(LHSBATCHDIMS, RHSBATCHDIMS)..., LHSROWS, RHSCOLS] + // To perform the matmul, we need to first broadcast lhs and rhs to a common + // set of leading dimensions before doing the actual matmul. + // That's what the code below does. + // In particular, we populate out_lhs and out_rhs to have dimension structure: + // - out_lhs: [broadcast(LHSBATCHDIMS, RHSBATCHDIMS)..., LHSROWS, LHSCOLS] + // - out_rhs: [broadcast(LHSBATCHDIMS, RHSBATCHDIMS)..., RHSROWS, RHSCOLS] + // To do this, we need to calculate those output shapes, which involves + // slicing off the leading batch dims of each operand, broadcasting them, + // then concatenating the broadcasted leading dims back to the row/col dims. + // Finally, we create a TF::BroadcastTo op that does the actual broadcast. + + // TODO(silvasean): Reduce duplication across reified shape calculations and + // the static computation of output types needed to create ops. + Value lhs_shape = rewriter->create(loc, lhs); + Value rhs_shape = rewriter->create(loc, rhs); + Value const_neg2 = + rewriter->create(loc, rewriter->getI32IntegerAttr(-2)); + auto lhs_splitted = + rewriter->create(loc, lhs_shape, const_neg2); + auto rhs_splitted = + rewriter->create(loc, rhs_shape, const_neg2); auto lhs_type = lhs.getType().cast(); auto rhs_type = rhs.getType().cast(); - // The last two dimensions are the matrix row/col dimensions. Don't - // broadcast them. - SmallVector result_batch_shape; + // The last two dimensions are the matrix row/col dimensions. Don't broadcast + // them. 
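The comment above fixes the shape contract for BroadcastBatchMatMulV2Operands. A standalone sketch of the leading-batch-dimension broadcast it describes, done as plain shape arithmetic; the pass itself emits shape computations and a TF::BroadcastTo so the same thing happens at run time, and compatibility checks are omitted here:

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

// Numpy-style broadcast of two batch-dimension lists (right-aligned). Each
// pair of dimensions is assumed compatible (equal, or one of them is 1), so
// taking the max picks the broadcasted extent.
std::vector<int64_t> BroadcastBatchDims(const std::vector<int64_t>& lhs_batch,
                                        const std::vector<int64_t>& rhs_batch) {
  std::vector<int64_t> result(std::max(lhs_batch.size(), rhs_batch.size()), 1);
  auto fold = [&](const std::vector<int64_t>& dims) {
    size_t offset = result.size() - dims.size();
    for (size_t i = 0; i < dims.size(); ++i)
      result[offset + i] = std::max(result[offset + i], dims[i]);
  };
  fold(lhs_batch);
  fold(rhs_batch);
  return result;
}

int main() {
  // lhs: [3, 1, 4, 5] (batch dims [3, 1], matrix 4x5)
  // rhs: [1, 6, 5, 2] (batch dims [1, 6], matrix 5x2)
  // Broadcast batch dims -> [3, 6]; the matmul result is then [3, 6, 4, 2].
  for (int64_t d : BroadcastBatchDims({3, 1}, {1, 6}))
    std::printf("%lld ", static_cast<long long>(d));
  std::printf("\n");  // 3 6
  return 0;
}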
+ SmallVector result_batch_shape_compile_time_extents; OpTrait::util::getBroadcastedShape(lhs_type.getShape().drop_back(2), rhs_type.getShape().drop_back(2), - result_batch_shape); - auto handle_one_side = [rewriter, &result_batch_shape, loc]( - Value side, RankedTensorType type, - Value *out_side) { + result_batch_shape_compile_time_extents); + auto result_batch_shape = rewriter->create( + loc, lhs_splitted.head(), rhs_splitted.head(), + /*error=*/nullptr); + // Lambda which handles the broadcasting of one side to the common + // leading-batch dimensions. + auto broadcast_one_side = [&](Value side, RankedTensorType type, + Value tail_shape, Value *out_side) { ArrayRef matrix_dims = type.getShape().take_back(2); - auto result_shape = result_batch_shape; + auto result_shape = result_batch_shape_compile_time_extents; result_shape.append(matrix_dims.begin(), matrix_dims.end()); auto result_type = RankedTensorType::get(result_shape, type.getElementType()); - auto shape = rewriter->create( - loc, GetI64ElementsAttr(result_shape, rewriter)); - *out_side = - rewriter->create(loc, result_type, side, shape); + auto shape = + rewriter->create(loc, result_batch_shape, tail_shape); + auto shape_tensor = rewriter->create( + loc, + RankedTensorType::get({static_cast(result_shape.size())}, + rewriter->getIndexType()), + shape); + *out_side = rewriter->create(loc, result_type, side, + shape_tensor); }; - handle_one_side(lhs, lhs_type, out_lhs); - handle_one_side(rhs, rhs_type, out_rhs); + broadcast_one_side(lhs, lhs_type, lhs_splitted.tail(), out_lhs); + broadcast_one_side(rhs, rhs_type, rhs_splitted.tail(), out_rhs); } class ConvertBatchMatMulV2Op : public OpRewritePattern { @@ -1742,10 +2144,6 @@ class ConvertBatchMatMulV2Op : public OpRewritePattern { if (rhs_type.getElementType().isa() && op.adj_y()) { rhs = rewriter.create(op.getLoc(), rhs_type, rhs); } - // TODO(silvasean): Support dynamic shapes. - if (!lhs_type.hasStaticShape() || !rhs_type.hasStaticShape()) { - return failure(); - } // Broadcast both operands. BroadcastBatchMatMulV2Operands(lhs, rhs, op.getLoc(), &lhs, &rhs, @@ -1766,6 +2164,8 @@ class ConvertBatchMatMulV2Op : public OpRewritePattern { /*lhs_contracting_dimensions=*/lhs_contracting_dimensions, /*rhs_contracting_dimensions=*/rhs_contracting_dimensions, rewriter.getContext()); + // TODO(silvasean): Emit shape checks for contracting dimensions. + // (The batch dimensions are checked by the broadcasting logic) rewriter.replaceOpWithNewOp(op, op.getType(), lhs, rhs, dimension_numbers, /*precision_config=*/nullptr); @@ -1981,11 +2381,16 @@ class ConvertSplitVOp : public OpRewritePattern { // negative strides and Reshape op to update the output shape. Indices and // strides operands are converted to attributes with non-negative indexing. // +// If the begin input is not a compile time constant, the begin input needs to +// be sliced and the slice needs to be lowered to xla_hlo.DynamicSlice. In this +// case, strides must have a known value of 1 (otherwise we have insufficient +// information to conform to XLA's op semantics). 
+// // For example with an op like following, // tf.StridedSlice(%input, %begin, %end, %strides) {shrink_axis_mask = 1} // : tensor -> tensor // -// Output would be: +// If the %begin input is constant, output would be: // %reversed = "xla_hlo.Reverse" (%input) {dimensions = ...} // %sliced = "xla_hlo.Slice" (%input) // {start_indices = ..., limit_indices = ..., strides = ...} @@ -1995,31 +2400,16 @@ class ConvertStridedSliceOp : public OpRewritePattern { public: using OpRewritePattern::OpRewritePattern; - LogicalResult matchAndRewrite(TF::StridedSliceOp op, - PatternRewriter &rewriter) const override { - // Input shape needs to be static to convert negative indices in TensorFlow - // to absolute indices required by HLO. - // - // TODO(hinsu): Relax this constraint for ops without negative indices and - // strides. - auto input_ty = op.input().getType().dyn_cast(); - if (!input_ty || !input_ty.hasStaticShape()) return failure(); - ArrayRef input_shape = input_ty.getShape(); - - // Output shape needs to be static to apply 'new_axis_mask' or - // 'shrink_axis_mask' by reshaping tensor after slice. - // - // TODO(hinsu): Relax this constraint for ops without the above masks. - auto result_ty = op.getType().dyn_cast(); - if (!result_ty || !result_ty.hasStaticShape()) return failure(); - - SmallVector begin_indices, end_indices, strides; - if (!op.GetSlicedBoundRanges(&begin_indices, &end_indices, &strides)) - return failure(); - + LogicalResult rewriteWithConstantBegin(TF::StridedSliceOp op, + ArrayRef begin_indices, + ArrayRef end_indices, + ArrayRef strides, + RankedTensorType input_ty, + PatternRewriter &rewriter) const { SmallVector hlo_begin_indices, hlo_end_indices, hlo_strides, dims_to_reverse; int64_t input_rank = input_ty.getRank(); + ArrayRef input_shape = input_ty.getShape(); hlo_begin_indices.reserve(input_rank); hlo_end_indices.reserve(input_rank); hlo_strides.reserve(input_rank); @@ -2071,6 +2461,170 @@ class ConvertStridedSliceOp : public OpRewritePattern { rewriter.replaceOpWithNewOp(op, op.getType(), sliced); return success(); } + + LogicalResult rewriteWithUnknownBegin(TF::StridedSliceOp op, + RankedTensorType input_ty, + RankedTensorType result_ty, + PatternRewriter &rewriter) const { + // If begin and end values are dynamic, we can only support this lowering + // if strides are a known value of 1. + DenseIntElementsAttr sparse_strides_attr; + if (!matchPattern(op.strides(), m_Constant(&sparse_strides_attr))) { + return rewriter.notifyMatchFailure( + op, + "requires that strides are known when begin/end values are dynamic"); + } + SmallVector strides; + int64_t stride_value; + for (const APInt &stride : sparse_strides_attr) { + if ((stride_value = stride.getSExtValue()) != 1) { + return rewriter.notifyMatchFailure(op, + "requires that strides are all 1 " + "when begin/end values are dynamic"); + } + strides.push_back(stride_value); + } + + ArrayRef input_shape = input_ty.getShape(); + int last_dim = std::max(static_cast(input_shape.size()) - 1, 0); + + // When begin/end values are dynamic, we can only support shrinking a major + // axis. For instance, if there are 4 dims, we can support a + // shrink_axis_mask of 0001 (1), 0011 (3), 0111 (7), or 1111 (15), but no + // other. 
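The check just below uses APInt::isMask(), i.e. the mask must be a contiguous run of low bits. A tiny standalone sketch of that predicate as it is usually implemented; adding 1 to such a mask carries through every set bit, so ANDing the two leaves nothing behind:

#include <cassert>
#include <cstdint>

// True iff the value is a non-zero contiguous run of ones starting at bit 0
// (0b1, 0b11, 0b111, ...), which here corresponds to shrinking only a prefix
// of major axes.
bool IsLowBitsMask(uint64_t value) {
  return value != 0 && ((value + 1) & value) == 0;
}

int main() {
  assert(IsLowBitsMask(0b0001));
  assert(IsLowBitsMask(0b0111));
  assert(!IsLowBitsMask(0b0101));  // gap: not a prefix of major axes
  assert(!IsLowBitsMask(0b0100));  // only a non-leading axis set
  return 0;
}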
+ bool shrink_axis_mask_ok = op.shrink_axis_mask().isMask(); + if (!shrink_axis_mask_ok) + return rewriter.notifyMatchFailure( + op, + "requires that shrink_axis_mask, if set, refer to a major axis " + "dimension (when begin/end values are dynamic)"); + + // When begin/end values are dynamic, the ellipsis mask, if set, must refer + // to the last dimension. + int ellipsis_mask = op.ellipsis_mask().getZExtValue(); + if (!(ellipsis_mask == 0 || ellipsis_mask == (1 << last_dim))) + return rewriter.notifyMatchFailure( + op, + "requires that ellipsis_mask, if set, refer to the last dimension of " + "input (when begin/end values are dynamic)"); + + APInt begin_mask = op.begin_mask(); + if (!begin_mask.isNullValue()) + return rewriter.notifyMatchFailure( + op, + "requires that begin_mask is either set to 0 or not set when " + "begin/end values are dynamic"); + APInt end_mask = op.end_mask(); + if (!end_mask.isNullValue()) + return rewriter.notifyMatchFailure( + op, + "requires that end_mask is either set to 0 or not set when begin/end " + "values are dynamic"); + APInt new_axis_mask = op.new_axis_mask(); + if (!new_axis_mask.isNullValue()) + return rewriter.notifyMatchFailure( + op, + "requires that new_axis_mask is either set to 0 or not set when " + "begin/end values are dynamic"); + + // In this case where the begin and end values are dynamic, the number of + // output elements has to be equal to the number of input elements that + // are sliced. + int output_elements = result_ty.getNumElements(); + int input_elements_sliced = 1; + + // Begin must be a ranked, 1-dimensional tensor: This is checked by the + // verifier. + int64_t slicing_dim_size = + op.begin().getType().cast().getShape()[0]; + auto input_rank = input_shape.size(); + for (int d = slicing_dim_size; d < input_rank; ++d) { + // We only support slicing major dimensions, so minor dimensions after + // slicing dimensions are all sliced with their full sizes. + input_elements_sliced *= input_shape[d]; + } + if (input_elements_sliced != output_elements) { + return rewriter.notifyMatchFailure( + op, + "requires the number of output elements to be equal to the number of " + "input elements sliced (when begin/end values are dynamic)"); + } + + SmallVector slice_begin_indices; + // For the dimensions that are to be sliced, all have slice sizes of 1. + SmallVector slice_sizes(slicing_dim_size, 1); + auto input_element_ty = input_ty.getElementType(); + // Scalar tensor type. + TensorType type = RankedTensorType::get(/*shape=*/{}, input_element_ty); + Location loc = op.getLoc(); + auto zero = GetScalarConstOfType(input_element_ty, loc, 0, &rewriter); + for (int d = 0; d < slicing_dim_size; ++d) { + auto index = rewriter.create( + loc, op.begin(), GetI64ElementsAttr({d}, &rewriter), + GetI64ElementsAttr({d + 1}, &rewriter), + GetI64ElementsAttr({1}, &rewriter)); + // Convert index to scalar. + auto reshaped_index = rewriter.create(loc, type, index); + // If the index is negative, wrap it around with dimension size. + auto index_negative = + rewriter.create(loc, reshaped_index, zero); + auto input_val = GetScalarConstOfType(input_element_ty, loc, + input_shape[d], &rewriter); + auto wrapped_index = + rewriter.create(loc, input_val, reshaped_index); + auto final_index = rewriter.create( + loc, type, index_negative, wrapped_index, reshaped_index); + slice_begin_indices.push_back(final_index); + } + + // For non-slice dims, get the full slice of that dimension. 
+ for (int d = slicing_dim_size; d < input_shape.size(); ++d) { + slice_sizes.push_back(input_shape[d]); + slice_begin_indices.push_back(zero); + } + + auto slice_sizes_attr = GetI64ElementsAttr(slice_sizes, &rewriter); + // This must be an xla DynamicSlice op due to the inputs that aren't + // constant. + auto sliced = rewriter.create( + loc, op.getType(), op.input(), slice_begin_indices, slice_sizes_attr); + + // Reshape slice result so that the shape is updated depending on + // 'new_axis_mask' or 'shrink_axis_mask' attributes. + rewriter.replaceOpWithNewOp(op, op.getType(), sliced); + return success(); + } + + LogicalResult matchAndRewrite(TF::StridedSliceOp op, + PatternRewriter &rewriter) const override { + // Input shape needs to be static to convert negative indices in TensorFlow + // to absolute indices required by HLO. + // + // TODO(hinsu): Relax this constraint for ops without negative indices and + // strides. + auto input_ty = op.input().getType().dyn_cast(); + if (!input_ty || !input_ty.hasStaticShape()) return failure(); + + // Output shape needs to be static to apply 'new_axis_mask' or + // 'shrink_axis_mask' by reshaping tensor after slice. + // + // TODO(hinsu): Relax this constraint for ops without the above masks. + auto result_ty = op.getType().dyn_cast(); + if (!result_ty || !result_ty.hasStaticShape()) return failure(); + + DenseIntElementsAttr sparse_begin_attr, sparse_end_attr; + if (!matchPattern(op.begin(), m_Constant(&sparse_begin_attr)) || + !matchPattern(op.end(), m_Constant(&sparse_end_attr))) { + return rewriteWithUnknownBegin(op, input_ty, result_ty, rewriter); + } + + SmallVector begin_indices, end_indices, strides; + if (!op.GetSlicedBoundRanges(&begin_indices, &end_indices, &strides)) { + return failure(); + } + return rewriteWithConstantBegin(op, begin_indices, end_indices, strides, + input_ty, rewriter); + } }; // Converts tf.StridedSliceGrad to HLO reshape, reverse and padding ops. @@ -2187,16 +2741,31 @@ class ConvertRangeOp : public OpRewritePattern { auto iota = rewriter.create(op.getLoc(), result_type, rewriter.getI64IntegerAttr(0)); - auto scaled = rewriter.create( + auto scaled = rewriter.create( op.getLoc(), result_type, iota, op.delta(), xla::getBroadcastDimensionsAttr(&rewriter, iota, op.delta())); - rewriter.replaceOpWithNewOp( + rewriter.replaceOpWithNewOp( op, result_type, scaled, op.start(), xla::getBroadcastDimensionsAttr(&rewriter, scaled, op.start())); return success(); } }; +ElementsAttr ConvertAxisAttr(Value val, ElementsAttr attr, Builder *builder) { + auto int_attr = attr.cast(); + auto type = val.getType().cast(); + + SmallVector axis; + axis.reserve(int_attr.getNumElements()); + + int64_t rank = type.getRank(); + for (auto val : int_attr.getValues()) { + axis.push_back((val.getSExtValue() + rank) % rank); + } + + return builder->getI64TensorAttr(axis); +} + /// Converts the LinSpace tensorflow op to a xla_hlo.iota op with a scaling /// and offset applied to generate the linspace values. The output tensor needs /// to have a static shape. The implementation is defined in C++ because there @@ -2223,7 +2792,7 @@ class ConvertLinSpaceOp : public OpRewritePattern { int64_t num = (*num_attr.begin()).getSExtValue(); // Calculate the scaling that needs to be applied to the iota. 
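The scaling derived below reduces to the usual linspace formula, step = (stop - start) / (num - 1) for num > 1, applied to an iota and then offset by start. A scalar sketch of that arithmetic:

#include <cstdio>
#include <vector>

// tf.LinSpace semantics: `num` evenly spaced values from start to stop,
// inclusive of both endpoints when num > 1.
std::vector<float> LinSpace(float start, float stop, int num) {
  std::vector<float> values;
  values.reserve(num);
  float step = num > 1 ? (stop - start) / (num - 1) : 0.0f;
  for (int i = 0; i < num; ++i) values.push_back(start + i * step);
  return values;
}

int main() {
  for (float v : LinSpace(0.0f, 10.0f, 5)) std::printf("%.1f ", v);
  std::printf("\n");  // 0.0 2.5 5.0 7.5 10.0
  return 0;
}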
- auto step_numerator = rewriter.create( + auto step_numerator = rewriter.create( op.getLoc(), op.start().getType(), op.stop(), op.start(), xla::getBroadcastDimensionsAttr(&rewriter, op.stop(), op.start())); Value step_denominator = rewriter.create( @@ -2231,11 +2800,11 @@ class ConvertLinSpaceOp : public OpRewritePattern { if (num > 1) { Value one = GetScalarConstOfType(result_type.getElementType(), op.getLoc(), 1, &rewriter); - step_denominator = rewriter.create( + step_denominator = rewriter.create( op.getLoc(), step_denominator.getType(), step_denominator, one, xla::getBroadcastDimensionsAttr(&rewriter, step_denominator, one)); } - auto step = rewriter.create( + auto step = rewriter.create( op.getLoc(), step_numerator.getType(), step_numerator, step_denominator, xla::getBroadcastDimensionsAttr(&rewriter, step_numerator, step_denominator)); @@ -2243,10 +2812,10 @@ class ConvertLinSpaceOp : public OpRewritePattern { // Scale the iota and add the offset. auto iota = rewriter.create(op.getLoc(), result_type, rewriter.getI64IntegerAttr(0)); - auto scaled = rewriter.create( + auto scaled = rewriter.create( op.getLoc(), result_type, iota, step, xla::getBroadcastDimensionsAttr(&rewriter, iota, step)); - rewriter.replaceOpWithNewOp( + rewriter.replaceOpWithNewOp( op, result_type, scaled, op.start(), xla::getBroadcastDimensionsAttr(&rewriter, scaled, op.start())); return success(); @@ -2322,8 +2891,8 @@ class GenericConvertReductionOp : public OpRewritePattern { auto divisor = GetScalarConstOfType(reduce_element_type, loc, divisor_count, &rewriter); auto broadcast_dims = GetI64ElementsAttr({}, &rewriter); - result = rewriter.create(loc, result, divisor.getResult(), - broadcast_dims); + result = rewriter.create( + loc, result, divisor.getResult(), broadcast_dims); } result = rewriter.create(loc, result, element_type); @@ -2670,23 +3239,25 @@ class ConvertTileOp : public OpRewritePattern { } }; -class ConvertMaxPoolGradOp : public OpRewritePattern { +template +class ConvertMaxPoolGradOp : public OpRewritePattern { public: - using OpRewritePattern::OpRewritePattern; + using OpRewritePattern::OpRewritePattern; - LogicalResult matchAndRewrite(TF::MaxPoolGradOp op, + LogicalResult matchAndRewrite(OpTy op, PatternRewriter &rewriter) const override { Location loc = op.getLoc(); Type element_type = - op.orig_input().getType().cast().getElementType(); + op.orig_input().getType().template cast().getElementType(); // Compute paddings using the original input and kernel shape and strides. // Here, ReduceWindow op as used as the MaxPool op is lowered to the // ReduceWindow op. 
- auto input_ty = op.orig_input().getType().dyn_cast(); + auto input_ty = + op.orig_input().getType().template dyn_cast(); if (!input_ty) return failure(); - DenseIntElementsAttr paddings_attr = GetReduceWindowPadding( + DenseIntElementsAttr paddings_attr = GetReduceWindowPadding( input_ty.getShape(), op.ksize(), op.strides(), op.padding(), &rewriter); auto result = rewriter.create( @@ -2706,7 +3277,6 @@ class ConvertMaxPoolGradOp : public OpRewritePattern { auto reducer = rewriter.create( loc, block->getArgument(0), block->getArgument(1), - /*broadcast_dimensions=*/nullptr, StringAttr::get("GE", rewriter.getContext())); rewriter.create(loc, reducer.getResult()); } @@ -2717,103 +3287,112 @@ class ConvertMaxPoolGradOp : public OpRewritePattern { } }; -// Converts hlo.Conv2DBackpropInputOp into: +using ConvertMaxPool2DGradOp = + ConvertMaxPoolGradOp; +using ConvertMaxPool3DGradOp = + ConvertMaxPoolGradOp; + +// Converts tf.Conv?DBackpropInputOp into: // %rev_filter = "xla_hlo.reverse"(%filter) // %result = "xla_hlo.convolution"(%out_backprop, %rev_filter) -class ConvertConv2DBackpropInputOp - : public OpRewritePattern { +template +class ConvertConvBackpropInputOp : public OpRewritePattern { public: - using OpRewritePattern::OpRewritePattern; + using OpRewritePattern::OpRewritePattern; - LogicalResult matchAndRewrite(TF::Conv2DBackpropInputOp op, + LogicalResult matchAndRewrite(OpTy op, PatternRewriter &rewriter) const override { // Unpack all of the attributes. tensorflow::TensorFormat data_format; - if (!FormatFromString(op.data_format().str(), &data_format)) { + if (!FormatFromString(op.data_format().str(), &data_format)) return failure(); - } + tensorflow::Padding padding; if (!GetPaddingFromString(op.padding().str(), &padding).ok()) return failure(); auto out_backprop_ty = - op.out_backprop().getType().dyn_cast(); - if (!out_backprop_ty || !out_backprop_ty.hasStaticShape()) return failure(); - ArrayRef out_backprop_shape = out_backprop_ty.getShape(); - auto filter_ty = op.filter().getType().dyn_cast(); - if (!filter_ty || !filter_ty.hasStaticShape()) return failure(); - ArrayRef filter_shape = filter_ty.getShape(); - int num_spatial_dims = 2; - Location loc = op.getLoc(); + op.out_backprop().getType().template dyn_cast(); + auto filter_ty = + op.filter().getType().template dyn_cast(); - int num_dims = num_spatial_dims + 2; - int batch_dim = tensorflow::GetTensorBatchDimIndex(num_dims, data_format); - int feature_dim = - tensorflow::GetTensorFeatureDimIndex(num_dims, data_format); + for (RankedTensorType ty : {out_backprop_ty, filter_ty}) + if (!ty || !ty.hasStaticShape()) return failure(); DenseIntElementsAttr input_shape_attr; if (!matchPattern(op.input_sizes(), m_Constant(&input_shape_attr)) || - input_shape_attr.getType().getRank() != 1) { + input_shape_attr.getType().getRank() != 1) return failure(); - } - auto input_shape = - llvm::to_vector<4>(input_shape_attr.getValues()); - if (input_shape.size() != num_dims) return failure(); - auto batch_dim_attr = rewriter.getI64IntegerAttr(batch_dim); - auto feature_dim_attr = rewriter.getI64IntegerAttr(feature_dim); + auto input_shape = input_shape_attr.getValues(); + auto dilations_attr = GetI64ElementsAttr(op.dilations()); + std::vector dilations{ + dilations_attr.template getValues().begin(), + dilations_attr.template getValues().end()}; auto strides_attr = GetI64ElementsAttr(op.strides()); std::vector strides{ - strides_attr.getValues().begin(), - strides_attr.getValues().end()}; - auto dilations_attr = 
GetI64ElementsAttr(op.dilations()); - std::vector dilations{dilations_attr.getValues().begin(), - dilations_attr.getValues().end()}; - auto explicit_paddings_attr = GetI64ElementsAttr(op.explicit_paddings()); - std::vector explicit_paddings{ - explicit_paddings_attr.getValues().begin(), - explicit_paddings_attr.getValues().end()}; + strides_attr.template getValues().begin(), + strides_attr.template getValues().end()}; - int64_t in_depth = input_shape[feature_dim]; - int64_t filter_in_depth = filter_shape[num_spatial_dims]; - int64_t feature_group_count = in_depth / filter_in_depth; + std::vector explicit_paddings; + if (padding == tensorflow::Padding::EXPLICIT) { + // EXPLICIT padding mode and the associated attribute is limited to + // Conv2DBackpropInput. So, fetch attribute by identifier instead of the + // op.explicit_paddings() attribute getter. + ArrayRef explicit_paddings_attr = + op.template getAttrOfType("explicit_paddings").getValue(); + explicit_paddings.reserve(explicit_paddings_attr.size()); + for (Attribute explicit_padding : explicit_paddings_attr) + explicit_paddings.push_back( + explicit_padding.cast().getInt()); + } + + constexpr int num_dims = num_spatial_dims + 2; + ArrayRef filter_shape = filter_ty.getShape(); // Reuse dimension computation logic from conv_grad_shape_utils.cc. tensorflow::ConvBackpropDimensions dims; if (!tensorflow::ConvBackpropComputeDimensionsV2( - "", num_spatial_dims, ToTensorShape(input_shape), - ToTensorShape(filter_shape), - ToTensorShape(out_backprop_shape), dilations, strides, - padding, explicit_paddings, data_format, &dims) + /*label=*/"", num_spatial_dims, + ToTensorShape(input_shape), + ToTensorShape(filter_shape), + ToTensorShape(out_backprop_ty.getShape()), + dilations, strides, padding, explicit_paddings, data_format, &dims) .ok()) { return failure(); } // Compute ConvDimensionNumbers, dilation, and padding. 
- SmallVector kernel_spatial_dims(num_spatial_dims); - SmallVector conv_paddings(num_spatial_dims * 2); - SmallVector lhs_dilation(num_spatial_dims); - SmallVector rhs_dilation(num_spatial_dims); - SmallVector ones(num_spatial_dims, 1); - SmallVector spatial_dims(num_spatial_dims); - for (int i = 0; i < num_spatial_dims; ++i) { - int64_t dim = GetTensorSpatialDimIndex(num_dims, data_format, i); - spatial_dims[i] = dim; - kernel_spatial_dims[i] = i; + SmallVector spatial_dims; + SmallVector lhs_dilation; + SmallVector rhs_dilation; + SmallVector paddings; - conv_paddings[i * 2] = dims.spatial_dims[i].pad_before; - conv_paddings[i * 2 + 1] = dims.spatial_dims[i].pad_after; - lhs_dilation[i] = dims.spatial_dims[i].stride; - rhs_dilation[i] = dilations[dim]; + for (int i : llvm::seq(0, num_spatial_dims)) { + const int64_t dim = GetTensorSpatialDimIndex(num_dims, data_format, i); + spatial_dims.push_back(dim); + const auto &spatial_dim_i = dims.spatial_dims[i]; + lhs_dilation.push_back(spatial_dim_i.stride); + rhs_dilation.push_back(dilations[dim]); + paddings.push_back(spatial_dim_i.pad_before); + paddings.push_back(spatial_dim_i.pad_after); } + RankedTensorType paddings_ty = RankedTensorType::get( {num_spatial_dims, 2}, rewriter.getIntegerType(64)); - auto paddings_attr = DenseIntElementsAttr::get(paddings_ty, conv_paddings); + auto paddings_attr = DenseIntElementsAttr::get(paddings_ty, paddings); + auto spatial_dims_attr = GetI64ElementsAttr(spatial_dims, &rewriter); Value filter = op.filter(); + const int feature_dim = + tensorflow::GetTensorFeatureDimIndex(num_dims, data_format); + const int64_t in_depth = *(input_shape.begin() + feature_dim); + const int64_t filter_in_depth = filter_shape[num_spatial_dims]; + const int64_t feature_group_count = in_depth / filter_in_depth; + if (feature_group_count != 1) { /* // TODO(parkers): Convert this code to mlir. @@ -2823,15 +3402,25 @@ class ConvertConv2DBackpropInputOp return failure(); } + auto kernel_spatial_dims_attr = + GetI64ElementsAttrForSeq(0, num_spatial_dims, &rewriter); + // Mirror the filter in the spatial dimensions. 
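Mirroring here means reversing the filter along every spatial axis, which is what the xla_hlo.reverse built just below does; the input gradient of a cross-correlation is itself a cross-correlation against the spatially reversed kernel. For a 1-D kernel that reversal, sketched standalone:

#include <algorithm>
#include <cstdio>
#include <vector>

// Reverse a 1-D kernel; higher-rank filters are reversed independently along
// each spatial axis in the same way.
std::vector<float> MirrorSpatial1D(std::vector<float> kernel) {
  std::reverse(kernel.begin(), kernel.end());
  return kernel;
}

int main() {
  for (float w : MirrorSpatial1D({1.0f, 2.0f, 3.0f})) std::printf("%.0f ", w);
  std::printf("\n");  // 3 2 1
  return 0;
}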
- filter = rewriter.create( - loc, filter, GetI64ElementsAttr(kernel_spatial_dims, &rewriter)); + filter = rewriter.create(op.getLoc(), filter, + kernel_spatial_dims_attr); + + const int batch_dim = + tensorflow::GetTensorBatchDimIndex(num_dims, data_format); + auto batch_dim_attr = rewriter.getI64IntegerAttr(batch_dim); + auto feature_dim_attr = rewriter.getI64IntegerAttr(feature_dim); // activation gradients // = gradients (with padding and dilation) mirrored_weights Value result = rewriter.create( - loc, op.getType(), op.out_backprop(), filter, - /*window_strides=*/GetI64ElementsAttr(ones, &rewriter), + op.getLoc(), op.getType(), op.out_backprop(), filter, + /*window_strides=*/ + GetI64ElementsAttrForValue(/*size=*/num_spatial_dims, /*val=*/1, + &rewriter), /*padding=*/paddings_attr, GetI64ElementsAttr(lhs_dilation, &rewriter), GetI64ElementsAttr(rhs_dilation, &rewriter), ConvDimensionNumbers::get( @@ -2845,8 +3434,7 @@ class ConvertConv2DBackpropInputOp rewriter.getI64IntegerAttr(num_spatial_dims + 1), /*kernel_output_feature_dimension=*/ rewriter.getI64IntegerAttr(num_spatial_dims), - /*kernel_spatial_dimensions=*/ - GetI64ElementsAttr(kernel_spatial_dims, &rewriter), + /*kernel_spatial_dimensions=*/kernel_spatial_dims_attr, /*output_batch_dimension=*/batch_dim_attr, /*output_feature_dimension=*/feature_dim_attr, /*output_spatial_dimensions=*/spatial_dims_attr, @@ -2861,67 +3449,79 @@ class ConvertConv2DBackpropInputOp } }; -// Converts tf.Conv2DBackpropFilterOp into: -// %result = "xla_hlo.convolution"(%input, %out_backprop) -class ConvertConv2DBackpropFilterOp - : public OpRewritePattern { - public: - using OpRewritePattern::OpRewritePattern; +using ConvertConv2DBackpropInputOp = + ConvertConvBackpropInputOp; +using ConvertConv3DBackpropInputOp = + ConvertConvBackpropInputOp; - LogicalResult matchAndRewrite(TF::Conv2DBackpropFilterOp op, +// Converts tf.Conv?DBackpropFilterOp into: +// %result = "xla_hlo.convolution"(%input, %out_backprop) +template +class ConvertConvBackpropFilterOp : public OpRewritePattern { + public: + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(OpTy op, PatternRewriter &rewriter) const override { // Unpack all of the attributes. 
tensorflow::TensorFormat data_format; - if (!FormatFromString(op.data_format().str(), &data_format)) { + if (!FormatFromString(op.data_format().str(), &data_format)) return failure(); - } + tensorflow::Padding padding; if (!GetPaddingFromString(op.padding().str(), &padding).ok()) return failure(); auto out_backprop_ty = - op.out_backprop().getType().dyn_cast(); - if (!out_backprop_ty || !out_backprop_ty.hasStaticShape()) return failure(); + op.out_backprop().getType().template dyn_cast(); + auto input_ty = op.input().getType().template dyn_cast(); + + for (RankedTensorType ty : {out_backprop_ty, input_ty}) + if (!ty || !ty.hasStaticShape()) return failure(); + ArrayRef out_backprop_shape = out_backprop_ty.getShape(); - auto input_ty = op.input().getType().dyn_cast(); - if (!input_ty || !input_ty.hasStaticShape()) return failure(); ArrayRef input_shape = input_ty.getShape(); DenseIntElementsAttr filter_shape_attr; if (!matchPattern(op.filter_sizes(), m_Constant(&filter_shape_attr)) || - filter_shape_attr.getType().getRank() != 1) { + filter_shape_attr.getType().getRank() != 1) return failure(); - } + auto dilations_attr = GetI64ElementsAttr(op.dilations()); + std::vector dilations{ + dilations_attr.template getValues().begin(), + dilations_attr.template getValues().end()}; auto strides_attr = GetI64ElementsAttr(op.strides()); std::vector strides{ - strides_attr.getValues().begin(), - strides_attr.getValues().end()}; - auto dilations_attr = GetI64ElementsAttr(op.dilations()); - SmallVector dilations{dilations_attr.getValues().begin(), - dilations_attr.getValues().end()}; - auto explicit_paddings_attr = GetI64ElementsAttr(op.explicit_paddings()); - SmallVector explicit_paddings{ - explicit_paddings_attr.getValues().begin(), - explicit_paddings_attr.getValues().end()}; + strides_attr.template getValues().begin(), + strides_attr.template getValues().end()}; - int num_spatial_dims = 2; - int num_dims = num_spatial_dims + 2; - int batch_dim = tensorflow::GetTensorBatchDimIndex(num_dims, data_format); - int feature_dim = - tensorflow::GetTensorFeatureDimIndex(num_dims, data_format); + std::vector explicit_paddings; + if (padding == tensorflow::Padding::EXPLICIT) { + // EXPLICIT padding mode and the associated attribute is limited to + // Conv2DBackpropFilter. So, fetch attribute by identifier instead of the + // op.explicit_paddings() attribute getter. + ArrayRef explicit_paddings_attr = + op.template getAttrOfType("explicit_paddings").getValue(); + explicit_paddings.reserve(explicit_paddings_attr.size()); + for (Attribute explicit_padding : explicit_paddings_attr) + explicit_paddings.push_back( + explicit_padding.cast().getInt()); + } - auto filter_shape = - llvm::to_vector<4>(filter_shape_attr.getValues()); - if (filter_shape.size() != num_dims) return failure(); + constexpr int num_dims = num_spatial_dims + 2; + auto filter_shape = filter_shape_attr.getValues(); // Reuse dimension computation logic from conv_grad_shape_utils.cc. tensorflow::ConvBackpropDimensions dims; if (!tensorflow::ConvBackpropComputeDimensionsV2( - "", num_spatial_dims, ToTensorShape(input_shape), - ToTensorShape(filter_shape), - ToTensorShape(out_backprop_shape), dilations, strides, - padding, explicit_paddings, data_format, &dims) + /*label=*/"", num_spatial_dims, + ToTensorShape(input_shape), + ToTensorShape(filter_shape), + ToTensorShape(out_backprop_shape), dilations, + strides, padding, explicit_paddings, data_format, &dims) .ok()) { return failure(); } @@ -2932,9 +3532,12 @@ class ConvertConv2DBackpropFilterOp // 1. 
In the case of group convolution, move the num_groups dimension before // the batch dimension // 2. Swap the roles of the batch and feature dimensions. - int64_t in_depth = input_shape[feature_dim]; - int64_t filter_in_depth = filter_shape[num_spatial_dims]; - int64_t feature_group_count = in_depth / filter_in_depth; + const int feature_dim = + tensorflow::GetTensorFeatureDimIndex(num_dims, data_format); + const int64_t in_depth = input_shape[feature_dim]; + const int64_t filter_in_depth = *(filter_shape.begin() + num_spatial_dims); + const int64_t feature_group_count = in_depth / filter_in_depth; + if (feature_group_count != 1) { /* // TODO(parkers): translate this code to mlir. @@ -2946,21 +3549,20 @@ class ConvertConv2DBackpropFilterOp } // Compute ConvDimensionNumbers, dilation, and padding. - SmallVector conv_padding(num_spatial_dims * 2); - SmallVector rhs_dilation(num_spatial_dims); - SmallVector window_strides(num_spatial_dims); - SmallVector lhs_dilation(num_spatial_dims, 1); - SmallVector spatial_dims(num_spatial_dims); - SmallVector kernel_spatial_dims(num_spatial_dims); + SmallVector spatial_dims; + SmallVector kernel_spatial_dims; + SmallVector rhs_dilation; + SmallVector paddings; + SmallVector window_strides; // The filter gradients are computed by a convolution of the input // activations and the output gradients, with some appropriate padding. // See the comment at the top of conv_grad_ops.h for details. - for (int64_t i = 0; i < num_spatial_dims; ++i) { - int64_t dim = + for (int i : llvm::seq(0, num_spatial_dims)) { + const int64_t dim = tensorflow::GetTensorSpatialDimIndex(num_dims, data_format, i); - kernel_spatial_dims[i] = dim; + kernel_spatial_dims.push_back(dim); // Besides padding the input, we will also expand output_rows to // expanded_out_rows = (output_rows - 1) * stride + 1 // with zeros in between: @@ -2969,8 +3571,9 @@ class ConvertConv2DBackpropFilterOp // // This is done by specifying the window dilation factors in the // convolution HLO below. - rhs_dilation[i] = dims.spatial_dims[i].stride; - window_strides[i] = dilations[dim]; + const auto &spatial_dim_i = dims.spatial_dims[i]; + rhs_dilation.push_back(spatial_dim_i.stride); + window_strides.push_back(dilations[dim]); // We will also need to pad the input with zeros such that after the // convolution, we get the right size for the filter. @@ -2978,8 +3581,8 @@ class ConvertConv2DBackpropFilterOp // expanded_out_rows as a filter, we should get filter_rows back. const int64_t padded_in_size = - dims.spatial_dims[i].expanded_output_size + - (dims.spatial_dims[i].filter_size - 1) * dilations[dim]; + spatial_dim_i.expanded_output_size + + (spatial_dim_i.filter_size - 1) * dilations[dim]; // However it can be smaller than input_rows: in this // case it means some of the inputs are not used. @@ -2995,8 +3598,7 @@ class ConvertConv2DBackpropFilterOp // and input "C" is not used at all. // // We apply negative padding in this case. - const int64_t pad_total = - padded_in_size - dims.spatial_dims[i].input_size; + const int64_t pad_total = padded_in_size - spatial_dim_i.input_size; // + For the EXPLICIT padding, we pad the top/left side with the explicit // padding and pad the bottom/right side with the remaining space. @@ -3013,26 +3615,27 @@ class ConvertConv2DBackpropFilterOp : padding == tensorflow::Padding::SAME ? 
std::max(pad_total / 2, 0) : 0; - conv_padding[i * 2] = pad_before; - conv_padding[i * 2 + 1] = pad_total - pad_before; + paddings.push_back(pad_before); + paddings.push_back(pad_total - pad_before); } RankedTensorType paddings_ty = RankedTensorType::get( {num_spatial_dims, 2}, rewriter.getIntegerType(64)); - auto paddings_attr = DenseIntElementsAttr::get(paddings_ty, conv_padding); - auto out_spatial_dims_attr = - GetI64ElementsAttrForSeq(0, num_spatial_dims, &rewriter); + auto paddings_attr = DenseIntElementsAttr::get(paddings_ty, paddings); auto kernel_spatial_dims_attr = GetI64ElementsAttr(kernel_spatial_dims, &rewriter); + const int batch_dim = + tensorflow::GetTensorBatchDimIndex(num_dims, data_format); auto batch_dim_attr = rewriter.getI64IntegerAttr(batch_dim); auto feature_dim_attr = rewriter.getI64IntegerAttr(feature_dim); - Location loc = op.getLoc(); Value result = rewriter.create( - loc, op.getType(), op.input(), op.out_backprop(), + op.getLoc(), op.getType(), op.input(), op.out_backprop(), /*window_strides=*/GetI64ElementsAttr(window_strides, &rewriter), - /*padding=*/paddings_attr, GetI64ElementsAttr(lhs_dilation, &rewriter), + /*padding=*/paddings_attr, /*lhs_dilation=*/ + GetI64ElementsAttrForValue(/*size=*/num_spatial_dims, /*val=*/1, + &rewriter), GetI64ElementsAttr(rhs_dilation, &rewriter), ConvDimensionNumbers::get( // Swap batch_dim and feature_dim in the activations. @@ -3050,7 +3653,8 @@ class ConvertConv2DBackpropFilterOp rewriter.getI64IntegerAttr(num_spatial_dims), /*output_feature_dimension=*/ rewriter.getI64IntegerAttr(num_spatial_dims + 1), - /*output_spatial_dimensions=*/out_spatial_dims_attr, + /*output_spatial_dimensions=*/ + GetI64ElementsAttrForSeq(0, num_spatial_dims, &rewriter), rewriter.getContext()), rewriter.getI64IntegerAttr(feature_group_count), /*batch_group_count=*/rewriter.getI64IntegerAttr(1), @@ -3062,6 +3666,13 @@ class ConvertConv2DBackpropFilterOp } }; +using ConvertConv2DBackpropFilterOp = + ConvertConvBackpropFilterOp; +using ConvertConv3DBackpropFilterOp = + ConvertConvBackpropFilterOp; + class ConvertOneHotOp : public OpRewritePattern { public: using OpRewritePattern::OpRewritePattern; @@ -3091,13 +3702,20 @@ class ConvertOneHotOp : public OpRewritePattern { output_dims.insert(output_dims.begin() + axis, depth); Location loc = op.getLoc(); + + // The iota result is the effective output shape of the computation, + // and indices must be broadcast into it. At this point, this computation + // would need to be reworked quite a bit to support dynamic shapes, so + // just using static broadcasting. 
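The iota/compare/select sequence that follows implements exactly that: build an iota along the one-hot axis of the output shape, statically broadcast the indices into that shape, compare for equality, and select between on_value and off_value. A numpy sketch of the same computation (illustrative only, not the MLIR code):

import numpy as np

def one_hot(indices, depth, on_value=1.0, off_value=0.0, axis=-1):
    indices = np.asarray(indices)
    if axis < 0:
        axis += indices.ndim + 1
    out_rank = indices.ndim + 1
    # Iota along the one-hot axis of the (static) output shape.
    iota_shape = [depth if d == axis else 1 for d in range(out_rank)]
    iota = np.arange(depth).reshape(iota_shape)
    # Statically broadcast the indices into the output shape.
    broadcast_indices = np.expand_dims(indices, axis)
    compare = broadcast_indices == iota            # "EQ" comparison
    return np.where(compare, on_value, off_value)  # select(on_value, off_value)

print(one_hot([1, 3, 0], depth=4))
# [[0. 1. 0. 0.]
#  [0. 0. 0. 1.]
#  [1. 0. 0. 0.]]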
auto index_type = RankedTensorType::get(output_dims, element_type); - Value compare = rewriter.create( - loc, op.indices(), - rewriter.create( - loc, index_type, - IntegerAttr::get(rewriter.getIntegerType(64), axis)), - GetI64ElementsAttr(broadcast_dims, &rewriter), + auto iota = rewriter.create( + loc, index_type, IntegerAttr::get(rewriter.getIntegerType(64), axis)); + auto broadcast_indices = rewriter.create( + loc, index_type, op.indices(), + GetI64ElementsAttr(broadcast_dims, &rewriter)); + + Value compare = rewriter.create( + loc, broadcast_indices, iota, StringAttr::get("EQ", rewriter.getContext())); Value on_value = rewriter.create( loc, op.getType(), op.on_value(), @@ -3163,6 +3781,27 @@ class ConvertInfeedDequeueTupleOp auto data_and_token = rewriter.create(op.getLoc(), data_and_token_type, token, /*infeed_config=*/rewriter.getStringAttr("")); + if (op._XlaSharding().hasValue()) { + // _XlaSharding attribute in TF is a serialized string of the OpSharding + // proto, so convert to a text form here. + ::xla::OpSharding sharding_proto; + if (!sharding_proto.ParseFromString(op._XlaSharding().getValue().str())) + return failure(); + + // Token is a control signal and not a real data, so arbitrarily assign + // the token to device 0. + if (sharding_proto.type() == ::xla::OpSharding::TUPLE) + *sharding_proto.add_tuple_shardings() = + ::xla::sharding_builder::AssignDevice(0); + + std::string sharding_str; + if (!::tensorflow::protobuf::TextFormat::PrintToString(sharding_proto, + &sharding_str)) + return failure(); + + data_and_token.setAttr(kShardingAttr, + rewriter.getStringAttr(sharding_str)); + } // The infeed instruction produces a tuple of the infeed data and a token // type. Emit get_tuple_element to get infeed data tuple. @@ -3702,36 +4341,91 @@ class ConvertXlaShardingOp : public OpRewritePattern { PatternRewriter &rewriter) const override { // TODO(b/148313088): define sharding attribute struct in MLIR intead of // using a string. - auto sharding = op.getAttrOfType("_XlaSharding"); - if (!sharding) { - return failure(); - } + if (!op._XlaSharding().hasValue()) return failure(); // _XlaSharding attribute in TF is a serialized string of the OpSharding // proto, so convert to a text form here. ::xla::OpSharding sharding_proto; std::string sharding_str; - if (!sharding_proto.ParseFromString(sharding.getValue().str())) { + if (!sharding_proto.ParseFromString(op._XlaSharding().getValue().str()) || + !::tensorflow::protobuf::TextFormat::PrintToString(sharding_proto, + &sharding_str)) return failure(); - } - if (!::tensorflow::protobuf::TextFormat::PrintToString(sharding_proto, - &sharding_str)) { - return failure(); - } auto custom_call = rewriter.create( op.getLoc(), op.getType(), op.input(), /*call_target_name=*/rewriter.getStringAttr("Sharding"), /*has_side_effect=*/rewriter.getBoolAttr(false), /*backend_config=*/rewriter.getStringAttr("")); - custom_call.setAttr("xla_hlo.sharding", - rewriter.getStringAttr(sharding_str)); + custom_call.setAttr(kShardingAttr, rewriter.getStringAttr(sharding_str)); rewriter.replaceOp(op, custom_call.getResult()); return success(); } }; +// Converts a TF InplaceUpdate op to DynamicUpdateSlice HLO. 
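The converter below lowers InplaceUpdate by unpacking the index vector i, splitting the update tensor v into single rows, and emitting one DynamicUpdateSlice per (index, row) pair. As a numpy reference for the op's semantics (names are illustrative):

import numpy as np

def inplace_update(x, i, v):
    # x: [d0, ...] tensor, i: [k] row indices, v: [k, ...] replacement rows.
    out = np.array(x, copy=True)
    for row, update in zip(i, v):  # one dynamic-update-slice per unpacked index
        out[row] = update
    return out

x = np.zeros((4, 3))
print(inplace_update(x, [2, 0], [[1, 1, 1], [2, 2, 2]]))
# row 2 becomes [1. 1. 1.] and row 0 becomes [2. 2. 2.]; rows 1 and 3 stay zero.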
+class ConvertInplaceUpdateOp : public OpRewritePattern { + public: + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(TF::InplaceUpdateOp op, + PatternRewriter &rewriter) const override { + auto input = op.x(); + auto indices = op.i(); + auto updates = op.v(); + + // Slice each row of `i` and `v` to perform a separate dynamic-update-slice + // on the contents of `x`. + auto input_type = input.getType().cast(); + auto updates_type = updates.getType().cast(); + auto indices_type = indices.getType().cast(); + if (!indices_type.hasStaticShape()) return failure(); + + if (indices_type.getRank() != 1) return failure(); + + SmallVector unpacked_indices_type( + indices_type.getDimSize(0), + RankedTensorType::get({}, indices_type.getElementType())); + auto zero_attr = IntegerAttr::get(rewriter.getIntegerType(64), 0); + auto unpacked_indices = rewriter.create( + op.getLoc(), unpacked_indices_type, indices, zero_attr); + + SmallVector split_updates_shape; + split_updates_shape.append(updates_type.getShape().begin(), + updates_type.getShape().end()); + split_updates_shape.front() = 1; + SmallVector split_updates_type; + split_updates_type.resize( + updates_type.getShape().front(), + RankedTensorType::get(split_updates_shape, + updates_type.getElementType())); + + auto cst = + rewriter.create(op.getLoc(), zero_attr).getResult(); + auto split_updates = rewriter.create( + op.getLoc(), split_updates_type, cst, updates); + + SmallVector input_indices; + input_indices.resize(input_type.getRank(), cst); + + SmallVector starts(updates_type.getRank(), 0); + SmallVector strides(updates_type.getRank(), 1); + SmallVector limits(updates_type.getShape().begin(), + updates_type.getShape().end()); + + for (auto pair : + llvm::zip(unpacked_indices.output(), split_updates.output())) { + input_indices.front() = std::get<0>(pair); + input = rewriter.create( + op.getLoc(), op.getType(), input, std::get<1>(pair), input_indices); + } + + rewriter.replaceOp(op, input); + return success(); + } +}; + // Converts a TF XlaDynamicUpdateSlice op to DynamicUpdateSlice HLO. class ConvertXlaDynamicUpdateSliceOp : public OpRewritePattern { @@ -3831,9 +4525,561 @@ class ConvertCumsumOp : public OpRewritePattern { } }; +// Converts a TF QR op to HLO. +class ConvertQrOp : public OpRewritePattern { + public: + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(TF::QrOp op, + PatternRewriter &rewriter) const override { + // Block Householder QR Factorization. Algorithm 5.2.2 of Golub and van + // Loan. def qr_blocked(a, block_size): + // m = a.shape[0] + // n = a.shape[1] + // q = np.eye(m) + // for i in xrange(0, min(m, n), block_size): + // k = min(block_size, min(m, n) - s) + // (a, vs, taus) = qr(a[i:, i:i+k]) + // y = vs + // w = ComputeWYRepresentation(vs, taus, m-i, k) + // a[i:, i+r:] += np.dot(y, np.dot(w.T, a[i:, i+k:])) + // q[:, i:] += np.dot(q[:, i:], np.dot(w, y.T)) + // return (q, a) + auto type = op.input().getType().dyn_cast(); + if (!type || !type.hasStaticShape()) return failure(); + // The block size is chosen to match old bridge lowering. 
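As a self-contained reference for the algebra, here is the unblocked inner routine from the pseudocode above written in numpy. It is a sketch for checking the math (House reflection plus column-by-column updates), not the MLIR lowering itself:

import numpy as np

def house(x, k):
    # Full-length Householder vector zeroing x[k+1:], mirroring the
    # static-shape formulation: v[:k] = 0, v[k] = 1.
    alpha = x[k]
    sigma = np.dot(x[k + 1:], x[k + 1:])
    if sigma == 0.0:
        return np.zeros_like(x), 0.0, alpha
    mu = np.sqrt(alpha * alpha + sigma)
    beta = mu if alpha < 0 else -mu
    tau = (beta - alpha) / beta
    v = np.zeros_like(x)
    v[k] = 1.0
    v[k + 1:] = x[k + 1:] / (alpha - beta)
    return v, tau, beta

def qr_unblocked(a):
    a = np.array(a, dtype=float)
    m, n = a.shape
    q = np.eye(m)
    for j in range(min(m, n)):
        v, tau, beta = house(a[:, j].copy(), j)
        # Apply H = I - tau * v v^T to all of a (a no-op for columns before j).
        a -= tau * np.outer(v, v @ a)
        # Form column j explicitly for better precision.
        a[j, j] = beta
        a[j + 1:, j] = 0.0
        # Accumulate q <- q H.
        q -= tau * np.outer(q @ v, v)
    return q, a

rng = np.random.default_rng(0)
a = rng.normal(size=(5, 3))
q, r = qr_unblocked(a)
assert np.allclose(q @ r, a) and np.allclose(q.T @ q, np.eye(5))

The blocked version above differs only in that it runs this routine on panels of kBlockSize columns and folds the accumulated reflectors into the trailing matrix through the W-Y form.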
+ constexpr int64_t kBlockSize = 128; + Value a = op.input(); + int64_t m = type.getDimSize(type.getRank() - 2); + int64_t n = type.getDimSize(type.getRank() - 1); + int64_t p = std::min(m, n); + auto batch_dims = type.getShape().drop_back(2); + auto iota_type = RankedTensorType::get({m, m}, rewriter.getIntegerType(32)); + auto iota0 = rewriter.create(op.getLoc(), iota_type, + rewriter.getI64IntegerAttr(0)); + auto iota1 = rewriter.create(op.getLoc(), iota_type, + rewriter.getI64IntegerAttr(1)); + Value compare = rewriter.create( + op.getLoc(), iota0, iota1, + StringAttr::get("EQ", rewriter.getContext())); + Value identity_matrix = + rewriter.create(op.getLoc(), compare, type.getElementType()); + auto q_shape = llvm::to_vector<4>(type.getShape()); + q_shape.back() = m; + Value q = rewriter.create( + op.getLoc(), RankedTensorType::get(q_shape, type.getElementType()), + identity_matrix, GetI64ElementsAttr(batch_dims, &rewriter)); + auto precision_config = rewriter.getStrArrayAttr({"HIGHEST", "HIGHEST"}); + for (int64_t i = 0; i < p; i += kBlockSize) { + int64_t k = std::min(kBlockSize, p - i); + auto a_block = + SliceInMinorDims(op.getLoc(), a, {i, i}, {m, i + k}, &rewriter); + Value r_block; + Value taus; + Value vs; + QRBlock(op.getLoc(), a_block, &r_block, &taus, &vs, &rewriter); + a = UpdateSliceInMinorDims(op.getLoc(), a, r_block, {i, i}, &rewriter); + + // Compute the I-WY block representation of a product of Householder + // matrices. + Value w = + ComputeWYRepresentation(op.getLoc(), type.getElementType(), + batch_dims, vs, taus, m - i, k, &rewriter); + auto y = vs; + + // a[i:, i+k:] += np.dot(Y, np.dot(W.T, a[i:, i+k:])) + Value a_panel = + SliceInMinorDims(op.getLoc(), a, {i, i + k}, {m, n}, &rewriter); + auto a_update = BatchDot(op.getLoc(), w, true, a_panel, false, + batch_dims.size(), precision_config, &rewriter); + a_update = BatchDot(op.getLoc(), y, false, a_update, false, + batch_dims.size(), precision_config, &rewriter); + a_panel = rewriter.create(op.getLoc(), a_panel, a_update); + a = UpdateSliceInMinorDims(op.getLoc(), a, a_panel, {i, i + k}, + &rewriter); + + // q[:, i:] += np.dot(np.dot(q[:, i:], W), Y.T)) + Value q_panel = + SliceInMinorDims(op.getLoc(), q, {0, i}, {m, m}, &rewriter); + Value q_update = BatchDot(op.getLoc(), q_panel, false, w, false, + batch_dims.size(), precision_config, &rewriter); + q_update = BatchDot(op.getLoc(), q_update, false, y, true, + batch_dims.size(), precision_config, &rewriter); + q_panel = rewriter.create(op.getLoc(), q_panel, q_update); + q = UpdateSliceInMinorDims(op.getLoc(), q, q_panel, {i}, &rewriter); + } + // full_matrices is false when only a partial result in needed. Slice to the + // needed dimensions here. + if (!op.full_matrices()) { + q = SliceInMinorDims(op.getLoc(), q, {0, 0}, {m, p}, &rewriter); + a = SliceInMinorDims(op.getLoc(), a, {0, 0}, {p, n}, &rewriter); + } + rewriter.replaceOp(op, {q, a}); + return success(); + } + + private: + // Computes a Householder reflection of the form: + // H = I - tau v v.T. + // such that + // H . ( x1 ) = ( x1 ) + // ( x2 ) = ( x2 ) + // ( ... ) = ( ... ) + // ( xk ) = ( beta ) + // ( ... ) ( 0 ) + // ( ... ) ( 0 ) + // Unlike the usual formulation, we allow the caller to supply 'k' rather than + // only providing the relevant part of 'x' to maintain XLA's static shape + // invariant. In addition, the implementation supports batching. 
+ // Pseudo-code, without batching: + // alpha = x[k] + // x_copy = np.copy(x) + // x_copy[:k+1] = 0 + // xnorm = norm2(x_copy) + // if xnorm == 0: + // beta = alpha + // tau = 0 + // v = np.zeros_like(x) + // else: + // beta = - np.sign(alpha) * dlapy2(alpha, xnorm) + // tau = (beta - alpha) / beta + // v = x / (alpha - beta) + // v[k] = 1 + // return (v, tau, beta) + void House(Location loc, Value x, Value k, ArrayRef batch_dims, + const int64_t m, OpBuilder *builder, Value *v, Value *tau, + Value *beta) const { + auto x_type = x.getType().cast(); + + llvm::SmallVector batch_dim_ids(batch_dims.size()); + std::iota(batch_dim_ids.begin(), batch_dim_ids.end(), 0); + const int64_t minor_dim = batch_dims.size(); + + Value zero = GetScalarConstOfType(x_type.getElementType(), loc, 0, builder); + Value one = GetScalarConstOfType(x_type.getElementType(), loc, 1, builder); + + // alpha = x[k] + Value alpha = DynamicSliceInMinorDims(loc, x, {k}, {1}, builder); + alpha = builder->create( + loc, RankedTensorType::get(batch_dims, x_type.getElementType()), alpha); + + // Compute x[k+1:] (padded with zeros in elements 0..k) + Value iota = builder->create( + loc, RankedTensorType::get({m}, builder->getIntegerType(32)), + builder->getI64IntegerAttr(0)); + Value gtk = builder->create( + loc, iota, k, GetI64ElementsAttr({}, builder), + StringAttr::get("GT", builder->getContext())); + gtk = builder->create(loc, gtk, x_type.getElementType()); + Value x_after_k = builder->create( + loc, x, gtk, GetI64ElementsAttr({minor_dim}, builder)); + Value x_after_k_sq = builder->create(loc, x_after_k, x_after_k); + // sigma = np.dot(x[k+1:], x[k+1:]) + auto sigma = builder->create( + loc, x_after_k_sq, zero, GetI64ElementsAttr({minor_dim}, builder)); + BuildReduceBody(x_type.getElementType(), &sigma.body(), builder); + // mu = np.sqrt(x[k]*x[k] + sigma) + Value alpha_sq = builder->create(loc, alpha, alpha); + Value mu = builder->create( + loc, builder->create(loc, alpha_sq, sigma.getResult(0))); + + Value sigma_is_zero = builder->create( + loc, sigma.getResult(0), zero, GetI64ElementsAttr({}, builder), + StringAttr::get("EQ", builder->getContext())); + Value alpha_is_negative = builder->create( + loc, alpha, zero, GetI64ElementsAttr({}, builder), + StringAttr::get("LT", builder->getContext())); + auto batch_size_one = builder->create( + loc, alpha.getType(), one, GetI64ElementsAttr(batch_dims, builder)); + Value signed_mu = builder->create( + loc, + builder->create(loc, mu.getType(), alpha_is_negative, + batch_size_one, + builder->create(loc, batch_size_one)), + mu, GetI64ElementsAttr({}, builder)); + *beta = builder->create(loc, alpha.getType(), sigma_is_zero, + alpha, signed_mu); + *tau = builder->create( + loc, builder->create(loc, *beta, alpha), *beta); + Value zero_tau = builder->create( + loc, alpha.getType(), zero, GetI64ElementsAttr(batch_dims, builder)); + *tau = builder->create(loc, alpha.getType(), sigma_is_zero, + zero_tau, *tau); + Value divisor = builder->create(loc, alpha, *beta); + divisor = builder->create(loc, divisor.getType(), sigma_is_zero, + batch_size_one, divisor); + + Value eqk = builder->create( + loc, iota, k, GetI64ElementsAttr({}, builder), + StringAttr::get("EQ", builder->getContext())); + eqk = builder->create(loc, eqk, x_type.getElementType()); + llvm::SmallVector e_k_shape(batch_dims.size(), 1); + e_k_shape.push_back(m); + auto e_k = builder->create( + loc, RankedTensorType::get(e_k_shape, x_type.getElementType()), eqk, + GetI64ElementsAttr(llvm::SmallVector(batch_dims.size(), 1), + 
builder)); + + // Form v as [0, 0, ..., 1] ++ x[k+1:] / divisor + // If sigma is zero, x[k+1:] is zero, so use any non-zero divisor. + // Note that the add performs a degenerate broadcast. + *v = builder->create( + loc, e_k, + StaticBinaryBroadcast(loc, x_after_k, divisor, + GetI64ElementsAttr(batch_dim_ids, builder), + *builder), + /*broadcast_dimensions=*/nullptr); + } + + // Householder QR decomposition. Algorithm 5.2.1 from Golub and Van + // Loan "Matrix Computations", 4th Edition. This is an unblocked + // implementation used as an inner routine of the blocked implementation. + // Algorithm is adapted slightly so the shapes inside the loop are static, at + // the cost of some redundant computation. Since this is used as an inner + // block kernel, accumulates the Householder transformations (vs, taus) rather + // than the matrix q. Equivalent Python code, without batching: def qr(a): + // m = a.shape[0] + // n = a.shape[1] + // vs = np.zeros([m, n]) + // taus = np.zeros([n]) + // for j in xrange(min(m, n)): + // v, tau, beta = house(a[:, j], j) + // # Unusually, we apply the Householder transformation to the entirety of + // # a, wasting FLOPs to maintain the static shape invariant that XLA + // # requires. For columns that precede j this has no effect. + // a[:, :] -= tau * np.dot(v[:, np.newaxis], + // np.dot(v[np.newaxis, :], a[:, :])) + // # Form column j explicitly rather than relying on the precision of the + // # Householder update. + // a[j, j] = beta + // a[j+1:, j] = np.zeros([m - j - 1], dtype=a.dtype) + // vs[:, j] = v + // taus[j] = tau + // return (q, vs, taus) + void QRBlock(Location loc, Value a, Value *r, Value *taus, Value *vs, + PatternRewriter *rewriter) const { + auto a_type = a.getType().cast(); + const int num_dims = a_type.getRank(); + assert(num_dims >= 2 && "Argument to QR must have rank >= 2"); + + const int64_t m = a_type.getDimSize(a_type.getRank() - 2); + const int64_t n = a_type.getDimSize(a_type.getRank() - 1); + + const int64_t num_batch_dims = num_dims - 2; + auto batch_dims = a_type.getShape().take_front(num_batch_dims); + llvm::SmallVector batch_dim_indices(batch_dims.size()); + std::iota(batch_dim_indices.begin(), batch_dim_indices.end(), 0); + + auto qr_body_fn = [&](Location loc, Value j, ArrayRef old_values, + SmallVectorImpl *new_values, + OpBuilder *builder) { + auto a = old_values[0]; + auto vs = old_values[1]; + auto taus = old_values[2]; + + // v, beta = house(a[:, j], j) + auto x = DynamicSliceInMinorDims(loc, a, {j}, {1}, builder); + auto x_collapsed_shape = llvm::to_vector<4>(batch_dims); + x_collapsed_shape.push_back(m); + auto x_collapsed = builder->create( + loc, + RankedTensorType::get(x_collapsed_shape, + getElementTypeOrSelf(x.getType())), + x); + Value v, tau, beta; + House(loc, x_collapsed, j, batch_dims, m, builder, &v, &tau, &beta); + + auto shape = llvm::to_vector<4>(batch_dims); + shape.append({1, m}); + auto v_broadcast = builder->create( + loc, RankedTensorType::get(shape, getElementTypeOrSelf(v.getType())), + v); + // a[:, :] -= tau * np.dot(v[:, np.newaxis], + // np.dot(v[np.newaxis, :], a[:, :])) + auto precision = builder->getStrArrayAttr({"HIGHEST", "HIGHEST"}); + auto vva = BatchDot(loc, v_broadcast, false, a, false, num_batch_dims, + precision, builder); + vva = BatchDot(loc, v_broadcast, true, vva, false, num_batch_dims, + precision, builder); + auto tau_x_vva = StaticBinaryBroadcast( + loc, tau, vva, GetI64ElementsAttr(batch_dim_indices, builder), + *builder); + a = builder->create(loc, a, tau_x_vva); + + // It is 
more precise to populate column 'k' explicitly, rather than + // computing it implicitly by applying the Householder transformation. + // a[k,k] = beta + // a[k+1:,k] = np.zeros([m-k-1], dtype=a.dtype) + auto iota = builder->create( + loc, RankedTensorType::get({m, 1}, builder->getIntegerType(32)), + builder->getI64IntegerAttr(0)); + Value predecessor_mask = builder->create( + loc, iota, j, GetI64ElementsAttr({}, builder), + StringAttr::get("LT", builder->getContext())); + predecessor_mask = builder->create(loc, predecessor_mask, + a_type.getElementType()); + Value mask = builder->create( + loc, iota, j, GetI64ElementsAttr({}, builder), + StringAttr::get("EQ", builder->getContext())); + mask = builder->create(loc, mask, a_type.getElementType()); + llvm::SmallVector broadcast_mask_shape(a_type.getRank(), 1); + broadcast_mask_shape[a_type.getRank() - 2] = m; + mask = builder->create( + loc, + RankedTensorType::get(broadcast_mask_shape, a_type.getElementType()), + mask, + GetI64ElementsAttr(llvm::SmallVector(num_batch_dims, 1), + builder)); + Value predecessor_masked_x = StaticBinaryBroadcast( + loc, x, predecessor_mask, + GetI64ElementsAttr({num_dims - 2, num_dims - 1}, builder), *builder); + Value masked_beta = StaticBinaryBroadcast( + loc, beta, mask, GetI64ElementsAttr(batch_dim_indices, builder), + *builder); + Value new_x = + builder->create(loc, predecessor_masked_x, masked_beta); + // Update a[:,j] + llvm::SmallVector dim_ids(num_dims); + std::iota(dim_ids.begin(), dim_ids.end(), 0); + new_x = builder->create( + loc, a_type, new_x, GetI64ElementsAttr(dim_ids, builder)); + const int64_t minor_dim = num_batch_dims; + auto iota_mn = builder->create( + loc, + RankedTensorType::get(a_type.getShape(), builder->getIntegerType(32)), + builder->getI64IntegerAttr(minor_dim + 1)); + Value xa_mask = builder->create( + loc, iota_mn, j, GetI64ElementsAttr({}, builder), + StringAttr::get("EQ", builder->getContext())); + a = builder->create(loc, a_type, xa_mask, new_x, a); + + // vs[:, j] = v + llvm::SmallVector vs_broadcast_dims(num_batch_dims + 1); + std::iota(vs_broadcast_dims.begin(), vs_broadcast_dims.end(), 0); + Value vs_zeros = + GetScalarConstOfType(a_type.getElementType(), loc, 0, builder); + vs_zeros = builder->create( + loc, vs.getType(), vs_zeros, + GetI64ElementsAttr(vs.getType().cast().getShape(), + builder)); + auto vs_update = builder->create( + loc, vs.getType(), xa_mask, + StaticBinaryBroadcast( + loc, vs_zeros, v, GetI64ElementsAttr(vs_broadcast_dims, builder), + *builder), + vs_zeros); + vs = builder->create(loc, vs, vs_update); + + // taus[j] = tau + llvm::SmallVector tau_broadcast_dims(batch_dims.size()); + std::iota(tau_broadcast_dims.begin(), tau_broadcast_dims.end(), 0); + + auto iota_shape = llvm::to_vector<4>(batch_dims); + iota_shape.push_back(n); + auto iota_n = builder->create( + loc, RankedTensorType::get(iota_shape, builder->getIntegerType(32)), + builder->getI64IntegerAttr(minor_dim)); + Value taus_zeros = + GetScalarConstOfType(a_type.getElementType(), loc, 0, builder); + taus_zeros = builder->create( + loc, taus.getType(), taus_zeros, + GetI64ElementsAttr(taus.getType().cast().getShape(), + builder)); + Value taus_mask = builder->create( + loc, iota_n, j, GetI64ElementsAttr({}, builder), + StringAttr::get("EQ", builder->getContext())); + auto taus_update = builder->create( + loc, taus.getType(), taus_mask, + StaticBinaryBroadcast( + loc, taus_zeros, tau, + GetI64ElementsAttr(tau_broadcast_dims, builder), *builder), + taus_zeros); + taus = builder->create(loc, taus, 
taus_update); + new_values->assign({a, vs, taus}); + }; + + Value zero = + GetScalarConstOfType(a_type.getElementType(), loc, 0, rewriter); + *vs = rewriter->create( + loc, a_type, zero, GetI64ElementsAttr(a_type.getShape(), rewriter)); + auto taus_shape = llvm::to_vector<4>(batch_dims); + taus_shape.push_back(n); + *taus = rewriter->create( + loc, RankedTensorType::get(taus_shape, a_type.getElementType()), zero, + GetI64ElementsAttr(taus_shape, rewriter)); + + SmallVector while_output; + CreateWhile32(loc, std::min(m, n), qr_body_fn, {a, *vs, *taus}, + &while_output, rewriter); + *r = while_output[0]; + *vs = while_output[1]; + *taus = while_output[2]; + } + + // Computes W and Y such that I-WY is equivalent to the sequence of + // Householder + // transformations given by vs and taus. + // Golub and van Loan, "Matrix Computations", algorithm 5.1.2. + // Y = np.zeros([m, n]) + // W = np.zeros([m, n]) + // Y[:, 0] = vs[:, 0] + // W[:, 0] = -taus[0] * vs[:, 0] + // for j in xrange(1, n): + // v = vs[:, j] + // z = -taus[j] * v - taus[j] * np.dot(W, np.dot(Y.T, v)) + // W[:, j] = z + // Y[:, j] = v + // return W + // There is no need to return Y since at termination of the loop it is equal + // to vs. + Value ComputeWYRepresentation(Location loc, Type data_type, + ArrayRef batch_dims, Value vs, + Value taus, int64_t m, int64_t n, + PatternRewriter *rewriter) const { + int64_t n_index = batch_dims.size() + 1; + llvm::SmallVector batch_dim_indices(batch_dims.size()); + std::iota(batch_dim_indices.begin(), batch_dim_indices.end(), 0); + + auto body_fn = [&](Location loc, Value j, ArrayRef old_values, + SmallVectorImpl *new_values, OpBuilder *builder) { + // w has shape [..., m, n] + auto w = old_values[0]; + const auto vs = old_values[1]; + const auto taus = old_values[2]; + + // Want j values in range [1, ... n). 
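Before continuing with the loop body, a quick check of the algebra this construction relies on: with Y = vs, the product of the Householder factors H_j = I - taus[j] * v_j v_j^T equals I + W * Y^T, where W is built column by column and each column carries the -tau factor (the sign convention of the pseudocode above). A short numpy verification of that identity, independent of the lowering itself:

import numpy as np

def compute_wy(vs, taus):
    # vs: [m, n] Householder vectors as columns, taus: [n].
    m, n = vs.shape
    w = np.zeros((m, n))
    w[:, 0] = -taus[0] * vs[:, 0]
    for j in range(1, n):
        v = vs[:, j]
        # z = -taus[j] * v - taus[j] * W (Y^T v), with Y = vs and the
        # not-yet-filled columns of W still zero.
        w[:, j] = -taus[j] * v - taus[j] * (w @ (vs.T @ v))
    return w

rng = np.random.default_rng(1)
m, n = 6, 3
vs = rng.normal(size=(m, n))
taus = rng.normal(size=n)
w = compute_wy(vs, taus)

product = np.eye(m)
for j in range(n):
    product = product @ (np.eye(m) - taus[j] * np.outer(vs[:, j], vs[:, j]))

assert np.allclose(np.eye(m) + w @ vs.T, product)

The identity holds for arbitrary vectors and taus; ConvertQrOp then applies it as a += Y (W^T a) and q += (q W) Y^T, which are the two BatchDot pairs in the blocked loop.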
+ j = builder->create( + loc, j, + GetScalarConstOfType(getElementTypeOrSelf(j.getType()), loc, 1, + builder)); + // vs has shape [..., m, 1] + auto v = DynamicSliceInMinorDims(loc, vs, {j}, {1}, builder); + // beta has shape [..., 1] + auto beta = DynamicSliceInMinorDims(loc, taus, {j}, {1}, builder); + + auto iota_shape = llvm::to_vector<4>(batch_dims); + iota_shape.append({m, n}); + auto iota_mn = builder->create( + loc, RankedTensorType::get(iota_shape, builder->getIntegerType(32)), + builder->getI64IntegerAttr(n_index)); + + // y has shape [..., m, n] + Value zero = GetScalarConstOfType(getElementTypeOrSelf(vs.getType()), loc, + 0, builder); + zero = builder->create( + loc, vs.getType(), zero, + GetI64ElementsAttr(vs.getType().cast().getShape(), + builder)); + auto compare = builder->create( + loc, iota_mn, j, GetI64ElementsAttr({}, builder), + StringAttr::get("GE", builder->getContext())); + auto y = builder->create(loc, vs.getType(), compare, zero, vs); + + // yv has shape [..., n, 1] + auto precision = builder->getStrArrayAttr({"HIGHEST", "HIGHEST"}); + auto yv = BatchDot(loc, y, true, v, false, batch_dims.size(), precision, + builder); + // wyv has shape [..., m, 1] + auto wyv = BatchDot(loc, w, false, yv, false, batch_dims.size(), + precision, builder); + + // z = -beta * (v + wyv) + auto neg_beta = builder->create(loc, beta); + auto v_wyv = builder->create(loc, v, wyv); + auto beta_broadcast_dims = llvm::to_vector<4>(batch_dim_indices); + beta_broadcast_dims.push_back(n_index); + auto z = StaticBinaryBroadcast( + loc, neg_beta, v_wyv, + GetI64ElementsAttr(beta_broadcast_dims, builder), *rewriter); + + w = DynamicUpdateSliceInMinorDims(loc, w, z, {j}, builder); + new_values->assign({w, vs, taus}); + }; + + Value w = + GetScalarConstOfType(getElementTypeOrSelf(data_type), loc, 0, rewriter); + auto w_shape = llvm::to_vector<4>(batch_dims); + w_shape.append({m, n}); + w = rewriter->create(loc, + RankedTensorType::get(w_shape, data_type), + w, GetI64ElementsAttr(w_shape, rewriter)); + auto v = SliceInMinorDims(loc, vs, {0}, {1}, rewriter); + auto beta = SliceInMinorDims(loc, taus, {0}, {1}, rewriter); + auto neg_beta = rewriter->create(loc, beta); + auto beta_broadcast_dims = llvm::to_vector<4>(batch_dim_indices); + beta_broadcast_dims.push_back(n_index); + auto bv = StaticBinaryBroadcast( + loc, neg_beta, v, GetI64ElementsAttr(beta_broadcast_dims, rewriter), + *rewriter); + w = UpdateSliceInMinorDims(loc, w, bv, {0}, rewriter); + + SmallVector while_output; + CreateWhile32(loc, n - 1, body_fn, {w, vs, taus}, &while_output, rewriter); + return while_output[0]; + } +}; + +// Emits debug information which includes the number of ops of each type which +// failed to legalize. +void EmitLegalizationErrors(Operation *op, + const DenseSet &nonlegalized_ops) { + // Track the legalization failures by mapping op name to information about + // that failure: the number of unlegalized occurances of the op, and one + // example operation that failed. + std::map> op_name_to_error_info; + DenseSet error_ops; + for (Operation *nonlegalized_op : nonlegalized_ops) { + // Increment count of this legalization failure. + StringRef op_name = nonlegalized_op->getName().getStringRef(); + // If this emplace is successful, it's the first time we've encountered + // this op type. Initialize count to 0 so that after increment, it is 1. 
+ auto insertion_result = op_name_to_error_info.emplace( + op_name, std::make_pair(0, nonlegalized_op)); + ++insertion_result.first->second.first; + } + std::vector error_messages; + error_messages.reserve(op_name_to_error_info.size()); + for (const auto &op_info : op_name_to_error_info) { + error_messages.push_back( + llvm::formatv("{0} (count: {1})", op_info.first, op_info.second.first)); + } + Location loc = op->getLoc(); + emitError(loc) << "The following operations cannot be legalized: " + << llvm::join(error_messages, "; ") + << ". These legalization failure(s) may be due to missing TF " + "to HLO lowerings and/or unsupported attributes, etc."; + // Emit more information about the missing ops. This error message + // contains useful details beyond the op name (input and output shapes, + // attributes, etc.). + if (!VLOG_IS_ON(1) && nonlegalized_ops.size() != 1) { + emitError(loc) + << "Emitting more detail about one op that failed to legalize..."; + } else if (VLOG_IS_ON(1)) { + emitError(loc) << "Emitting more detail about one of each type of op " + "that failed to legalize..."; + } + for (const auto &op_info : op_name_to_error_info) { + op_info.second.second->emitOpError() << "is not legalizable"; + if (!VLOG_IS_ON(1)) break; + } +} + +// Performs the lowering to XLA dialect. +void LegalizeTF::runOnFunction() { + if (failed( + legalizeTF(getFunction(), allow_partial_conversion_, legalize_chlo_))) + signalPassFailure(); +} + +static PassRegistration pass( + "xla-legalize-tf", "Legalize from TensorFlow to the XLA dialect"); + +} // end namespace + #include "tensorflow/compiler/mlir/xla/transforms/generated_legalize_tf.inc" -LogicalResult legalizeTF(Operation *op, bool allow_partial_conversion) { +LogicalResult legalizeTF(Operation *op, bool allow_partial_conversion, + bool legalize_chlo) { MLIRContext *context = op->getContext(); // Add lowering patterns to the list. 
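Before the pattern list below, a note on the error reporting added above: EmitLegalizationErrors aggregates failures per op name, keeping a count and one exemplar operation so the summary stays compact while a detailed per-op diagnostic is still available. The same aggregation in a few lines of Python (illustrative only; it assumes each op exposes a name attribute, which is not part of the patch):

def summarize_failures(nonlegalized_ops):
    # Map op name -> (count, one example op that failed to legalize).
    info = {}
    for op in nonlegalized_ops:
        count, example = info.get(op.name, (0, op))
        info[op.name] = (count + 1, example)
    summary = "; ".join(
        f"{name} (count: {count})" for name, (count, _) in sorted(info.items()))
    examples = [example for _, example in info.values()]
    return summary, examples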
@@ -3846,17 +5092,19 @@ LogicalResult legalizeTF(Operation *op, bool allow_partial_conversion) { TF::PopulateLoweringTFPatterns(context, &patterns); patterns.insert< ConvertAllOp, ConvertAnyOp, ConvertArgMaxOp, ConvertBatchMatMulV2Op, - ConvertBroadcastToOp, ConvertBF16FloorDivOp, ConvertConv2D, - ConvertDepthConv2D, ConvertConv2DBackpropFilterOp, - ConvertConv2DBackpropInputOp, ConvertCumsumOp, ConvertDiagPartOp, - ConvertEinsumOp, ConvertFusedBatchNormGradOp, - ConvertFusedBatchNormGradV2Op, ConvertFusedBatchNormGradV3Op, - ConvertFusedBatchNormV3Op, ConvertInfeedDequeueTupleOp, ConvertLinSpaceOp, - ConvertMaxOp, ConvertMinOp, ConvertAvgPoolOp, ConvertMaxPoolOp, - ConvertMaxPoolGradOp, ConvertMeanOp, ConvertOneHotOp, - ConvertOutfeedEnqueueTupleOp, ConvertProdOp, ConvertRangeOp, - ConvertSelectV2Op, ConvertSigmoidOp, ConvertSizeOp, - ConvertSoftmaxOp, + ConvertBiasAddOp, ConvertBroadcastToOp, ConvertBF16FloorDivOp, + ConvertConv2DOp, ConvertConv3DOp, ConvertDepthConv2DOp, + ConvertConv2DBackpropFilterOp, ConvertConv3DBackpropFilterOp, + ConvertConv2DBackpropInputOp, ConvertConv3DBackpropInputOp, + ConvertCumsumOp, ConvertDiagPartOp, ConvertEinsumOp, + ConvertFusedBatchNormGradOp, ConvertFusedBatchNormGradV2Op, + ConvertFusedBatchNormGradV3Op, ConvertFusedBatchNormV3Op, + ConvertInfeedDequeueTupleOp, ConvertInplaceUpdateOp, ConvertLinSpaceOp, + ConvertMaxOp, ConvertMinOp, ConvertAvgPoolOp, ConvertMaxPool2DOp, + ConvertMaxPool3DOp, ConvertMaxPool2DGradOp, ConvertMaxPool3DGradOp, + ConvertMeanOp, ConvertOneHotOp, ConvertOutfeedEnqueueTupleOp, + ConvertProdOp, ConvertQrOp, ConvertRangeOp, ConvertSelectV2Op, + ConvertSigmoidOp, ConvertSizeOp, ConvertSoftmaxOp, ConvertSoftmaxOp, ConvertSplitOp, ConvertSplitVOp, ConvertStridedSliceOp, ConvertStridedSliceGradOp, ConvertSumOp, ConvertTensorScatterUpdateOp, ConvertTileOp, ConvertTopKV2Op, @@ -3865,33 +5113,45 @@ LogicalResult legalizeTF(Operation *op, bool allow_partial_conversion) { ConvertRandomShuffleOp, ConvertVariableShapeOp, ConvertXlaShardingOp, ConvertXlaDynamicUpdateSliceOp>(op->getContext()); + // Populate with CHLO->HLO lowerings to account for TF ops legalized to + // CHLO first. + if (legalize_chlo) { + xla_chlo::PopulateLegalizeChloToHloPatterns(context, &patterns); + } + ConversionTarget target(*context); + if (legalize_chlo) { + target.addIllegalDialect(); + } else { + target.addLegalDialect(); + } target.addLegalDialect(); + target.addLegalDialect(); + target.addLegalDialect(); target.addLegalOp(); + target.addLegalOp(); if (!allow_partial_conversion) { // Fully qualify ReturnOp here as xla_hlo dialect also defines a ReturnOp. target.addLegalOp(); - return applyFullConversion(op, target, patterns); + DenseSet nonlegalized_ops; + LogicalResult result = applyPartialConversion( + op, target, patterns, /*converter=*/nullptr, &nonlegalized_ops); + // In order to enforce that the conversion result is fully converted, + // fail if there are any nonlegalized ops in the set. + if (failed(result) || !nonlegalized_ops.empty()) { + EmitLegalizationErrors(op, nonlegalized_ops); + return failure(); + } + return result; } return applyPartialConversion(op, target, patterns); } -/// Performs the lowering to XLA dialect. 
-void LegalizeTF::runOnFunction() { - if (failed(legalizeTF(getFunction(), allow_partial_conversion_))) - signalPassFailure(); -} - -static PassRegistration pass( - "xla-legalize-tf", "Legalize from TensorFlow to the XLA dialect"); - -} // end namespace - std::unique_ptr> createLegalizeTFPass( - bool allow_partial_conversion) { - return std::make_unique(allow_partial_conversion); + bool allow_partial_conversion, bool legalize_chlo) { + return std::make_unique(allow_partial_conversion, legalize_chlo); } } // end namespace xla_hlo diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_control_flow.cc b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_control_flow.cc index 86927fe0e07..ef13e66568d 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_control_flow.cc +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_control_flow.cc @@ -66,7 +66,7 @@ createLegalizeTFControlFlowPass() { namespace { void Detuple(Value tuple, Operation::result_range replace, OpBuilder* builder) { - // De-tuple the results of the xla hlo conditional result. + // De-tuple the results of the xla hlo if result. for (auto result_it : llvm::enumerate(replace)) { auto get_tuple_value = builder->create( result_it.value().getLoc(), tuple, result_it.index()); @@ -74,11 +74,11 @@ void Detuple(Value tuple, Operation::result_range replace, OpBuilder* builder) { } } -// Imports the source region into the destination region. The XLA conditional +// Imports the source region into the destination region. The XLA if // operation only supports one argument per branch. Therefore any branch that // requires additional arguments requires their values be tupled together. Then, // to support multiple returns (as XLA only supports a single return value) the -// results of the conditional are tupled together. +// results of the if operation are tupled together. void ImportXlaRegion(mlir::FuncOp func, Region* dest_region, Location loc, bool tuple_return = true) { BlockAndValueMapping mapper; @@ -114,11 +114,11 @@ void LowerIf(TF::IfOp op, ModuleOp module) { builder.setInsertionPoint(op); auto tuple_input = builder.create(loc, inputs); - // Create the new conditional op with tuple inputs. + // Create the new if op with tuple inputs. SmallVector operands(op.getOperands()); auto result_type = builder.getTupleType(op.getResultTypes()); - auto conditional = builder.create( - loc, result_type, op.cond(), tuple_input, tuple_input); + auto if_op = builder.create(loc, result_type, op.cond(), + tuple_input, tuple_input); // Import the regions for both the true and false cases. These regions // must be updated to tuple the return results together and use the xla hlo @@ -126,12 +126,12 @@ void LowerIf(TF::IfOp op, ModuleOp module) { BlockAndValueMapping mapper; auto then_branch = module.lookupSymbol(op.then_branch()); auto else_branch = module.lookupSymbol(op.else_branch()); - ImportXlaRegion(then_branch, &conditional.true_branch(), loc); - ImportXlaRegion(else_branch, &conditional.false_branch(), loc); + ImportXlaRegion(then_branch, &if_op.true_branch(), loc); + ImportXlaRegion(else_branch, &if_op.false_branch(), loc); - // De-tuple the results of the xla hlo conditional result. + // De-tuple the results of the xla hlo if result. 
builder.setInsertionPointAfter(op); - Detuple(conditional.getResult(), op.getResults(), &builder); + Detuple(if_op.getResult(), op.getResults(), &builder); op.erase(); } diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td index 2f825a882f7..19fc42714b0 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td @@ -18,6 +18,7 @@ limitations under the License. include "mlir/IR/OpBase.td" include "mlir/Dialect/StandardOps/IR/Ops.td" include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td" +include "tensorflow/compiler/mlir/xla/ir/chlo_ops.td" include "tensorflow/compiler/mlir/xla/ir/hlo_ops.td" def SignedIntTensor : TensorOf<[I1, I8, I16, I32, I64]>; @@ -52,7 +53,8 @@ def GetHLOAxisFromTFAxisVariadic : NativeCodeCall< def : Pattern< (TF_FusedBatchNormOp:$root $x, $scale, $offset, $mean, $variance, $epsilon, - $data_format, FalseBoolAttr:$is_training), + $exponential_avg_factor, $data_format, + FalseBoolAttr:$is_training), [(HLO_BatchNormInferenceOp $x, $scale, $offset, $mean, $variance, $epsilon, (FeatureDimension $data_format, $x)), // We already guaranteed that the last four results has no use so it @@ -71,18 +73,6 @@ def : Pattern< // HLO and XLA doesn't support Assertions. def LowerAssert : Pattern<(TF_AssertOp $condition, $data, $summarize), []>; -//===----------------------------------------------------------------------===// -// Bias op patterns. -//===----------------------------------------------------------------------===// -def BiasAddFeatureDimension : NativeCodeCall< - "getBiasFeatureDimension($_builder, $0, $1)">; - -// $input needs to be a ranked tensor to identify index of the feature -// dimension depending on the data_format 'NHWC' or 'NCHW'. -def : Pat<(TF_BiasAddOp AnyRankedTensor:$input, $bias, $data_format), - (HLO_AddOp $input, $bias, - (BiasAddFeatureDimension $data_format, $input))>; - //===----------------------------------------------------------------------===// // Binary op patterns. 
//===----------------------------------------------------------------------===// @@ -95,21 +85,22 @@ class DirectBinaryPat : Pat<(FromOp AnyRankedTensor:$l, AnyRankedTensor:$r), (ToOp $l, $r, (BinBroadcastDimensions $l, $r))>; -foreach fromToBinPair = [[TF_AddOp, HLO_AddOp], - [TF_AddV2Op, HLO_AddOp], - [TF_DivOp, HLO_DivOp], - [TF_LeftShiftOp, HLO_ShiftLeftOp], - [TF_MaximumOp, HLO_MaxOp], - [TF_MinimumOp, HLO_MinOp], - [TF_MulOp, HLO_MulOp], - [TF_PowOp, HLO_PowOp], - [TF_RealDivOp, HLO_DivOp], - [TF_SubOp, HLO_SubOp]] in +foreach fromToBinPair = [[TF_AddOp, HLOClient_BroadcastAddOp], + [TF_AddV2Op, HLOClient_BroadcastAddOp], + [TF_DivOp, HLOClient_BroadcastDivOp], + [TF_LeftShiftOp, HLOClient_BroadcastShiftLeftOp], + [TF_MaximumOp, HLOClient_BroadcastMaxOp], + [TF_MinimumOp, HLOClient_BroadcastMinOp], + [TF_MulOp, HLOClient_BroadcastMulOp], + [TF_PowOp, HLOClient_BroadcastPowOp], + [TF_RealDivOp, HLOClient_BroadcastDivOp], + [TF_SubOp, HLOClient_BroadcastSubOp]] in def : DirectBinaryPat; def LowerRightShiftSigned : Pat<(TF_RightShiftOp AnyRankedTensor:$l, AnyRankedTensor:$r), - (HLO_ShiftRightArithmeticOp $l, $r, (BinBroadcastDimensions $l, $r)), + (HLOClient_BroadcastShiftRightArithmeticOp $l, $r, + (BinBroadcastDimensions $l, $r)), [(SignedIntTensor $r)]>; // TODO(hinsu): Lower unsigned types to HLO_ShiftRightLogical once the HLO op @@ -121,10 +112,11 @@ def : Pat<(TF_ComplexOp $r, $i), (HLO_ComplexOp $r, $i)>; // // return floor(div(x, y)) def : Pat<(TF_FloorDivOp AnyRankedTensor:$l, AnyRankedTensor:$r), - (HLO_FloorOp (HLO_DivOp $l, $r, (BinBroadcastDimensions $l, $r))), + (HLO_FloorOp + (HLOClient_BroadcastDivOp $l, $r, (BinBroadcastDimensions $l, $r))), [(IEEEFloatTensor $l)]>; -// Performs a substitution of FloorDir for integer tensors, which required +// Performs a substitution of FloorDiv for integer tensors, which required // additional correction for a negative numerator / denominator. Equivalent // pseudocode is shown below: // @@ -145,16 +137,16 @@ def : Pat<(TF_FloorDivOp AnyRankedTensor:$l, AnyRankedTensor:$r), // broadcast attributes. def : Pat<(TF_FloorDivOp AnyStaticShapeTensor:$l, AnyStaticShapeTensor:$r), (HLO_SelectOp - (HLO_CompareOp - (HLO_CompareOp $l, (HLO_ConstOp (ConstantSplat<"0"> $l)), + (HLOClient_BroadcastCompareOp + (HLOClient_BroadcastCompareOp $l, (HLO_ConstOp (ConstantSplat<"0"> $l)), (NullDenseIntElementsAttr), HLO_COMPARISON_DIRECTION_LT), - (HLO_CompareOp $r, (HLO_ConstOp (ConstantSplat<"0"> $r)), + (HLOClient_BroadcastCompareOp $r, (HLO_ConstOp (ConstantSplat<"0"> $r)), (NullDenseIntElementsAttr), HLO_COMPARISON_DIRECTION_LT), (BinBroadcastDimensions $l, $r), HLO_COMPARISON_DIRECTION_EQ), - (HLO_DivOp $l, $r, (BinBroadcastDimensions $l, $r)), - (HLO_DivOp - (HLO_NegOp:$neg (HLO_AddOp (HLO_AbsOp $l), - (HLO_SubOp (HLO_AbsOp $r), + (HLOClient_BroadcastDivOp $l, $r, (BinBroadcastDimensions $l, $r)), + (HLOClient_BroadcastDivOp + (HLO_NegOp:$neg (HLOClient_BroadcastAddOp (HLO_AbsOp $l), + (HLOClient_BroadcastSubOp (HLO_AbsOp $r), (HLO_ConstOp (ConstantSplat<"1"> $r)), (NullDenseIntElementsAttr)), (BinBroadcastDimensions $l, $r))), @@ -170,20 +162,20 @@ def : Pat<(TF_FloorDivOp AnyStaticShapeTensor:$l, AnyStaticShapeTensor:$r), // broadcast attributes. 
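The select pattern above computes integer floor division from truncating division: when the operand signs match, trunc and floor division agree, and when they differ the quotient becomes -((|l| + |r| - 1) / |r|). The FloorMod pattern that follows applies the matching remainder correction, adding r back whenever the truncated remainder is nonzero and its sign disagrees with r. A quick check of both identities in plain Python (helper names are illustrative):

def trunc_div(x, y):
    q = abs(x) // abs(y)
    return q if (x < 0) == (y < 0) else -q

def trunc_rem(x, y):
    return x - trunc_div(x, y) * y

def floordiv_lowered(x, y):
    if (x < 0) == (y < 0):
        return trunc_div(x, y)
    return -((abs(x) + abs(y) - 1) // abs(y))

def floormod_lowered(x, y):
    rem = trunc_rem(x, y)
    if rem != 0 and (rem < 0) != (y < 0):
        return rem + y
    return rem

for x in range(-7, 8):
    for y in [-3, -2, -1, 1, 2, 3]:
        assert floordiv_lowered(x, y) == x // y
        assert floormod_lowered(x, y) == x % y

The patterns express exactly this arithmetic, only phrased with the broadcasting-aware HLOClient ops so that operands of different ranks still combine correctly.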
def : Pat<(TF_FloorModOp AnyStaticShapeTensor:$l, AnyStaticShapeTensor:$r), (HLO_SelectOp - (HLO_AndOp - (HLO_CompareOp - (HLO_RemOp:$rem $l, $r, (BinBroadcastDimensions $l, $r)), + (HLOClient_BroadcastAndOp + (HLOClient_BroadcastCompareOp + (HLOClient_BroadcastRemOp:$rem $l, $r, (BinBroadcastDimensions $l, $r)), (HLO_ConstOp:$l_zeros (ConstantSplat<"0"> $l)), (BinBroadcastDimensions $l, $rem), HLO_COMPARISON_DIRECTION_NE), - (HLO_CompareOp - (HLO_CompareOp:$r_cmp $r, + (HLOClient_BroadcastCompareOp + (HLOClient_BroadcastCompareOp:$r_cmp $r, (HLO_ConstOp:$r_zeros (ConstantSplat<"0"> $r)), (NullDenseIntElementsAttr), HLO_COMPARISON_DIRECTION_LT), - (HLO_CompareOp:$rem_cmp $rem, $r_zeros, + (HLOClient_BroadcastCompareOp:$rem_cmp $rem, $r_zeros, (BinBroadcastDimensions $rem, $r_zeros), HLO_COMPARISON_DIRECTION_LT), (BinBroadcastDimensions $r_cmp, $rem_cmp), HLO_COMPARISON_DIRECTION_NE), (NullDenseIntElementsAttr)), - (HLO_AddOp $r, + (HLOClient_BroadcastAddOp $r, $rem, (BinBroadcastDimensions $r, $rem)), $rem)>; //===----------------------------------------------------------------------===// @@ -195,10 +187,10 @@ class DirectLogicalBinaryPat (ToOp $l, $r, (BinBroadcastDimensions $l, $r)), [(SignedIntTensor $l)]>; -foreach fromToBinPair = [[TF_LogicalAndOp, HLO_AndOp], - [TF_LogicalOrOp, HLO_OrOp], - [TF_BitwiseOrOp, HLO_OrOp], - [TF_BitwiseAndOp, HLO_AndOp]] in +foreach fromToBinPair = [[TF_LogicalAndOp, HLOClient_BroadcastAndOp], + [TF_LogicalOrOp, HLOClient_BroadcastOrOp], + [TF_BitwiseOrOp, HLOClient_BroadcastOrOp], + [TF_BitwiseAndOp, HLOClient_BroadcastAndOp]] in def : DirectLogicalBinaryPat; //===----------------------------------------------------------------------===// @@ -207,7 +199,8 @@ foreach fromToBinPair = [[TF_LogicalAndOp, HLO_AndOp], class DirectComparePat : Pat<(FromOp AnyRankedTensor:$l, AnyRankedTensor:$r), - (HLO_CompareOp $l, $r, (BinBroadcastDimensions $l, $r), direction)>; + (HLOClient_BroadcastCompareOp + $l, $r, (BinBroadcastDimensions $l, $r), direction)>; def : DirectComparePat; def : DirectComparePat; @@ -217,7 +210,8 @@ def : DirectComparePat; class EqualityPat : Pat<(FromOp AnyRankedTensor:$l, AnyRankedTensor:$r, TrueBoolAttr:$incompatible_shape_error), - (HLO_CompareOp $l, $r, (BinBroadcastDimensions $l, $r), direction), + (HLOClient_BroadcastCompareOp + $l, $r, (BinBroadcastDimensions $l, $r), direction), [(AreBroadcastCompatible $l, $r)]>; def : EqualityPat; @@ -272,6 +266,13 @@ def : Pat<(TF_CrossReplicaSumOp $input, (TF_ConstOp $group_assignment)), (HLO_CrossReplicaSumOp $input, (CastElementsToI64Elements $group_assignment))>; +//===----------------------------------------------------------------------===// +// All2All op patterns. +//===----------------------------------------------------------------------===// + +def : Pat<(TF_AllToAllOp AnyRankedTensor:$input, (TF_ConstOp $group_assignment), I64Attr:$concat_dimension, $split_dimension, $split_count), + (HLO_AllToAllOp $input, $split_dimension, $concat_dimension, $split_count, (CastElementsToI64Elements $group_assignment))>; + //===----------------------------------------------------------------------===// // FFT op patterns. 
//===----------------------------------------------------------------------===// @@ -392,39 +393,36 @@ def : Pattern<(TF_MatrixBandPartOp:$op AnyRankedTensor:$input, $num_lower, $num_ (HLO_SelectOp:$num_lower_or_m (HLO_CompareOp $num_lower, (HLO_ConstOp:$zero (ConstantSplat<"0"> $num_lower)), - (NullDenseIntElementsAttr), HLO_COMPARISON_DIRECTION_LT + HLO_COMPARISON_DIRECTION_LT ), $m_dim, $num_lower ), (HLO_SelectOp:$num_upper_or_n (HLO_CompareOp - $num_upper, $zero, - (NullDenseIntElementsAttr), HLO_COMPARISON_DIRECTION_LT + $num_upper, $zero, HLO_COMPARISON_DIRECTION_LT ), $n_dim, $num_upper ), (HLO_SelectOp (HLO_AndOp - (HLO_CompareOp + (HLOClient_BroadcastCompareOp (HLO_NegOp (createConvertOp $op, $num_lower_or_m, $input) ), (HLO_SubOp:$offset - (createIotaOp<"1"> $op, $input), (createIotaOp<"0"> $op, $input), - (NullDenseIntElementsAttr) + (createIotaOp<"1"> $op, $input), (createIotaOp<"0"> $op, $input) ), (NullDenseIntElementsAttr), HLO_COMPARISON_DIRECTION_LE ), - (HLO_CompareOp + (HLOClient_BroadcastCompareOp $offset, (createConvertOp $op, $num_upper_or_n, $input ), (NullDenseIntElementsAttr), HLO_COMPARISON_DIRECTION_LE - ), - (BinBroadcastDimensions $offset, $input) + ) ), $input, (HLO_ConstOp (ConstantSplat<"0"> $input)) @@ -434,7 +432,8 @@ def : Pattern<(TF_MatrixBandPartOp:$op AnyRankedTensor:$input, $num_lower, $num_ // Nullary op patterns. //===----------------------------------------------------------------------===// -def : Pat<(TF_ConstOp:$res ElementsAttr:$value), (HLO_ConstOp $value), +def : Pat<(TF_ConstOp:$res ElementsAttr:$value), + (TensorCastOp (HLO_ConstOp $value)), [(HLO_Tensor $res)]>; //===----------------------------------------------------------------------===// @@ -447,8 +446,9 @@ def : Pat<(TF_ConstOp:$res ElementsAttr:$value), (HLO_ConstOp $value), // TODO(hinsu): Lower unsigned and quantized types after supporting // them in GetScalarOfType. def : Pat<(TF_ReluOp AnyRankedTensor:$input), - (HLO_MaxOp (HLO_ConstOp:$zero (GetScalarOfType<0> $input)), $input, - (BinBroadcastDimensions $zero, $input)), + (HLOClient_BroadcastMaxOp + (HLO_ConstOp:$zero (GetScalarOfType<0> $input)), $input, + (BinBroadcastDimensions $zero, $input)), [(TF_SintOrFpTensor $input)]>; // TODO(hinsu): Lower unsigned and quantized types after supporting @@ -470,7 +470,7 @@ def : Pat<(TF_Relu6Op AnyRankedTensor:$input), // to create splat tensor of dynamic shape in HLO. def : Pat<(TF_ReluGradOp AnyStaticShapeTensor:$gradients, AnyRankedTensor:$features), (HLO_SelectOp - (HLO_CompareOp $features, + (HLOClient_BroadcastCompareOp $features, (HLO_ConstOp (GetScalarOfType<0> $features)), (NullDenseIntElementsAttr), HLO_COMPARISON_DIRECTION_GT), $gradients, (HLO_ConstOp (ConstantSplat<"0"> $gradients)))>; @@ -479,6 +479,9 @@ def : Pat<(TF_ReluGradOp AnyStaticShapeTensor:$gradients, AnyRankedTensor:$featu // Slice op patterns. 
//===----------------------------------------------------------------------===// +def CastToI64AndUnpackTensor: NativeCodeCall< + "UnpackTensorAlongZeroDim($0.getLoc(), CastValueToI64($0.getLoc(), $1, &$_builder), &$_builder).output()">; + def CanBeTranslatedToDynamicSlice : Constraint())">>; @@ -488,7 +491,8 @@ def TFSliceSizes2HLOSliceSizes : NativeCodeCall< def : Pat<(TF_SliceOp:$op HLO_Tensor:$input, HLO_Tensor:$starting_indices, (TF_ConstOp $slice_sizes)), - (HLO_DynamicSliceOp $input, (CastValueToI64 $op, $starting_indices), + (HLO_DynamicSliceOp $input, + (CastToI64AndUnpackTensor $op, $starting_indices), (TFSliceSizes2HLOSliceSizes $input, $starting_indices, $slice_sizes)), [(CanBeTranslatedToDynamicSlice $input, $starting_indices, $slice_sizes)]>; @@ -508,16 +512,14 @@ foreach callOp = [TF_PartitionedCallOp, TF_StatefulPartitionedCallOp] in { } //===----------------------------------------------------------------------===// -// Ternary op patterns. +// Reverse op patterns. //===----------------------------------------------------------------------===// -def BothTypesMatch : Constraint, - "types must be equal">; +// Handles axis conversion for TF reverse. +def ConvertAxisAttr : NativeCodeCall<"ConvertAxisAttr($0, $1, &$_builder)">; -def : Pat<(TF_SelectOp $cond, $t, $e), (HLO_SelectOp $cond, $t, $e), - // TODO(jpienaar): This restriction is to avoid creating a currently - // unsupported HLO select. - [(BothTypesMatch $t, $e)]>; +def : Pat<(TF_ReverseV2Op AnyRankedTensor:$values, (TF_ConstOp $axis)), + (HLO_ReverseOp $values, (ConvertAxisAttr $values, $axis))>; //===----------------------------------------------------------------------===// // Unary op patterns. @@ -569,7 +571,6 @@ def : Pat<(TF_SignOp $x), (HLO_CompareOp $x, $x, - (NullDenseIntElementsAttr), HLO_COMPARISON_DIRECTION_NE ), (HLO_ConstOp (ConstantSplat<"0"> $x)), @@ -606,3 +607,12 @@ def : Pat<(srcDstOpPair[0]:$old $shape, $seed, $seed2), (CastValueToI64 $old, $shape)), [(IsShapedTensor $shape)]>; } + +//===----------------------------------------------------------------------===// +// Sigmoid grad op. +//===----------------------------------------------------------------------===// +def : Pat<(TF_SigmoidGradOp AnyRankedTensor:$l, AnyRankedTensor:$r), + (HLO_MulOp + (HLO_MulOp $r, $l), + (HLO_SubOp (HLO_ConstOp (ConstantSplat<"1"> $l)), $l)), + [(IEEEFloatTensor $l)]>; diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc index eb6fe2e98b4..76657bd5e20 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc @@ -21,7 +21,9 @@ limitations under the License. #include "absl/container/inlined_vector.h" #include "absl/memory/memory.h" #include "absl/strings/string_view.h" +#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/Optional.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project #include "mlir/IR/Diagnostics.h" // from @llvm-project #include "mlir/IR/Function.h" // from @llvm-project #include "mlir/IR/Location.h" // from @llvm-project @@ -36,6 +38,7 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h.inc" #include "tensorflow/compiler/mlir/tensorflow/translate/export_tf_dialect_op.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h" #include "tensorflow/compiler/mlir/tensorflow/utils/convert_type.h" #include "tensorflow/compiler/mlir/tensorflow/utils/translate_utils.h" #include "tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.h" @@ -77,9 +80,105 @@ static bool IsOpWhitelisted(Operation* op) { // building valid MLIR using MlirHloBuilder. // TODO(hinsu): Drop explicit whitelist when MLIR based bridge is enabled for // all tf2xla kernels. - return isa(op) || isa(op) || isa(op) || - isa(op) || isa(op) || - isa(op); + // clang-format off + static llvm::SmallDenseSet ops = { + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get() + }; + // clang-format on + + auto* abstractOp = op->getAbstractOperation(); + if (!abstractOp) return false; + return ops.count(abstractOp->typeID); } static std::unique_ptr CreateDeviceMgr( @@ -121,6 +220,10 @@ class FuncLegalizer { // legalization. LogicalResult LegalizeOp(Operation* op); + // Converts the given operand to expression of kind kConstant or kXlaOp. + // Emits a remark and returns expression of kind kInvalid on failure. + tensorflow::XlaExpression GetExprForOperand(Value operand, Operation* op); + FuncOp func_; std::string device_type_; @@ -247,6 +350,17 @@ LogicalResult FuncLegalizer::LegalizeOp(Operation* op) { // Transfer ownership of the kernel to a local smart pointer. 
auto op_kernel = absl::WrapUnique(op_kernel_raw); + std::vector required_constants; + status = tensorflow::XlaOpRegistry::CompileTimeConstantInputs( + *op_kernel, &required_constants); + if (!status.ok()) { + op->emitRemark() << "failed to compute required constants: " + << status.ToString(); + return success(); + } + llvm::SmallDenseSet required_consts; + required_consts.insert(required_constants.begin(), required_constants.end()); + // TensorValue in inputs are backed by tensors which in turn depend on // expressions. So, pre-allocate them to the required size. InlinedVector expressions; @@ -257,45 +371,39 @@ LogicalResult FuncLegalizer::LegalizeOp(Operation* op) { inputs.reserve(op->getNumOperands()); // Prepare the list of Tensor inputs for the kernel. - for (Value operand : op->getOperands()) { - // Skip this op if XLA doesn't support this operand type. - auto xla_op_or = hlo_builder_.MakeXlaOp(operand); - if (!xla_op_or.ok()) { - op->emitRemark() << "skipping legalization due to " - << xla_op_or.status().ToString(); + for (auto it : llvm::enumerate(op->getOperands())) { + Value operand = it.value(); + size_t idx = it.index(); + + tensorflow::XlaExpression expr = GetExprForOperand(operand, op); + tensorflow::XlaExpression::Kind kind = expr.kind(); + if (kind == tensorflow::XlaExpression::Kind::kInvalid) return success(); + if (required_consts.count(idx) && + kind != tensorflow::XlaExpression::Kind::kConstant) { + op->emitRemark() << "lowering requires operand #" << idx + << " to be a constant"; return success(); } - ::xla::XlaOp xla_op = xla_op_or.ValueOrDie(); + expressions.push_back(expr); - tensorflow::DataType dtype; - status = tensorflow::ConvertToDataType(operand.getType(), &dtype); - if (!status.ok()) { - op->emitRemark() << "skipping legalization due to " << status.ToString(); - return success(); - } - - auto expression = tensorflow::XlaExpression::XlaOp(xla_op, dtype); - expressions.push_back(expression); - - if (!tensorflow::DataTypeCanUseMemcpy(dtype)) { + if (!tensorflow::DataTypeCanUseMemcpy(expr.dtype())) { op->emitRemark() << "skipping legalization due to unsupported type " << operand.getType(); return success(); } - auto shape_or = expression.GetShape(); + auto shape_or = expr.GetShape(); if (!shape_or.ok()) { op->emitRemark() << "failed to get shape for expression. 
" - << expression.HumanString(); + << expr.HumanString(); return success(); } tensors.emplace_back( - device_->GetAllocator(tensorflow::AllocatorAttributes()), dtype, + device_->GetAllocator(tensorflow::AllocatorAttributes()), expr.dtype(), shape_or.ValueOrDie()); tensorflow::Tensor& tensor = tensors.back(); - tensorflow::XlaOpKernelContext::AssignExpressionToTensor(expression, - &tensor); + tensorflow::XlaOpKernelContext::AssignExpressionToTensor(expr, &tensor); inputs.emplace_back(&tensor); } @@ -327,13 +435,51 @@ LogicalResult FuncLegalizer::LegalizeOp(Operation* op) { return op->emitError( "expects XlaExpression of kind kXlaOp in compiled output"); auto value = hlo_builder_.GetValue(expr->handle()); - op->getResult(i).replaceAllUsesWith(value); + mlir::OpResult old_result = op->getResult(i); + if (value.getType() != old_result.getType()) { + value = + hlo_builder_.create(value, old_result.getType()); + } + old_result.replaceAllUsesWith(value); } op->erase(); return success(); } +tensorflow::XlaExpression FuncLegalizer::GetExprForOperand(Value operand, + Operation* op) { + ElementsAttr const_attr; + auto defining_op = operand.getDefiningOp(); + if (defining_op && matchPattern(defining_op, m_Constant(&const_attr))) { + tensorflow::Tensor tensor; + auto status = tensorflow::ConvertToTensor(const_attr, &tensor); + if (!status.ok()) { + op->emitRemark() << "skipping legalization due to failed const conversion" + << status.ToString(); + return tensorflow::XlaExpression::Invalid(); + } + return tensorflow::XlaExpression::Constant(tensor); + } + + // Skip this op if XLA doesn't support this operand type. + auto xla_op_or = hlo_builder_.MakeXlaOp(operand); + if (!xla_op_or.ok()) { + op->emitRemark() << "skipping legalization due to " + << xla_op_or.status().ToString(); + return tensorflow::XlaExpression::Invalid(); + } + ::xla::XlaOp xla_op = xla_op_or.ValueOrDie(); + + tensorflow::DataType dtype; + auto status = tensorflow::ConvertToDataType(operand.getType(), &dtype); + if (!status.ok()) { + op->emitRemark() << "skipping legalization due to " << status.ToString(); + return tensorflow::XlaExpression::Invalid(); + } + return tensorflow::XlaExpression::XlaOp(xla_op, dtype); +} + class LegalizeTF : public PassWrapper { public: LegalizeTF() = default; diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_to_standard_patterns.td b/tensorflow/compiler/mlir/xla/transforms/legalize_to_standard_patterns.td index c0f6c2c3541..21e39db018b 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_to_standard_patterns.td +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_to_standard_patterns.td @@ -36,47 +36,36 @@ def IsSameSizePred : CPred< def IsSameSizeConstraint : Constraint; -def : Pat<(HLO_AndOp HLO_PredTensor:$l, HLO_PredTensor:$r, - IsNullAttr:$broadcast_dimensions), +def : Pat<(HLO_AndOp HLO_PredTensor:$l, HLO_PredTensor:$r), (AndOp $l, $r), [(IsSameSizeConstraint $l, $r)]>; -def : Pat<(HLO_AddOp HLO_FpTensor:$l, HLO_FpTensor:$r, - IsNullAttr:$broadcast_dimensions), +def : Pat<(HLO_AddOp HLO_FpTensor:$l, HLO_FpTensor:$r), (AddFOp $l, $r), [(IsSameSizeConstraint $l, $r)]>; -def : Pat<(HLO_SubOp HLO_FpTensor:$l, HLO_FpTensor:$r, - IsNullAttr:$broadcast_dimensions), +def : Pat<(HLO_SubOp HLO_FpTensor:$l, HLO_FpTensor:$r), (SubFOp $l, $r), [(IsSameSizeConstraint $l, $r)]>; -def : Pat<(HLO_MulOp HLO_FpTensor:$l, HLO_FpTensor:$r, - IsNullAttr:$broadcast_dimensions), +def : Pat<(HLO_MulOp HLO_FpTensor:$l, HLO_FpTensor:$r), (MulFOp $l, $r), [(IsSameSizeConstraint $l, $r)]>; -def : 
Pat<(HLO_DivOp HLO_FpTensor:$l, HLO_FpTensor:$r, - IsNullAttr:$broadcast_dimensions), +def : Pat<(HLO_DivOp HLO_FpTensor:$l, HLO_FpTensor:$r), (DivFOp $l, $r), [(IsSameSizeConstraint $l, $r)]>; -def : Pat<(HLO_RemOp HLO_FpTensor:$l, HLO_FpTensor:$r, - IsNullAttr:$broadcast_dimensions), +def : Pat<(HLO_RemOp HLO_FpTensor:$l, HLO_FpTensor:$r), (RemFOp $l, $r), [(IsSameSizeConstraint $l, $r)]>; -def : Pat<(HLO_AddOp HLO_IntTensor:$l, HLO_IntTensor:$r, - IsNullAttr:$broadcast_dimensions), +def : Pat<(HLO_AddOp HLO_IntTensor:$l, HLO_IntTensor:$r), (AddIOp $l, $r), [(IsSameSizeConstraint $l, $r)]>; -def : Pat<(HLO_SubOp HLO_IntTensor:$l, HLO_IntTensor:$r, - IsNullAttr:$broadcast_dimensions), +def : Pat<(HLO_SubOp HLO_IntTensor:$l, HLO_IntTensor:$r), (SubIOp $l, $r), [(IsSameSizeConstraint $l, $r)]>; -def : Pat<(HLO_MulOp HLO_IntTensor:$l, HLO_IntTensor:$r, - IsNullAttr:$broadcast_dimensions), +def : Pat<(HLO_MulOp HLO_IntTensor:$l, HLO_IntTensor:$r), (MulIOp $l, $r), [(IsSameSizeConstraint $l, $r)]>; -def : Pat<(HLO_DivOp HLO_IntTensor:$l, HLO_IntTensor:$r, - IsNullAttr:$broadcast_dimensions), +def : Pat<(HLO_DivOp HLO_IntTensor:$l, HLO_IntTensor:$r), (SignedDivIOp $l, $r), [(IsSameSizeConstraint $l, $r)]>; -def : Pat<(HLO_RemOp HLO_IntTensor:$l, HLO_IntTensor:$r, - IsNullAttr:$broadcast_dimensions), +def : Pat<(HLO_RemOp HLO_IntTensor:$l, HLO_IntTensor:$r), (SignedRemIOp $l, $r), [(IsSameSizeConstraint $l, $r)]>; diff --git a/tensorflow/compiler/mlir/xla/transforms/lhlo_fuse_linalg.cc b/tensorflow/compiler/mlir/xla/transforms/lhlo_fuse_linalg.cc index bdee1b77cff..43c0911a4a6 100644 --- a/tensorflow/compiler/mlir/xla/transforms/lhlo_fuse_linalg.cc +++ b/tensorflow/compiler/mlir/xla/transforms/lhlo_fuse_linalg.cc @@ -19,7 +19,7 @@ limitations under the License. #include "mlir/Dialect/Linalg/Analysis/DependenceAnalysis.h" #include "absl/memory/memory.h" #include "llvm/ADT/ArrayRef.h" -#include "mlir/Dialect/Linalg/Utils/Utils.h" // from @llvm-project +#include "mlir/Dialect/Linalg/Transforms/Transforms.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Transforms/FoldUtils.h" // from @llvm-project #include "tensorflow/compiler/mlir/xla/transforms/passes.h" diff --git a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_gpu.cc b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_gpu.cc index e6f3ac02d4f..f0eb3cc1a0f 100644 --- a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_gpu.cc +++ b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_gpu.cc @@ -21,7 +21,7 @@ limitations under the License. #include "llvm/ADT/ArrayRef.h" #include "mlir/Dialect/GPU/GPUDialect.h" // from @llvm-project #include "mlir/Dialect/Linalg/IR/LinalgOps.h" // from @llvm-project -#include "mlir/Dialect/LoopOps/LoopOps.h" // from @llvm-project +#include "mlir/Dialect/SCF/SCF.h" // from @llvm-project #include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/BlockAndValueMapping.h" // from @llvm-project @@ -112,7 +112,7 @@ class LhloReduceToGPULaunchConverter : public OpConversionPattern { auto step = rewriter.create( loc, rewriter.getIndexType(), rewriter.getIntegerAttr(rewriter.getIndexType(), 1)); - auto loop = rewriter.create(loc, zero, upper, step); + auto loop = rewriter.create(loc, zero, upper, step); rewriter.setInsertionPointToStart(loop.getBody()); // Compute memrefs for the value to reduce. 
This makes it easier to just @@ -173,8 +173,7 @@ struct LhloLegalizeToGpu : public PassWrapper { OwningRewritePatternList patterns; ConversionTarget target(getContext()); target.addLegalDialect(); + gpu::GPUDialect, scf::SCFDialect, XlaLhloDialect>(); target.addIllegalOp(); auto func = getFunction(); patterns.insert(func.getContext()); diff --git a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_parallel_loops.cc b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_parallel_loops.cc index 489285e02d1..734a75a4307 100644 --- a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_parallel_loops.cc +++ b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_parallel_loops.cc @@ -18,7 +18,7 @@ limitations under the License. #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "mlir/Dialect/Linalg/IR/LinalgOps.h" // from @llvm-project -#include "mlir/Dialect/LoopOps/LoopOps.h" // from @llvm-project +#include "mlir/Dialect/SCF/SCF.h" // from @llvm-project #include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project @@ -61,15 +61,15 @@ Value ApplySingleResultLhloCode(Location loc, ValueRange operands, // Converts a block with LHLO ops and with signature: // ^bb(%lhs: memref, %rhs: memref, %res: memref): -// into a reduction operator of loop.reduce by doing buffer allocation for -// scalar arguments and the result of `loop.reduce` to make it compatible with +// into a reduction operator of scf.reduce by doing buffer allocation for +// scalar arguments and the result of `scf.reduce` to make it compatible with // LHLO ops. -void ConvertToReductionOperator(Location loc, loop::ReduceOp reduce_op, +void ConvertToReductionOperator(Location loc, scf::ReduceOp reduce_op, Block* lhlo_block, OpBuilder* b) { Block& loop_reduce_op_body = reduce_op.reductionOperator().front(); OpBuilder::InsertionGuard guard(*b); b->setInsertionPointToStart(&loop_reduce_op_body); - b->create( + b->create( loc, ApplySingleResultLhloCode(loc, loop_reduce_op_body.getArguments(), lhlo_block, b)); } @@ -90,8 +90,9 @@ struct MappedIvs { SmallVector ivs; }; -MappedIvs MapWindowIvsToInput(ReduceWindowOp op, ValueRange ivs, - ValueRange window_ivs, OpBuilder* b) { +template +MappedIvs MapWindowIvsToInput(OpTy op, ValueRange ivs, ValueRange window_ivs, + OpBuilder* b) { MappedIvs mapped_ivs; if (!op.window_strides().hasValue()) { @@ -106,14 +107,14 @@ MappedIvs MapWindowIvsToInput(ReduceWindowOp op, ValueRange ivs, auto loc = op.getLoc(); auto operand = op.operand(); - auto operand_shape = operand.getType().cast().getShape(); + auto operand_shape = operand.getType().template cast().getShape(); // `in_bounds` is false when the mapped indices are in the padding area. mapped_ivs.in_bounds = b->create( loc, b->getI1Type(), b->getIntegerAttr(b->getI1Type(), 1)); for (unsigned i = 0, e = ivs.size(); i < e; ++i) { - auto stride = window_strides.getValue(i); - auto pad_low = padding.getValue({i, 0}); + auto stride = window_strides.template getValue(i); + auto pad_low = padding.template getValue({i, 0}); Value stride_val = b->create(loc, stride.getSExtValue()); Value pad_low_val = b->create(loc, pad_low.getSExtValue()); @@ -135,9 +136,9 @@ MappedIvs MapWindowIvsToInput(ReduceWindowOp op, ValueRange ivs, return mapped_ivs; } -// Returns loop::Parallel over a shaped value with static or dynamic shape. 
-loop::ParallelOp MakeLoopOverShape(Location loc, Value shaped_value, - OpBuilder* b) { +// Returns scf::Parallel over a shaped value with static or dynamic shape. +scf::ParallelOp MakeLoopOverShape(Location loc, Value shaped_value, + OpBuilder* b) { Value zero = b->create(loc, 0); Value one = b->create(loc, 1); @@ -150,10 +151,10 @@ loop::ParallelOp MakeLoopOverShape(Location loc, Value shaped_value, lower.push_back(zero); step.push_back(one); } - return b->create(loc, lower, upper, step); + return b->create(loc, lower, upper, step); } -// Converts `xla_lhlo.ReduceOp` into two loop::ParallelOp and a loop::ReduceOp. +// Converts `xla_lhlo.ReduceOp` into two scf::ParallelOp and a scf::ReduceOp. // The outper `ParallelOp` refers to the parallel loops if there are // any. The inner `ParalleOp` refers to the reduction loops and `ReduceOp` // contains the reduction operator. @@ -169,10 +170,10 @@ loop::ParallelOp MakeLoopOverShape(Location loc, Value shaped_value, // is roughly converted into: // // %init = load %init_buf[] : memref -// loop.parallel (%i, %k) = (%c0, %c0) to (%c100, %c5) step (%c1, %c1) { -// %result = loop.parallel (%j) = (%c0) to (%c10) step (%c1) init (%init) { +// scf.parallel (%i, %k) = (%c0, %c0) to (%c100, %c5) step (%c1, %c1) { +// %result = scf.parallel (%j) = (%c0) to (%c10) step (%c1) init (%init) { // %elem_to_reduce = load %buffer[%i, %j, %k] : memref<100x10x5xf32> -// loop.reduce(%elem_to_reduce) { +// scf.reduce(%elem_to_reduce) { // ^bb0(%elem: f32, %acc: f32): // no predecessors // elem_buf = alloc() : memref // store %elem, elem_buf[] : memref @@ -180,11 +181,11 @@ loop::ParallelOp MakeLoopOverShape(Location loc, Value shaped_value, // store %acc, acc_buf[] : memref // // %acc_result = load acc_buf[] : memref -// loop.reduce.return %acc_result : f32 +// scf.reduce.return %acc_result : f32 // } : f32 -// loop.yield +// scf.yield // } : f32 -// loop.yield +// scf.yield // } class ReduceOpConverter : public OpConversionPattern { public: @@ -196,7 +197,7 @@ class ReduceOpConverter : public OpConversionPattern { // TODO(b/137624192) Implement variadic reduce. if (xla_reduce_op.out().size() != 1) return failure(); - loop::ReduceOp reduce_op = + scf::ReduceOp reduce_op = CreateReduceOpInNestedParallelLoops(xla_reduce_op, &rewriter); ConvertToReductionOperator(xla_reduce_op.getLoc(), reduce_op, &xla_reduce_op.body().front(), &rewriter); @@ -205,26 +206,26 @@ class ReduceOpConverter : public OpConversionPattern { } private: - // Creates nested `loop.parallel` ops with `loop.reduce`. The outer ParallelOp + // Creates nested `scf.parallel` ops with `scf.reduce`. The outer ParallelOp // refers to the parallel dimensions of `xla_reduce_op` if any and the inner - // ParallelOp refers to the reduction dimensions. The loop.reduce op is + // ParallelOp refers to the reduction dimensions. The scf.reduce op is // returned. 
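As a reference for the reduction conversion described above, reducing a [D0][D1][D2] buffer along dimension 1 with an initial value can be written as ordinary loops; addition stands in for the op's reduction region, and the flat layout and function name are assumptions for illustration:

#include <vector>

// Reference semantics of the xla_lhlo.ReduceOp lowering sketched above.
std::vector<float> ReduceDim1(const std::vector<float>& in, int d0, int d1,
                              int d2, float init) {
  std::vector<float> out(d0 * d2, init);
  for (int i = 0; i < d0; ++i) {        // outer scf.parallel over (%i, %k)
    for (int k = 0; k < d2; ++k) {
      float acc = init;                 // %init loaded from the init buffer
      for (int j = 0; j < d1; ++j) {    // inner scf.parallel (%j) + scf.reduce
        float elem = in[(i * d1 + j) * d2 + k];
        acc = acc + elem;               // body of the reduction operator
      }
      out[i * d2 + k] = acc;
    }
  }
  return out;
}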
// // If the reduction argument is a memref<100x10x5xf32> and the // reduction is performed along dimension 1 then this method will generate // // %init = load %init_buf[] : memref - // loop.parallel (%i, %k) = (%c0, %c0) to (%c100, %c5) step (%c1, %c1) { - // %result = loop.parallel (%j) = (%c0) to (%c10) step (%c1) init (%init) { + // scf.parallel (%i, %k) = (%c0, %c0) to (%c100, %c5) step (%c1, %c1) { + // %result = scf.parallel (%j) = (%c0) to (%c10) step (%c1) init (%init) { // %elem_to_reduce = load %buffer[%i, %j, %k] : memref<100x10x5xf32> - // loop.reduce(%elem_to_reduce) { + // scf.reduce(%elem_to_reduce) { // // } : f32 - // loop.yield + // scf.yield // } : f32 - // loop.yield + // scf.yield // } - loop::ReduceOp CreateReduceOpInNestedParallelLoops( + scf::ReduceOp CreateReduceOpInNestedParallelLoops( xla_lhlo::ReduceOp xla_reduce_op, ConversionPatternRewriter* rewriter) const { auto loc = xla_reduce_op.getLoc(); @@ -253,13 +254,13 @@ class ReduceOpConverter : public OpConversionPattern { SmallVector init_value = { rewriter->create(loc, *xla_reduce_op.init_values().begin())}; // Outer ParallelOp is not needed if it is a reduction across all dims. - loop::ParallelOp outer; + scf::ParallelOp outer; if (!parallel_lower.empty()) { - outer = rewriter->create(loc, parallel_lower, - parallel_upper, parallel_step); + outer = rewriter->create(loc, parallel_lower, + parallel_upper, parallel_step); rewriter->setInsertionPointToStart(outer.getBody()); } - loop::ParallelOp inner = rewriter->create( + scf::ParallelOp inner = rewriter->create( loc, reduce_lower, reduce_upper, reduce_step, init_value); Value reduction_result = *inner.getResults().begin(); @@ -293,7 +294,7 @@ class ReduceOpConverter : public OpConversionPattern { rewriter->setInsertionPointToStart(inner.getBody()); Value elem = rewriter->create( loc, *xla_reduce_op.operands().begin(), indices); - return rewriter->create(loc, elem); + return rewriter->create(loc, elem); } }; @@ -313,8 +314,8 @@ class ReduceOpConverter : public OpConversionPattern { // accumulator = reduction_operator(output[O], value) // output[O] = accumulator // -// Converts `xla_lhlo.ReduceWindowOp` into two loop::ParallelOp and a -// loop::ReduceOp. +// Converts `xla_lhlo.ReduceWindowOp` into two scf::ParallelOp and a +// scf::ReduceOp. // The outper `ParallelOp` refers to the parallel loops that traverese output // buffer. The inner `ParalleOp` refers to the reduction loops that traverse // reduction windows and `ReduceOp` contains the reduction operator. 
@@ -340,20 +341,20 @@ class ReduceOpConverter : public OpConversionPattern { // is roughly converted into: // // %neutral_elem = load %init_buf[] : memref -// loop.parallel (%i, %j) = (%c0, %c0) to (%c56, %c56) step (%c1, %c1) { -// %result = loop.parallel (%iw, %jw) = (%c0, %c0) +// scf.parallel (%i, %j) = (%c0, %c0) to (%c56, %c56) step (%c1, %c1) { +// %result = scf.parallel (%iw, %jw) = (%c0, %c0) // to (%c3, %c3) step (%c1, %c1) neutral_elem (%0) -> f32 { // %in_bounds = // %elem = load %operand[%computed_i, %computed_j] // %elem_or_neutral = select %in_bounds, %elem, %neutral_elem : f32 -// loop.reduce(%elem_to_reduce) : f32 { +// scf.reduce(%elem_to_reduce) : f32 { // ^bb0(%arg7: f32, %arg8: f32): // // } -// loop.yield +// scf.yield // } // store %result, %output_buffer[%i, %j] : memref<56x56xf32> -// loop.yield +// scf.yield // } // return // } @@ -365,12 +366,12 @@ class ReduceWindowOpConverter LogicalResult matchAndRewrite( xla_lhlo::ReduceWindowOp xla_reduce_window_op, ArrayRef /*args*/, ConversionPatternRewriter& rewriter) const final { - loop::ParallelOp output_loop, window_loop; + scf::ParallelOp output_loop, window_loop; std::tie(output_loop, window_loop) = CreateParallelLoopsToTraverseOutputAndWindow(xla_reduce_window_op, &rewriter); - loop::ReduceOp reduce_op = CreateReduceOpInNestedParallelLoops( + scf::ReduceOp reduce_op = CreateReduceOpInNestedParallelLoops( xla_reduce_window_op, output_loop, window_loop, &rewriter); ConvertToReductionOperator(xla_reduce_window_op.getLoc(), reduce_op, @@ -380,7 +381,7 @@ class ReduceWindowOpConverter } private: - std::pair + std::pair CreateParallelLoopsToTraverseOutputAndWindow( xla_lhlo::ReduceWindowOp xla_reduce_window_op, ConversionPatternRewriter* rewriter) const { @@ -404,7 +405,7 @@ class ReduceWindowOpConverter window_upper.push_back( rewriter->create(loc, window_dim.getSExtValue())); } - auto window_loop = rewriter->create( + auto window_loop = rewriter->create( loc, window_lower, window_upper, window_step, init_value); Value reduction_result = *window_loop.getResults().begin(); @@ -413,9 +414,9 @@ class ReduceWindowOpConverter return std::make_pair(output_loop, window_loop); } - loop::ReduceOp CreateReduceOpInNestedParallelLoops( + scf::ReduceOp CreateReduceOpInNestedParallelLoops( xla_lhlo::ReduceWindowOp xla_reduce_window_op, - loop::ParallelOp output_loop, loop::ParallelOp window_loop, + scf::ParallelOp output_loop, scf::ParallelOp window_loop, ConversionPatternRewriter* rewriter) const { rewriter->setInsertionPointToStart(window_loop.getBody()); auto loc = xla_reduce_window_op.getLoc(); @@ -430,24 +431,263 @@ class ReduceWindowOpConverter Value xla_operand = xla_reduce_window_op.operand(); auto xla_operand_type = xla_operand.getType().cast(); + // Compute ivs in 'arg' buffer and whether these ivs are in pad area or not. 
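A 1-D reference for the window traversal above, assuming a max reduction and the usual padded output-size formula; the essential line is the index mapping idx = i * stride + w - pad_low, with out-of-bounds positions contributing the neutral element, just as the select between %elem and %neutral_elem does in the generated loops:

#include <algorithm>
#include <vector>

// 1-D reduce-window reference (max reduction assumed for illustration).
std::vector<float> ReduceWindowMax1D(const std::vector<float>& operand,
                                     int window, int stride, int pad_low,
                                     int pad_high, float neutral) {
  int n = static_cast<int>(operand.size());
  int out_size = (n + pad_low + pad_high - window) / stride + 1;
  std::vector<float> result(out_size, neutral);
  for (int i = 0; i < out_size; ++i) {     // loop over the output (scf.parallel)
    float acc = neutral;
    for (int w = 0; w < window; ++w) {     // loop over the window
      int idx = i * stride + w - pad_low;  // MapWindowIvsToInput
      bool in_bounds = idx >= 0 && idx < n;
      float elem = in_bounds ? operand[idx] : neutral;
      acc = std::max(acc, elem);           // reduction operator (max here)
    }
    result[i] = acc;
  }
  return result;
}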
MappedIvs mapped_ivs = MapWindowIvsToInput( xla_reduce_window_op, output_loop.getInductionVars(), window_loop.getInductionVars(), rewriter); - auto elem_or_init = rewriter->create( + auto elem_or_init = rewriter->create( loc, xla_operand_type.getElementType(), mapped_ivs.in_bounds, /*withElseRegion=*/true); OpBuilder then_builder = elem_or_init.getThenBodyBuilder(); Value elem = then_builder.create( loc, xla_reduce_window_op.operand(), mapped_ivs.ivs); - then_builder.create(loc, elem); + then_builder.create(loc, elem); OpBuilder else_builder = elem_or_init.getElseBodyBuilder(); - else_builder.create(loc, *window_loop.initVals().begin()); + else_builder.create(loc, *window_loop.initVals().begin()); - return rewriter->create(loc, - *elem_or_init.results().begin()); + return rewriter->create(loc, + *elem_or_init.results().begin()); + } +}; + +// See the operation semantics in +// https://www.tensorflow.org/xla/operation_semantics#selectandscatter +// +// Pseudocode: +// scf.parallel(coordinates O in the output): +// output[O] = init +// scf.parallel(coordinates S in the source): +// selected_ivs = 0 +// selected_val = 0 +// initialized_flag = false +// scf.for (first dim W_1 in the window) +// iter_args (selected_ivs, selected_val, initialized_flag): +// ... +// scf.for (last dim W_N in the window): +// iter_args (selected_ivs, selected_val, initialized_flag): +// I = S * stride + W - pad_low +// if I within bounds of operand: +// if (initialized_flag): +// pred = select(selected_value, operand(I))): +// if (pred) +// selected_value = operand(I) +// selected_index = I +// else +// selected_value = operand(I) +// selected_index = I +// initialized_flag = true +// output(selected_index) = scatter(output(selected_index), source(S)) +class SelectAndScatterOpConverter + : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite( + xla_lhlo::SelectAndScatterOp s_and_s_op, ArrayRef /*args*/, + ConversionPatternRewriter& rewriter) const final { + auto loc = s_and_s_op.getLoc(); + InitializeOutput(s_and_s_op, &rewriter); + scf::ParallelOp loop_over_src = + MakeLoopOverShape(loc, s_and_s_op.source(), &rewriter); + rewriter.setInsertionPointToStart(loop_over_src.getBody()); + + // Compute indices of the selected element in the window. + auto selected_ivs = SelectIvs(s_and_s_op, loop_over_src, &rewriter); + + // Load `source[selected_ivs]`. + auto src_elem = rewriter.create(loc, s_and_s_op.source(), + loop_over_src.getInductionVars()); + + // Compute `out[selected_ivs]` = scatter(out[selected_ivs], src_element)`. 
+ auto rmw = rewriter.create(loc, s_and_s_op.out(), + selected_ivs); + OpBuilder rmw_builder = OpBuilder::atBlockEnd(rmw.getBody()); + auto acc_result = + ApplySingleResultLhloCode(loc, {src_elem, rmw.getCurrentValue()}, + &s_and_s_op.scatter().front(), &rmw_builder); + rmw_builder.create(loc, acc_result); + + rewriter.replaceOp(s_and_s_op, llvm::None); + return success(); + } + + private: + void InitializeOutput(xla_lhlo::SelectAndScatterOp s_and_s_op, + OpBuilder* b) const { + auto loc = s_and_s_op.getLoc(); + Value init_value = b->create(loc, s_and_s_op.init_value()); + + scf::ParallelOp loop_over_output = + MakeLoopOverShape(loc, s_and_s_op.out(), b); + OpBuilder::InsertionGuard guard(*b); + b->setInsertionPointToStart(loop_over_output.getBody()); + b->create(loc, init_value, s_and_s_op.out(), + loop_over_output.getInductionVars()); + } + + struct WindowLoops { + SmallVector selected_ivs; + SmallVector window_ivs; + scf::ForOp inner_loop; + }; + WindowLoops InsertWindowLoops(xla_lhlo::SelectAndScatterOp s_and_s_op, + scf::ParallelOp loop_over_src, + OpBuilder* b) const { + auto loc = s_and_s_op.getLoc(); + Value zero = b->create(loc, 0); + Value one = b->create(loc, 1); + + auto element_type = + s_and_s_op.out().getType().cast().getElementType(); + auto rank = loop_over_src.getNumLoops(); + + // `iter_args` = [iv_1, ..., iv_N, selected_value, is_initialized] + SmallVector iter_args(rank, zero); + iter_args.push_back(b->create( + loc, element_type, b->getFloatAttr(element_type, 0))); + iter_args.push_back(b->create( + loc, b->getI1Type(), b->getIntegerAttr(b->getI1Type(), 0))); + + // Create a nested loop that traverses the window. + OpBuilder::InsertPoint ip; + WindowLoops result; + for (const auto& window_dim : + s_and_s_op.window_dimensions()->getIntValues()) { + Value upper = b->create(loc, window_dim.getSExtValue()); + result.inner_loop = + b->create(loc, zero, upper, one, iter_args); + if (b->getInsertionBlock() == loop_over_src.getBody()) { + ip = b->saveInsertionPoint(); + result.selected_ivs = result.inner_loop.getResults().take_front(rank); + } else { + b->create(loc, result.inner_loop.getResults()); + } + b->setInsertionPointToStart(result.inner_loop.getBody()); + iter_args = ValueRange{result.inner_loop.getRegionIterArgs()}; + result.window_ivs.push_back(result.inner_loop.getInductionVar()); + } + b->restoreInsertionPoint(ip); + return result; + } + + // Adapter to store iteration arguments of sequential loops that perform + // select in a window. + class IterArgs { + public: + explicit IterArgs(ValueRange ivs_val_flag) : ivs_val_flag_(ivs_val_flag) {} + IterArgs(ValueRange ivs, Value value, Value flag) { + ivs_val_flag_ = ivs; + ivs_val_flag_.push_back(value); + ivs_val_flag_.push_back(flag); + } + + ArrayRef to_vector() const { return ivs_val_flag_; } + + // Indices of the currently selected value. + ArrayRef ivs() const { return to_vector().drop_back(2); } + // Currently selected value w.r.t. select() function. + Value value() const { return ivs_val_flag_.end()[-2]; } + // i1 flag if value() and ivs() were initialized. + Value is_init() const { return ivs_val_flag_.back(); } + + private: + // Vector that stores iv_1, ..., iv_N, value, init. 
+ SmallVector ivs_val_flag_; + }; + + SmallVector SelectIvs(xla_lhlo::SelectAndScatterOp s_and_s_op, + scf::ParallelOp loop_over_src, + OpBuilder* b) const { + auto loc = s_and_s_op.getLoc(); + + WindowLoops window_loops = InsertWindowLoops(s_and_s_op, loop_over_src, b); + auto inner_loop_b = + OpBuilder::atBlockEnd(window_loops.inner_loop.getBody()); + + // Compute ivs in 'arg' buffer and whether these ivs are in the pad area. + MappedIvs mapped_ivs = + MapWindowIvsToInput(s_and_s_op, loop_over_src.getInductionVars(), + window_loops.window_ivs, &inner_loop_b); + + IterArgs ivs_val_flag(window_loops.inner_loop.getRegionIterArgs()); + + auto if_in_bounds = inner_loop_b.create( + loc, window_loops.inner_loop.getResultTypes(), mapped_ivs.in_bounds, + /*withElseRegion=*/true); + + // Case when we are inside boundaries of 'arg' and not in the pad area. + { + OpBuilder in_bounds_then_b = if_in_bounds.getThenBodyBuilder(); + auto select_or_init_results = SelectOrInitialize( + s_and_s_op, mapped_ivs.ivs, &ivs_val_flag, &in_bounds_then_b); + in_bounds_then_b.create(loc, select_or_init_results); + } + + // Case when we are in the pad. + { + OpBuilder in_bounds_else_b = if_in_bounds.getElseBodyBuilder(); + in_bounds_else_b.create(loc, ivs_val_flag.to_vector()); + } + + inner_loop_b.create(loc, if_in_bounds.getResults()); + return window_loops.selected_ivs; + } + + SmallVector SelectOrInitialize( + xla_lhlo::SelectAndScatterOp s_and_s_op, ArrayRef operand_ivs, + IterArgs* ivs_val_flag, OpBuilder* b) const { + auto loc = s_and_s_op.getLoc(); + Value true_i1 = b->create( + loc, b->getI1Type(), b->getIntegerAttr(b->getI1Type(), 1)); + + TypeRange iter_arg_types{ivs_val_flag->to_vector()}; + Value operand_elem = + b->create(loc, s_and_s_op.operand(), operand_ivs); + auto if_init = + b->create(loc, iter_arg_types, ivs_val_flag->is_init(), + /*withElseRegion=*/true); + // Init == true, i.e. iter args are already initialized with a selected + // element in boundaries of the operand. Select function has to be computed + // here. + { + OpBuilder if_init_then_b = if_init.getThenBodyBuilder(); + + auto& lhlo_select = s_and_s_op.select().front(); + Value pred = + ApplySingleResultLhloCode(loc, {operand_elem, ivs_val_flag->value()}, + &lhlo_select, &if_init_then_b); + + auto if_pred = if_init_then_b.create(loc, iter_arg_types, pred, + /*withElseRegion=*/true); + + // Pred == true, therefore pack newly selected ivs, val and init flag back + // to iter_args and return. + { + OpBuilder if_pred_then_b = if_pred.getThenBodyBuilder(); + if_pred_then_b.create( + loc, IterArgs{operand_ivs, operand_elem, true_i1}.to_vector()); + } + + // Pred == false, therefore return old iter_args. + { + OpBuilder if_pred_else_b = if_pred.getElseBodyBuilder(); + if_pred_else_b.create(loc, ivs_val_flag->to_vector()); + } + + if_init_then_b.create(loc, if_pred.getResults()); + } + // Init == false, i.e. only pad was visited before and this is the first + // element in the boundaries of the operand. 
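Putting the pieces of the pseudocode together, here is a 1-D reference of select-and-scatter, assuming select = "greater or equal" and scatter = addition (the pairing typically used for max-pool gradients; the real op receives both as regions). Names and the 1-D restriction are illustrative assumptions:

#include <vector>

// 1-D select-and-scatter reference matching the pseudocode above.
std::vector<float> SelectAndScatter1D(const std::vector<float>& operand,
                                      const std::vector<float>& source,
                                      int window, int stride, int pad_low,
                                      float init) {
  int n = static_cast<int>(operand.size());
  std::vector<float> out(n, init);                     // InitializeOutput
  for (int s = 0; s < static_cast<int>(source.size()); ++s) {
    int selected_idx = 0;
    float selected_val = 0.0f;
    bool initialized = false;                          // is_initialized iter_arg
    for (int w = 0; w < window; ++w) {                 // nested window loops
      int idx = s * stride + w - pad_low;              // I = S * stride + W - pad_low
      if (idx < 0 || idx >= n) continue;               // skip the pad area
      if (!initialized || operand[idx] >= selected_val) {
        selected_val = operand[idx];                   // SelectOrInitialize
        selected_idx = idx;
        initialized = true;
      }
    }
    if (initialized) out[selected_idx] += source[s];   // scatter via the RMW step
  }
  return out;
}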
+ { + OpBuilder if_init_else_b = if_init.getElseBodyBuilder(); + + if_init_else_b.create( + loc, IterArgs{operand_ivs, operand_elem, true_i1}.to_vector()); + } + return if_init.getResults(); } }; @@ -460,15 +700,16 @@ struct LhloLegalizeToParallelLoops // clang-format off patterns.insert< ReduceOpConverter, - ReduceWindowOpConverter + ReduceWindowOpConverter, + SelectAndScatterOpConverter >(func.getContext()); // clang-format on ConversionTarget target(getContext()); target.addLegalDialect(); - target.addIllegalOp(); - target.addIllegalOp(); + scf::SCFDialect, XlaLhloDialect>(); + target.addIllegalOp(); if (failed(applyPartialConversion(func, target, patterns, nullptr))) { signalPassFailure(); diff --git a/tensorflow/compiler/mlir/xla/transforms/lower_complex_patterns.td b/tensorflow/compiler/mlir/xla/transforms/lower_complex_patterns.td index dcb0ab20e9e..e1ae5ef6abf 100644 --- a/tensorflow/compiler/mlir/xla/transforms/lower_complex_patterns.td +++ b/tensorflow/compiler/mlir/xla/transforms/lower_complex_patterns.td @@ -28,70 +28,62 @@ include "tensorflow/compiler/mlir/xla/ir/hlo_ops.td" // and imaginary components. foreach elementwiseOp = [HLO_AddOp, HLO_SubOp] in def : Pat<(elementwiseOp HLO_ComplexTensor:$lhs, - HLO_ComplexTensor:$rhs, $broadcast_dimensions), + HLO_ComplexTensor:$rhs), (HLO_ComplexOp - (elementwiseOp (HLO_RealOp $lhs), (HLO_RealOp $rhs), - $broadcast_dimensions), - (elementwiseOp (HLO_ImagOp $lhs), (HLO_ImagOp $rhs), - $broadcast_dimensions))>; + (elementwiseOp (HLO_RealOp $lhs), (HLO_RealOp $rhs)), + (elementwiseOp (HLO_ImagOp $lhs), (HLO_ImagOp $rhs)))>; // Complex multiplication results in a cross product multiplication between the // real and imaginary components such that: // result.real = lhs.real * rhs.real - lhs.imag * rhs.imag // result.imag = lhs.imag * rhs.real + lhs.real * rhs.imag def : Pat<(HLO_MulOp HLO_ComplexTensor:$lhs, - HLO_ComplexTensor:$rhs, $broadcast_dimensions), + HLO_ComplexTensor:$rhs), (HLO_ComplexOp (HLO_SubOp (HLO_MulOp (HLO_RealOp:$lhs_real $lhs), - (HLO_RealOp:$rhs_real $rhs), - $broadcast_dimensions), + (HLO_RealOp:$rhs_real $rhs)), (HLO_MulOp (HLO_ImagOp:$lhs_imag $lhs), - (HLO_ImagOp:$rhs_imag $rhs), - $broadcast_dimensions), - (NullDenseIntElementsAttr)), + (HLO_ImagOp:$rhs_imag $rhs))), (HLO_AddOp - (HLO_MulOp $lhs_real, $rhs_imag, $broadcast_dimensions), - (HLO_MulOp $lhs_imag, $rhs_real, $broadcast_dimensions), - (NullDenseIntElementsAttr)))>; + (HLO_MulOp $lhs_real, $rhs_imag), + (HLO_MulOp $lhs_imag, $rhs_real)))>; // Multiplication between a complex and real tensor can be distributed by // applying the real multiplicant to both the real and complex component. // // Note that the sourcep pattern is not legal according to the HLO dialect but // instead handle intermediates generated by other patterns. 
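The cross-product form quoted above can be checked against std::complex directly; this sketch is only a numerical illustration of the formula, not part of the lowering:

#include <complex>
#include <cstdio>

// result.real = lhs.real * rhs.real - lhs.imag * rhs.imag
// result.imag = lhs.imag * rhs.real + lhs.real * rhs.imag
std::complex<float> ComplexMul(std::complex<float> lhs,
                               std::complex<float> rhs) {
  float real = lhs.real() * rhs.real() - lhs.imag() * rhs.imag();
  float imag = lhs.imag() * rhs.real() + lhs.real() * rhs.imag();
  return {real, imag};
}

int main() {
  std::complex<float> a(1.0f, 2.0f), b(3.0f, -4.0f);
  auto ours = ComplexMul(a, b);
  auto ref = a * b;
  std::printf("(%g, %g) vs (%g, %g)\n", ours.real(), ours.imag(), ref.real(),
              ref.imag());  // both print (11, 2)
  return 0;
}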
-def : Pat<(HLO_MulOp HLO_ComplexTensor:$lhs, HLO_IntOrFpTensor:$rhs, $broadcast_dimensions), +def : Pat<(HLO_MulOp HLO_ComplexTensor:$lhs, HLO_IntOrFpTensor:$rhs), (HLO_ComplexOp - (HLO_MulOp (HLO_RealOp $lhs), $rhs, $broadcast_dimensions), - (HLO_MulOp (HLO_ImagOp $lhs), $rhs, $broadcast_dimensions))>; + (HLO_MulOp (HLO_RealOp $lhs), $rhs), + (HLO_MulOp (HLO_ImagOp $lhs), $rhs))>; -def : Pat<(HLO_MulOp HLO_IntOrFpTensor:$lhs, HLO_ComplexTensor:$rhs, $broadcast_dimensions), +def : Pat<(HLO_MulOp HLO_IntOrFpTensor:$lhs, HLO_ComplexTensor:$rhs), (HLO_ComplexOp - (HLO_MulOp $lhs, (HLO_RealOp $rhs), $broadcast_dimensions), - (HLO_MulOp $lhs, (HLO_ImagOp $rhs), $broadcast_dimensions))>; + (HLO_MulOp $lhs, (HLO_RealOp $rhs)), + (HLO_MulOp $lhs, (HLO_ImagOp $rhs)))>; // Division is performed by normalizing the denominator by multiplying by the // conjugate of the rhs. // numerator = lhs * conj(rhs) // denominator = rhs * conj(rhs) -def : Pat<(HLO_DivOp HLO_ComplexTensor:$lhs, HLO_ComplexTensor:$rhs, $broadcast_dimensions), +def : Pat<(HLO_DivOp HLO_ComplexTensor:$lhs, HLO_ComplexTensor:$rhs), (HLO_DivOp (HLO_MulOp:$num $lhs, (HLO_ComplexOp:$conj (HLO_RealOp $rhs), - (HLO_NegOp (HLO_ImagOp $rhs))), - $broadcast_dimensions), - (HLO_RealOp:$den (HLO_MulOp $rhs, $conj, $broadcast_dimensions)), - (BinBroadcastDimensions $num, $den))>; + (HLO_NegOp (HLO_ImagOp $rhs)))), + (HLO_RealOp:$den (HLO_MulOp $rhs, $conj)))>; -def : Pat<(HLO_DivOp HLO_ComplexTensor:$lhs, HLO_IntOrFpTensor:$rhs, $broadcast_dimensions), +def : Pat<(HLO_DivOp HLO_ComplexTensor:$lhs, HLO_IntOrFpTensor:$rhs), (HLO_ComplexOp - (HLO_DivOp (HLO_RealOp $lhs), $rhs, $broadcast_dimensions), - (HLO_DivOp (HLO_ImagOp $lhs), $rhs, $broadcast_dimensions))>; + (HLO_DivOp (HLO_RealOp $lhs), $rhs), + (HLO_DivOp (HLO_ImagOp $lhs), $rhs))>; // Absolute value is evaluated as: @@ -100,11 +92,8 @@ def : Pat<(HLO_AbsOp HLO_ComplexTensor:$val), (HLO_ComplexOp (HLO_SqrtOp (HLO_AddOp - (HLO_MulOp (HLO_RealOp:$real $val), $real, - (NullDenseIntElementsAttr)), - (HLO_MulOp (HLO_ImagOp:$imag $val), $imag, - (NullDenseIntElementsAttr)), - (NullDenseIntElementsAttr))), + (HLO_MulOp (HLO_RealOp:$real $val), $real), + (HLO_MulOp (HLO_ImagOp:$imag $val), $imag))), (HLO_ConstOp (ConstantSplat<"0"> $real)))>; // Exponential can be lowered to an exponential on the real component and a @@ -117,5 +106,4 @@ def : Pat<(HLO_ExpOp HLO_ComplexTensor:$val), (HLO_ExpOp (HLO_RealOp $val)), (HLO_ComplexOp (HLO_CosOp (HLO_ImagOp:$imag $val)), - (HLO_SinOp $imag)), - (NullDenseIntElementsAttr))>; + (HLO_SinOp $imag)))>; diff --git a/tensorflow/compiler/mlir/xla/transforms/map_hlo_to_lhlo_op.h b/tensorflow/compiler/mlir/xla/transforms/map_hlo_to_lhlo_op.h index 9d04e82430d..21b954a3eb4 100644 --- a/tensorflow/compiler/mlir/xla/transforms/map_hlo_to_lhlo_op.h +++ b/tensorflow/compiler/mlir/xla/transforms/map_hlo_to_lhlo_op.h @@ -44,22 +44,27 @@ MAP_HLO_TO_LHLO(BroadcastInDimOp); MAP_HLO_TO_LHLO(CeilOp); MAP_HLO_TO_LHLO(ConstOp); MAP_HLO_TO_LHLO(CompareOp); +MAP_HLO_TO_LHLO(ComplexOp); MAP_HLO_TO_LHLO(ConvertOp); MAP_HLO_TO_LHLO(CopyOp); MAP_HLO_TO_LHLO(CosOp); MAP_HLO_TO_LHLO(DivOp); +MAP_HLO_TO_LHLO(DotOp); MAP_HLO_TO_LHLO(ExpOp); +MAP_HLO_TO_LHLO(ImagOp); MAP_HLO_TO_LHLO(IotaOp); MAP_HLO_TO_LHLO(LogOp); MAP_HLO_TO_LHLO(MaxOp); MAP_HLO_TO_LHLO(MinOp); MAP_HLO_TO_LHLO(MulOp); MAP_HLO_TO_LHLO(NegOp); +MAP_HLO_TO_LHLO(RealOp); MAP_HLO_TO_LHLO(ReduceOp); MAP_HLO_TO_LHLO(RemOp); MAP_HLO_TO_LHLO(RsqrtOp); MAP_HLO_TO_LHLO(SelectOp); MAP_HLO_TO_LHLO(SignOp); +MAP_HLO_TO_LHLO(SinOp); 
MAP_HLO_TO_LHLO(SqrtOp); MAP_HLO_TO_LHLO(SubOp); MAP_HLO_TO_LHLO(TanhOp); diff --git a/tensorflow/compiler/mlir/xla/transforms/map_xla_to_scalar_op.h b/tensorflow/compiler/mlir/xla/transforms/map_xla_to_scalar_op.h index 8296011bf54..c317dc36b3c 100644 --- a/tensorflow/compiler/mlir/xla/transforms/map_xla_to_scalar_op.h +++ b/tensorflow/compiler/mlir/xla/transforms/map_xla_to_scalar_op.h @@ -227,6 +227,28 @@ inline Value MapLhloOpToStdScalarOp( loc, result_types, args, b); } +template <> +inline Value MapLhloOpToStdScalarOp( + Location loc, ArrayRef result_types, ArrayRef args, + OpBuilder* b) { + return MapLhloOpToStdScalarOpImpl{}(loc, result_types, args, + b); +} + +template <> +inline Value MapLhloOpToStdScalarOp( + Location loc, ArrayRef result_types, ArrayRef args, + OpBuilder* b) { + return MapLhloOpToStdScalarOpImpl{}(loc, result_types, args, b); +} + +template <> +inline Value MapLhloOpToStdScalarOp( + Location loc, ArrayRef result_types, ArrayRef args, + OpBuilder* b) { + return MapLhloOpToStdScalarOpImpl{}(loc, result_types, args, b); +} + template <> inline Value MapLhloOpToStdScalarOp( Location loc, ArrayRef result_types, ArrayRef args, @@ -259,11 +281,9 @@ inline Value MapLhloOpToStdScalarOp( // No conversion is needed for the same width integers return args.front(); } - // TODO(dfki-ehna): Add other primitive type conversions - // if (mlir::FpToSiOp::areCastCompatible(sourceType, targetType)) { - // return b.create(loc, result_types, - // args,mlir::None); - // } + if (mlir::FPToSIOp::areCastCompatible(sourceType, targetType)) { + return b->create(loc, result_types, args, mlir::None); + } return nullptr; } @@ -275,6 +295,14 @@ inline Value MapLhloOpToStdScalarOp( loc, result_types, args, b); } +template <> +inline Value MapLhloOpToStdScalarOp( + Location loc, ArrayRef result_types, ArrayRef args, + OpBuilder* b) { + return MapLhloOpToStdScalarOpImpl{}( + loc, result_types, args, b); +} + /// Implements the conversion of XLA op to scalar op (to use within region of a /// linalg.generic op) for compare-select style operations like min/max. template diff --git a/tensorflow/compiler/mlir/xla/transforms/materialize_broadcasts.cc b/tensorflow/compiler/mlir/xla/transforms/materialize_broadcasts.cc index 237cac64ffd..c56f5adc12d 100644 --- a/tensorflow/compiler/mlir/xla/transforms/materialize_broadcasts.cc +++ b/tensorflow/compiler/mlir/xla/transforms/materialize_broadcasts.cc @@ -28,281 +28,43 @@ namespace xla_hlo { namespace { -// Returns a 1-d i64 elements attribute populated with numbers from start to -// end, excluding. -static DenseIntElementsAttr GetI64ElementsAttrForSeq(int start, int end, - Builder *builder) { - int size = end - start; +// Converts ClampOp with broadcast semantics. ClampOp requires "all three arrays +// must be the same shape. Alternatively, as a restricted form of broadcasting, +// min and/or max can be a scalar of type T." +struct ClampWithBroadcastConvert : public OpRewritePattern { + explicit ClampWithBroadcastConvert(MLIRContext *context) + : OpRewritePattern(context) {} - SmallVector vals; - vals.resize(size); - std::iota(vals.begin(), vals.end(), start); - - TensorType ty = RankedTensorType::get({size}, builder->getIntegerType(64)); - return DenseIntElementsAttr::get(ty, vals); -} - -// Helper function for OpRewritePattern classes to materialize broadcasts on -// LHS and RHS arguments to a binary op. -// -// Returns true and sets out_lhs and out_rhs to BroadcastInDimOps if successful, -// returns false otherwise. 
-template -bool CreateBroadcastsForBinaryOp(SrcOp op, PatternRewriter *rewriter, - Value *out_lhs, Value *out_rhs) { - if (!op.broadcast_dimensions().hasValue()) { - // Note: the op may still have an implicit broadcast on it, such as - // for (tensor<1xf32>, tensor<4xf32>). - return false; - } - - // Insert BroadcastInDimOps for the left-hand-side and right-hand-side args, - // replacing the original LHS and RHS args in the source op with the results - // of the broadcasts. - // - // If the higher dimensional argument does not actually need the broadcast, - // a canonicalization pass should be able to remove that op later. - Value lhs = op.lhs(); - Value rhs = op.rhs(); - - auto op_ranked_type = op.getType().template dyn_cast(); - auto lhs_ranked_type = lhs.getType().dyn_cast(); - auto rhs_ranked_type = rhs.getType().dyn_cast(); - if (!op_ranked_type || !lhs_ranked_type || !rhs_ranked_type) { - // Unranked, can't determine at this point how to perform the broadcast. - return false; - } - - // Dynamic result shape, can't use BroadcastInDimOp. - assert(op_ranked_type.hasStaticShape() && - "dynamic shape requires DynamicBroadcastInDim"); - - auto lhs_rank = lhs_ranked_type.getRank(); - auto rhs_rank = rhs_ranked_type.getRank(); - - // Set broadcast_dimensions to [0, ..., rank] for the higher rank arg. - // Use the original op.broadcast_dimensions for the lower rank arg. - auto higher_rank_broadcast_dims = - GetI64ElementsAttrForSeq(0, std::max(lhs_rank, rhs_rank), rewriter); - DenseIntElementsAttr lhs_broadcast_dims; - DenseIntElementsAttr rhs_broadcast_dims; - if (lhs_rank > rhs_rank) { - lhs_broadcast_dims = higher_rank_broadcast_dims; - rhs_broadcast_dims = op.broadcast_dimensions().getValue(); - } else if (lhs_rank < rhs_rank) { - lhs_broadcast_dims = op.broadcast_dimensions().getValue(); - rhs_broadcast_dims = higher_rank_broadcast_dims; - } else { - // This shouldn't happen for legal ops. If the broadcast_dimensions - // attribute is set, the ranks should be different. - // TODO(scotttodd): Add a custom verification for ops and assert here. - return false; - } - - // BroadcastInDimOp must have the same element type for operands and results, - // so preserve the original output shape and the original input element type. - // For example, `SrcOp (tensor<1x4xf32>, tensor<4xf32>) -> tensor<1x4xi1>`: - // broadcast_in_dim (tensor<1x4xf32>) -> tensor<1x4xf32> - // broadcast_in_dim (tensor<4xf32>) -> tensor<1x4xf32> - // SrcOp (tensor<1x4xf32>, tensor<1x4xf32>) -> tensor<1x4xi1> - ArrayRef op_shape = op_ranked_type.getShape(); - auto lhs_type = - RankedTensorType::get(op_shape, lhs_ranked_type.getElementType()); - auto rhs_type = - RankedTensorType::get(op_shape, rhs_ranked_type.getElementType()); - - *out_lhs = rewriter->createOrFold(op.getLoc(), lhs_type, - lhs, lhs_broadcast_dims); - *out_rhs = rewriter->createOrFold(op.getLoc(), rhs_type, - rhs, rhs_broadcast_dims); - return true; -} - -// Helper template to generate code for computing the result shape of a -// broadcasted operation. This ultimately should be subsumed by functions -// from the shape dialect. -// Assumes that large and small are the operand values of `op` and that they -// have a ranked tensory type with rank(large) >= rank(small). 
-template -std::vector ComputeBroadcastedShape(SrcOp op, Value small, Value large, - PatternRewriter *rewriter) { - auto loc = op.getLoc(); - auto larger_ranked_type = large.getType().cast(); - auto output_rank = larger_ranked_type.getRank(); - - constexpr int kExpandShape = -1; - - std::vector shape_values; - shape_values.reserve(output_rank); - std::vector indexes(output_rank, kExpandShape); - DenseIntElementsAttr broadcast_dimensions = - op.broadcast_dimensions().getValue(); - // Compute a mapping from output dimensions to their corresponding input - // dimensions in the smaller ranked operand. - for (auto pair : llvm::enumerate(broadcast_dimensions.getIntValues())) { - indexes.at(pair.value().getLimitedValue()) = pair.index(); - } - - // Compute the broadcasted shape of the result using numpy style broadcasting - // semantics. The result shape at a position is the shape of the larger - // operand at that position if the no dimension of the smaller operand is - // mapped to it. - // If both operands contribute to an output dimension, their shape has to - // either be the same in that dimension or it can be 1, in which case the - // shape of the other operand is used. - for (int i = 0; i < output_rank; ++i) { - Value index_value; - if (indexes[i] == kExpandShape) { - // The smaller shape gets expanded to the larger one in this case. - index_value = rewriter->create(loc, large, i); - } else { - // Compute the result shape depending on whether the rank of smaller is 1. - // This does not check that the broadcast operation actualy is correct. - // In particular, we do not check that both shapes are the same if the - // smaller ranked shape is not 1. - ConstantOp one = rewriter->create( - loc, rewriter->getIntegerAttr(rewriter->getIndexType(), 1)); - DimOp lrg_dim = rewriter->create(loc, large, i); - DimOp sml_dim = rewriter->create(loc, small, indexes[i]); - CmpIOp compare = - rewriter->create(loc, CmpIPredicate::eq, lrg_dim, one); - index_value = - rewriter->create(loc, compare, lrg_dim, sml_dim); - } - // Ideally, we would like to keep this on index but MLIR does not allow - // this. - shape_values.push_back(rewriter->create( - loc, index_value, rewriter->getIntegerType(32))); - } - - return shape_values; -} - -// Helper function for OpRewritePattern classes to materialize dynamic -// broadcasts on LHS and RHS arguments to a binary op. -// -// Returns true and set out_lhs and out_rhs for materialized dynamic broadcasts -// for LHS and RHS arguments, else returns false. -template -bool CreateDynamicBroadcastsForBinaryOp(SrcOp op, PatternRewriter *rewriter, - Value *out_lhs, Value *out_rhs) { - if (!op.broadcast_dimensions().hasValue()) { - // Note: the op may still have an implicit broadcast on it, such as - // for (tensor<1xf32>, tensor<4xf32>). - return false; - } - - // Insert BroadcastInDimOps for the left-hand-side and right-hand-side args, - // replacing the original LHS and RHS args in the source op with the results - // of the broadcasts. - Value lhs = op.lhs(); - Value rhs = op.rhs(); - - auto lhs_ranked_type = lhs.getType().dyn_cast(); - auto rhs_ranked_type = rhs.getType().dyn_cast(); - if (!lhs_ranked_type || !rhs_ranked_type) { - // Unranked, can't determine at this point how to perform the broadcast. - return false; - } - - auto lhs_rank = lhs_ranked_type.getRank(); - auto rhs_rank = rhs_ranked_type.getRank(); - - // Set broadcast_dimensions to [0, ..., rank] for the higher rank arg. - // Use the original op.broadcast_dimensions for the lower rank arg. 
- auto higher_rank_broadcast_dims = - GetI64ElementsAttrForSeq(0, std::max(lhs_rank, rhs_rank), rewriter); - DenseIntElementsAttr lhs_broadcast_dims; - DenseIntElementsAttr rhs_broadcast_dims; - std::vector shape_elements; - if (lhs_rank > rhs_rank) { - lhs_broadcast_dims = higher_rank_broadcast_dims; - rhs_broadcast_dims = op.broadcast_dimensions().getValue(); - shape_elements = ComputeBroadcastedShape(op, rhs, lhs, rewriter); - } else if (lhs_rank < rhs_rank) { - lhs_broadcast_dims = op.broadcast_dimensions().getValue(); - rhs_broadcast_dims = higher_rank_broadcast_dims; - shape_elements = ComputeBroadcastedShape(op, lhs, rhs, rewriter); - } else { - // This shouldn't happen for legal ops. If the broadcast_dimensions - // attribute is set, the ranks should be different. - // TODO(scotttodd): Add a custom verification for ops and assert here. - return false; - } - - // DynamicBroadcastInDimOp preserves the element type but produces a tensor - // with unranked shape. The rank of the output is the length of the - // output shape argument. - SmallVector op_shape(shape_elements.size(), - RankedTensorType::kDynamicSize); - auto lhs_type = - RankedTensorType::get(op_shape, lhs_ranked_type.getElementType()); - auto rhs_type = - RankedTensorType::get(op_shape, rhs_ranked_type.getElementType()); - - // We need a way to turn a list of scalars into a vector. While Standard - // dialect does not have one, use the XLA_HLO variant. - int shape_size = shape_elements.size(); - Type shape_element_type = shape_elements.front().getType(); - Value shape_value = rewriter->create( - op.getLoc(), RankedTensorType::get({shape_size}, shape_element_type), - shape_elements); - - *out_lhs = rewriter->createOrFold( - op.getLoc(), lhs_type, lhs, shape_value, lhs_broadcast_dims); - *out_rhs = rewriter->createOrFold( - op.getLoc(), rhs_type, rhs, shape_value, rhs_broadcast_dims); - return true; -} - -template -struct BinaryOpWithBroadcastConvert : public OpRewritePattern { - explicit BinaryOpWithBroadcastConvert(MLIRContext *context) - : OpRewritePattern(context) {} - - LogicalResult matchAndRewrite(SrcOp op, + LogicalResult matchAndRewrite(ClampOp op, PatternRewriter &rewriter) const override { - Value new_lhs; - Value new_rhs; + auto operand_type = op.operand().getType().dyn_cast(); + auto max_type = op.max().getType().dyn_cast(); + auto min_type = op.min().getType().dyn_cast(); + // Unrancked types are not supported. + if (!operand_type || !max_type || !min_type) return failure(); + // Does not support operand with dynamic dimensions for now. + if (!operand_type.hasStaticShape()) return failure(); - auto op_ranked_type = op.getType().template dyn_cast(); - if (!op_ranked_type) return failure(); + ArrayRef operand_shape = operand_type.getShape(); - if (op_ranked_type.hasStaticShape()) { - if (!CreateBroadcastsForBinaryOp(op, &rewriter, &new_lhs, &new_rhs)) { - return failure(); - } - } else { - if (!CreateDynamicBroadcastsForBinaryOp(op, &rewriter, &new_lhs, - &new_rhs)) { - return failure(); - } + Value max_value = op.max(); + if (max_type != operand_type) { + assert(max_type.getRank() == 0); + max_value = rewriter.createOrFold( + op.getLoc(), operand_type, max_value, + rewriter.getI64TensorAttr(operand_shape)); } - // Replace the original op with a new one that uses the new args. - // New args are broadcasts, so no dims are needed on the replacement op. 
- rewriter.replaceOpWithNewOp(op, op.getType(), new_lhs, new_rhs, - /*broadcast_dims=*/nullptr); - return success(); - } -}; - -// Specialized class for CompareOp, as it has an additional builder argument. -struct CompareWithBroadcastConvert : public OpRewritePattern { - explicit CompareWithBroadcastConvert(MLIRContext *context) - : OpRewritePattern(context) {} - - LogicalResult matchAndRewrite(CompareOp op, - PatternRewriter &rewriter) const override { - Value new_lhs; - Value new_rhs; - if (!CreateBroadcastsForBinaryOp(op, &rewriter, &new_lhs, &new_rhs)) { - return failure(); + Value min_value = op.min(); + if (min_type != operand_type) { + assert(min_type.getRank() == 0); + min_value = rewriter.createOrFold( + op.getLoc(), operand_type, min_value, + rewriter.getI64TensorAttr(operand_shape)); } - rewriter.replaceOpWithNewOp(op, op.getType(), new_lhs, new_rhs, - /*broadcast_dims=*/nullptr, - op.comparison_direction()); + rewriter.replaceOpWithNewOp(op, op.getType(), min_value, + op.operand(), max_value); return success(); } }; @@ -311,58 +73,18 @@ struct CompareWithBroadcastConvert : public OpRewritePattern { void SetupMaterializeBroadcastsLegality(MLIRContext *context, ConversionTarget *conversionTarget) { -#define ADD_DYNAMICALLY_LEGAL_OP_WITH_BROADCAST(OpType) \ - conversionTarget->addDynamicallyLegalOp( \ - [](OpType op) { return !op.broadcast_dimensions().hasValue(); }); - // Binary elementwise ops. - ADD_DYNAMICALLY_LEGAL_OP_WITH_BROADCAST(AddOp); - ADD_DYNAMICALLY_LEGAL_OP_WITH_BROADCAST(Atan2Op); - ADD_DYNAMICALLY_LEGAL_OP_WITH_BROADCAST(DivOp); - ADD_DYNAMICALLY_LEGAL_OP_WITH_BROADCAST(MaxOp); - ADD_DYNAMICALLY_LEGAL_OP_WITH_BROADCAST(MinOp); - ADD_DYNAMICALLY_LEGAL_OP_WITH_BROADCAST(MulOp); - ADD_DYNAMICALLY_LEGAL_OP_WITH_BROADCAST(PowOp); - ADD_DYNAMICALLY_LEGAL_OP_WITH_BROADCAST(RemOp); - ADD_DYNAMICALLY_LEGAL_OP_WITH_BROADCAST(ShiftLeftOp); - ADD_DYNAMICALLY_LEGAL_OP_WITH_BROADCAST(ShiftRightArithmeticOp); - ADD_DYNAMICALLY_LEGAL_OP_WITH_BROADCAST(ShiftRightLogicalOp); - ADD_DYNAMICALLY_LEGAL_OP_WITH_BROADCAST(SubOp); - - // Binary logical elementwise ops. - ADD_DYNAMICALLY_LEGAL_OP_WITH_BROADCAST(AndOp); - ADD_DYNAMICALLY_LEGAL_OP_WITH_BROADCAST(OrOp); - ADD_DYNAMICALLY_LEGAL_OP_WITH_BROADCAST(XorOp); - - // CompareOp. - ADD_DYNAMICALLY_LEGAL_OP_WITH_BROADCAST(CompareOp); - -#undef ADD_DYNAMICALLY_LEGAL_OP_WITH_BROADCAST + conversionTarget->addDynamicallyLegalOp([](ClampOp op) { + return op.max().getType() == op.operand().getType() && + op.min().getType() == op.operand().getType(); + }); } void PopulateMaterializeBroadcastsPatterns(MLIRContext *context, OwningRewritePatternList *patterns) { - // Binary elementwise ops. - patterns->insert>(context); - patterns->insert>(context); - patterns->insert>(context); - patterns->insert>(context); - patterns->insert>(context); - patterns->insert>(context); - patterns->insert>(context); - patterns->insert>(context); - patterns->insert>(context); - patterns->insert>( - context); - patterns->insert>(context); - patterns->insert>(context); - - // Binary logical elementwise ops. - patterns->insert>(context); - patterns->insert>(context); - patterns->insert>(context); - - // CompareOp. Note the specialized class instead of using the template. - patterns->insert(context); + // ClampOp. This op has a special case where it accepts either same-shaped + // inputs or scalars (a restricted form of broadcasting). This makes the + // broadcast explicit. 
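As an illustration of the semantics the ClampWithBroadcastConvert pattern makes explicit: min and/or max may be rank-0 scalars that are conceptually broadcast to the operand's shape before the elementwise clamp. In this sketch a one-element vector stands in for a scalar; layout and names are assumptions:

#include <algorithm>
#include <vector>

// Elementwise clamp with restricted (scalar) broadcasting of min/max.
std::vector<float> Clamp(const std::vector<float>& min,
                         const std::vector<float>& operand,
                         const std::vector<float>& max) {
  std::vector<float> result(operand.size());
  for (size_t i = 0; i < operand.size(); ++i) {
    // Broadcast: a size-1 min/max is reused for every element.
    float lo = min.size() == 1 ? min[0] : min[i];
    float hi = max.size() == 1 ? max[0] : max[i];
    result[i] = std::min(std::max(operand[i], lo), hi);
  }
  return result;
}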
+ patterns->insert(context); } } // namespace xla_hlo diff --git a/tensorflow/compiler/mlir/xla/transforms/passes.h b/tensorflow/compiler/mlir/xla/transforms/passes.h index 2d0164981a3..a1dd6c5ce1e 100644 --- a/tensorflow/compiler/mlir/xla/transforms/passes.h +++ b/tensorflow/compiler/mlir/xla/transforms/passes.h @@ -36,7 +36,7 @@ namespace xla_hlo { /// Lowers from TF dialect to HLO dialect. When allow_partial_conversion is /// false, emits an error if there is any operation that can't be legalized. std::unique_ptr> createLegalizeTFPass( - bool allow_partial_conversion = false); + bool allow_partial_conversion = false, bool legalize_chlo = true); /// Lowers from TF dialect to HLO dialect using tf2xla op kernels for the /// specified device type. @@ -50,7 +50,8 @@ std::unique_ptr> createLegalizeTFControlFlowPass(); /// dialect using the conversion patterns registered by the HLO dialect. When /// allow_partial_conversion is false, emits an error if there is any operation /// that can't be legalized. -LogicalResult legalizeTF(Operation* op, bool allow_partial_conversion = false); +LogicalResult legalizeTF(Operation* op, bool allow_partial_conversion = false, + bool legalize_chlo = true); /// Lowers HLO control flow ops to the Standard dialect. std::unique_ptr> createLegalizeControlFlowPass(); @@ -65,6 +66,10 @@ std::unique_ptr> createLegalizeToLhloPass(); // Lowers from HLO dialect to Linalg dialect. std::unique_ptr> createLegalizeHloToLinalgPass(); +// Sinks constants implicitly captured in control flow regions. This is +// necessary to export to XLA. +std::unique_ptr> createSinkConstantsToControlFlowPass(); + } // namespace xla_hlo namespace xla_lhlo { @@ -81,8 +86,8 @@ std::unique_ptr> createLegalizeToGpuPass(); // Fuses linalg ops obtained after LHLO lowering. To enable fusion, // operations are first tiled. // -// When 'use_parallel_loops' is set, the tiling will use loop.parallel -// operations. Otherwise, loop.for operations are used. +// When 'use_parallel_loops' is set, the tiling will use scf.parallel +// operations. Otherwise, scf.for operations are used. // // 'tile_sizes' provides the tile sizes to use for tiling. If the linalg // operation has more dimensions than tile sizes provided, 1 is used as diff --git a/tensorflow/compiler/mlir/xla/transforms/rewriters.h b/tensorflow/compiler/mlir/xla/transforms/rewriters.h index 7656c89facb..9cde6f84474 100644 --- a/tensorflow/compiler/mlir/xla/transforms/rewriters.h +++ b/tensorflow/compiler/mlir/xla/transforms/rewriters.h @@ -23,6 +23,7 @@ limitations under the License. #include "mlir/Transforms/DialectConversion.h" // from @llvm-project namespace mlir { +class BufferAssignmentPlacer; namespace xla_hlo { // Collection of rewrite patterns for lowering a general dot product. @@ -38,9 +39,9 @@ void PopulateXlaToStdPatterns(OwningRewritePatternList *patterns, MLIRContext *ctx); // Collection of rewrite patterns for lowering of HLO to LHLO dialect. -void populateHLOToLHLOConversionPattern(MLIRContext *context, - OwningRewritePatternList *patterns); - +void populateHLOToLHLOConversionPattern( + MLIRContext *context, BufferAssignmentPlacer *bufferAssignment, + TypeConverter *converter, OwningRewritePatternList *patterns); // Collection of rewrite patterns for lowering of HLO to Linalg dialect. 
void populateHLOToLinalgConversionPattern(MLIRContext *context, OwningRewritePatternList *patterns); @@ -61,6 +62,16 @@ void PopulateUnfuseBatchNormPatterns(MLIRContext *context, OwningRewritePatternList *patterns); } // namespace xla_hlo + +namespace xla_chlo { + +// Populates a collection of conversion patterns for legalizing client-HLO to +// HLO. +void PopulateLegalizeChloToHloPatterns(MLIRContext *context, + OwningRewritePatternList *patterns); + +} // namespace xla_chlo + } // namespace mlir #endif // TENSORFLOW_COMPILER_MLIR_XLA_TRANSFORMS_REWRITERS_H_ diff --git a/tensorflow/compiler/mlir/xla/transforms/sink_constants_to_control_flow.cc b/tensorflow/compiler/mlir/xla/transforms/sink_constants_to_control_flow.cc new file mode 100644 index 00000000000..5a45e0f3b18 --- /dev/null +++ b/tensorflow/compiler/mlir/xla/transforms/sink_constants_to_control_flow.cc @@ -0,0 +1,85 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "llvm/ADT/DenseMap.h" +#include "llvm/Support/Casting.h" +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Transforms/RegionUtils.h" // from @llvm-project +#include "tensorflow/compiler/mlir/xla/ir/hlo_ops.h" + +namespace mlir { +namespace xla_hlo { + +namespace { + +// A pass that sinks constants implicitly captured in control flow regions. This +// is necessary to export to XLA. +class SinkConstantsToControlFlow + : public mlir::PassWrapper { + void runOnFunction() override { + getFunction().walk([](Operation* op) { + if (auto while_op = llvm::dyn_cast(op)) { + SinkToRegion(&while_op.body()); + SinkToRegion(&while_op.cond()); + } else if (auto if_op = llvm::dyn_cast(op)) { + SinkToRegion(&if_op.true_branch()); + SinkToRegion(&if_op.false_branch()); + } + }); + } + + private: + // Performs constant sinking into a region. + static void SinkToRegion(Region* region) { + llvm::DenseMap sunk_constant; + visitUsedValuesDefinedAbove({*region}, [&](OpOperand* use) { + Value constant = use->get(); + auto const_op = dyn_cast_or_null(constant.getDefiningOp()); + if (!const_op) return; + auto map_entry = sunk_constant.try_emplace(constant, nullptr); + if (!map_entry.second) { + // This constant has already been cloned into the region, reuse it. 
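+        // Rewire this use to the existing clone, and erase the original
+        // constant if that leaves it without any remaining uses.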
+ use->set(map_entry.first->getSecond().getResult()); + if (constant.use_empty()) const_op.erase(); + return; + } + if (constant.hasOneUse()) { + const_op.getOperation()->moveBefore(®ion->front().front()); + return; + } + map_entry.first->getSecond() = const_op.clone(); + region->front().getOperations().insert(region->front().begin(), + map_entry.first->getSecond()); + use->set(map_entry.first->getSecond().getResult()); + }); + } +}; + +static mlir::PassRegistration pass( + "xla-hlo-sink-constants-to-control-flow", + "Sink constants implicitly captured in control flow regions. This is " + "necessary to export to XLA."); + +} // anonymous namespace + +std::unique_ptr> createSinkConstantsToControlFlowPass() { + return std::make_unique(); +} + +} // namespace xla_hlo +} // namespace mlir diff --git a/tensorflow/compiler/mlir/xla/transforms/test_infer_shaped_type_pass.cc b/tensorflow/compiler/mlir/xla/transforms/test_infer_shaped_type_pass.cc new file mode 100644 index 00000000000..71441656c08 --- /dev/null +++ b/tensorflow/compiler/mlir/xla/transforms/test_infer_shaped_type_pass.cc @@ -0,0 +1,100 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Identifier.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OperationSupport.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/Interfaces/InferTypeOpInterface.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project + +namespace mlir { +namespace xla { +namespace { + +struct InferReturnTypeComponentsPattern : public RewritePattern { + InferReturnTypeComponentsPattern(MLIRContext *context) + : RewritePattern("xla_test.get_return_type_components", 1, context) {} + LogicalResult matchAndRewrite(Operation *op, + PatternRewriter &rewriter) const override { + if (op->getNumOperands() != 1) return failure(); + auto defining_op = op->getOperand(0).getDefiningOp(); + auto defining_op_int = + llvm::dyn_cast_or_null(defining_op); + if (!defining_op_int) return failure(); + SmallVector components; + if (failed(defining_op_int.inferReturnTypeComponents( + op->getContext(), op->getLoc(), defining_op->getOperands(), + defining_op->getAttrDictionary(), defining_op->getRegions(), + components))) { + return failure(); + } + + // Replace the op with another pass-through op with attributes added. 
+ OperationState state(op->getLoc(), "xla_test.return_type_components", + op->getOperands(), op->getResultTypes(), + op->getAttrs()); + auto new_op = rewriter.createOperation(state); + for (auto it : llvm::enumerate(components)) { + if (it.value().hasRank()) { + new_op->setAttr((StringRef("dims") + Twine(it.index())).str(), + rewriter.getI64ArrayAttr(it.value().getDims())); + } + if (it.value().getElementType()) { + new_op->setAttr((Twine("element_type") + Twine(it.index())).str(), + TypeAttr::get(it.value().getElementType())); + } + } + rewriter.replaceOp(op, {new_op->getResults()}); + return success(); + } +}; + +struct ReifyReturnTypeShapesPattern : public RewritePattern { + ReifyReturnTypeShapesPattern(MLIRContext *context) + : RewritePattern("xla_test.reify_return_type_shapes", 1, context) {} + LogicalResult matchAndRewrite(Operation *op, + PatternRewriter &rewriter) const override { + if (op->getNumOperands() != 1) return failure(); + auto defining_op = llvm::dyn_cast_or_null( + op->getOperand(0).getDefiningOp()); + if (!defining_op) return failure(); + SmallVector return_shapes; + if (failed(defining_op.reifyReturnTypeShapes(rewriter, return_shapes))) { + return failure(); + } + rewriter.replaceOp(op, return_shapes); + return success(); + } +}; + +struct TestInferShapedTypeMethodsPass + : public PassWrapper { + void runOnFunction() override { + OwningRewritePatternList patterns; + patterns.insert(&getContext()); + patterns.insert(&getContext()); + applyPatternsAndFoldGreedily(getFunction(), patterns); + } +}; + +} // namespace +} // namespace xla +} // namespace mlir + +static mlir::PassRegistration pass( + "test-xla-infer-shaped-type-methods", + "Uses test ops to invoke InferShapedTypeOpInterface methods"); diff --git a/tensorflow/compiler/mlir/xla/transforms/unfuse_batch_norm.cc b/tensorflow/compiler/mlir/xla/transforms/unfuse_batch_norm.cc index 32d8b079c89..98eb404e4d4 100644 --- a/tensorflow/compiler/mlir/xla/transforms/unfuse_batch_norm.cc +++ b/tensorflow/compiler/mlir/xla/transforms/unfuse_batch_norm.cc @@ -58,9 +58,7 @@ Value CalculateShapeValue(Location loc, Value operand, int64_t rank = result_type.getRank(); shape_values.reserve(rank); for (int64_t i = 0; i < rank; ++i) { - auto index_value = rewriter.create(loc, operand, i); - shape_values.push_back(rewriter.create( - loc, index_value, rewriter.getIntegerType(32))); + shape_values.push_back(rewriter.create(loc, operand, i)); } Type shape_element_type = shape_values.front().getType(); return rewriter.create( @@ -137,8 +135,8 @@ class UnfuseBatchNormInferencePattern if (!epsilon) { return failure(); } - Value stddev = rewriter.create( - bn_op.getLoc(), bn_op.variance(), epsilon, /*broadcast_dims=*/nullptr); + Value stddev = rewriter.create(bn_op.getLoc(), + bn_op.variance(), epsilon); stddev = rewriter.create(bn_op.getLoc(), stddev); // Broadcast all terms. 
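The arithmetic this pattern expands to is the standard inference-time normalization, scale * (x - mean) / sqrt(variance + epsilon) + offset, applied per feature. A standalone numeric sketch (plain C++ over a flat buffer whose innermost dimension is assumed to be the feature dimension; `BatchNormInference` is a hypothetical helper):

#include <cmath>
#include <cstdio>
#include <vector>

// y[i] = scale[c] * (x[i] - mean[c]) / sqrt(variance[c] + epsilon) + offset[c],
// where c is the feature index of element i.
std::vector<float> BatchNormInference(const std::vector<float>& x,
                                      const std::vector<float>& scale,
                                      const std::vector<float>& offset,
                                      const std::vector<float>& mean,
                                      const std::vector<float>& variance,
                                      float epsilon, int num_features) {
  std::vector<float> y(x.size());
  for (size_t i = 0; i < x.size(); ++i) {
    int c = static_cast<int>(i % num_features);  // feature dim assumed innermost
    float stddev = std::sqrt(variance[c] + epsilon);
    y[i] = scale[c] * (x[i] - mean[c]) / stddev + offset[c];
  }
  return y;
}

int main() {
  // One row of two features, unit scale, zero offset.
  auto y = BatchNormInference({4.f, 9.f}, {1.f, 1.f}, {0.f, 0.f},
                              {2.f, 5.f}, {1.f, 4.f}, /*epsilon=*/0.f,
                              /*num_features=*/2);
  std::printf("%g %g\n", y[0], y[1]);  // (4-2)/1 = 2 and (9-5)/2 = 2
  return 0;
}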
@@ -162,13 +160,13 @@ class UnfuseBatchNormInferencePattern // Compute: // scale * (input - mean) / stddev + offset Value result = rewriter.create( - bn_op.getLoc(), bn_op.operand(), broadcast_mean, nullptr); + bn_op.getLoc(), bn_op.operand(), broadcast_mean); result = rewriter.create(bn_op.getLoc(), result, - broadcast_scale, nullptr); + broadcast_scale); result = rewriter.create(bn_op.getLoc(), result, - broadcast_stddev, nullptr); - rewriter.replaceOpWithNewOp(bn_op, result, broadcast_offset, - nullptr); + broadcast_stddev); + rewriter.replaceOpWithNewOp(bn_op, result, + broadcast_offset); return success(); } diff --git a/tensorflow/compiler/mlir/xla/transforms/xla_hlo_to_lhlo_with_xla.cc b/tensorflow/compiler/mlir/xla/transforms/xla_hlo_to_lhlo_with_xla.cc new file mode 100644 index 00000000000..a12bd9e7c1a --- /dev/null +++ b/tensorflow/compiler/mlir/xla/transforms/xla_hlo_to_lhlo_with_xla.cc @@ -0,0 +1,458 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/xla/transforms/xla_hlo_to_lhlo_with_xla.h" + +#include +#include + +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/IR/AffineExpr.h" // from @llvm-project +#include "mlir/IR/AffineMap.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Module.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "mlir/IR/SymbolTable.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassOptions.h" // from @llvm-project +#include "tensorflow/compiler/mlir/xla/hlo_utils.h" +#include "tensorflow/compiler/mlir/xla/ir/lhlo_ops.h" +#include "tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.h" +#include "tensorflow/compiler/xla/service/buffer_assignment.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/util.h" + +using xla::BufferAllocation; +using xla::BufferAssignment; +using xla::HloComputation; +using xla::HloInstruction; +using xla::HloModule; +using xla::HloModuleProto; +using xla::HloProto; +using xla::Shape; +using xla::StatusOr; + +namespace mlir { +namespace { + +absl::string_view StringRefToView(llvm::StringRef ref) { + return {ref.data(), ref.size()}; +} + +StatusOr> HloModuleFromProto( + const HloProto& hlo_proto) { + const HloModuleProto& module_proto = hlo_proto.hlo_module(); + TF_ASSIGN_OR_RETURN(const ::xla::HloModuleConfig 
module_config, + HloModule::CreateModuleConfigFromProto( + module_proto, ::xla::GetDebugOptionsFromFlags())); + return HloModule::CreateFromProto(module_proto, module_config); +} + +// This class will process an HloModule with the supplied BufferAssignment and +// populate the MLIR ModuleOp with the computation converted in the LHLO +// dialect. +class LhloDialectEmitter : public ::xla::DfsHloVisitorWithDefault { + public: + // Main entry point of the processing: after this call the MLIR ModuleOp is + // populated with the computation from the HloModule. The returned `Status` + // indicates success or failure in the conversion. + Status Run(); + + LhloDialectEmitter(const BufferAssignment& assignment, + const HloModule& hlo_module, ModuleOp module) + : assignment_(std::move(assignment)), + hlo_module_(hlo_module), + module_(module), + builder_(module.getContext()), + i8_type_(builder_.getIntegerType(8)) {} + + private: + Status DefaultAction(HloInstruction* instr) final; + + // Computation parameters don't need any specific handling when they are + // visited, they are already processed when we enter a new computation. + Status HandleParameter(HloInstruction* instr) final { return Status::OK(); } + + // Helper function to create view in a buffer for a given slice. The view is + // cached in the `slices_` map. + Value GetOrCreateView(const BufferAllocation::Slice& slice); + + // Helper function to create view in a buffer for a given instruction result. + StatusOr GetOrCreateView(const HloInstruction* instr); + + // Return an MLIR location for an HLO instruction. + Location getLocation(HloInstruction* inst) { + return NameLoc::get(builder_.getIdentifier(inst->name()), + builder_.getContext()); + } + + // This map provides access to MLIR buffers for each HLO buffer allocation. + // The MLIR buffers are all `memref<{size}xi8>` and correspond to function + // parameters. It is populated at the beginning of the processing with all the + // buffer allocations and is unchanged afterward. Every HLOInstruction is + // using a "slice" of the buffer allocation and providing shape, layout, and + // Dtype. An MLIR view is used separately to model slices into the allocations + // (see below). + llvm::DenseMap allocations_; + + // This map provides access to MLIR buffers for each HLO buffer slice. A slice + // is contained in a BufferAllocation, and has an offset and a size. + // The MLIR buffers are all `memref<{size}xi8>`. If the slice is the entire + // BufferAllocation then the MLIR buffer corresponds to function + // parameter for the allocation, otherwise it will map to a ViewOp in the + // allocation. It is populated lazily in the `GetOrCreateView()` helper as we + // process every instruction. + using SliceKey = std::tuple; + llvm::DenseMap slices_; + + // The BufferAssignment computed by XLA ahead of time. + const BufferAssignment& assignment_; + + // The HLO module that will be converted. + const HloModule& hlo_module_; + + // This is the MLIR module in which a function will be created for every HLO + // computation. + ModuleOp module_; + + // The builder keeps track of the current insertion point in the MLIR module. + OpBuilder builder_; + // Convenient "cached" access to this widely used MLIR type (i8). 
+ Type i8_type_; +}; + +Status LhloDialectEmitter::DefaultAction(HloInstruction* instr) { + llvm::SmallVector operands(instr->operand_count() + 1); + for (int arg_idx = 0; arg_idx < instr->operand_count(); ++arg_idx) { + TF_ASSIGN_OR_RETURN(operands[arg_idx], + GetOrCreateView(instr->operand(arg_idx))); + } + + TF_ASSIGN_OR_RETURN(operands.back(), GetOrCreateView(instr)); + Location loc = getLocation(instr); + ArrayRef> attrs; + ArrayRef rets{}; + + using ::xla::HloOpcode; + switch (instr->opcode()) { + case HloOpcode::kAbs: + builder_.create(loc, rets, operands, attrs); + return Status::OK(); + case HloOpcode::kAdd: + builder_.create(loc, rets, operands, attrs); + return Status::OK(); + case HloOpcode::kAnd: + builder_.create(loc, rets, operands, attrs); + return Status::OK(); + case HloOpcode::kCeil: + builder_.create(loc, rets, operands, attrs); + return Status::OK(); + case HloOpcode::kComplex: + builder_.create(loc, rets, operands, attrs); + return Status::OK(); + case HloOpcode::kCopy: + builder_.create(loc, rets, operands, attrs); + return Status::OK(); + case HloOpcode::kCos: + builder_.create(loc, rets, operands, attrs); + return Status::OK(); + case HloOpcode::kDivide: + builder_.create(loc, rets, operands, attrs); + return Status::OK(); + case HloOpcode::kExp: + builder_.create(loc, rets, operands, attrs); + return Status::OK(); + case HloOpcode::kImag: + builder_.create(loc, rets, operands, attrs); + return Status::OK(); + case HloOpcode::kLog: + builder_.create(loc, rets, operands, attrs); + return Status::OK(); + case HloOpcode::kMaximum: + builder_.create(loc, rets, operands, attrs); + return Status::OK(); + case HloOpcode::kMinimum: + builder_.create(loc, rets, operands, attrs); + return Status::OK(); + case HloOpcode::kMultiply: + builder_.create(loc, rets, operands, attrs); + return Status::OK(); + case HloOpcode::kNegate: + builder_.create(loc, rets, operands, attrs); + return Status::OK(); + case HloOpcode::kReal: + builder_.create(loc, rets, operands, attrs); + return Status::OK(); + case HloOpcode::kRemainder: + builder_.create(loc, rets, operands, attrs); + return Status::OK(); + case HloOpcode::kRsqrt: + builder_.create(loc, rets, operands, attrs); + return Status::OK(); + case HloOpcode::kSelect: + builder_.create(loc, rets, operands, attrs); + return Status::OK(); + case HloOpcode::kSign: + builder_.create(loc, rets, operands, attrs); + return Status::OK(); + case HloOpcode::kSqrt: + builder_.create(loc, rets, operands, attrs); + return Status::OK(); + case HloOpcode::kSubtract: + builder_.create(loc, rets, operands, attrs); + return Status::OK(); + case HloOpcode::kTanh: + builder_.create(loc, rets, operands, attrs); + return Status::OK(); + default: + llvm::errs() << instr->ToString(); + return tensorflow::errors::Internal( + absl::StrCat("LHLO opcode ", ::xla::HloOpcodeString(instr->opcode()), + " is not supported.")); + } + return Status::OK(); +} + +Value LhloDialectEmitter::GetOrCreateView( + const BufferAllocation::Slice& slice) { + // Check if we already have a view for this slice, otherwise we need to create + // a new one. + SliceKey slice_key(slice.allocation(), slice.offset(), slice.size()); + auto slice_view_it = slices_.find(slice_key); + if (slice_view_it != slices_.end()) return slice_view_it->second; + + // Check if we can just use the entire allocation before creating a view. 
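+  // A slice that starts at offset 0 and covers the allocation's full size is
+  // the allocation itself, so the corresponding function argument is reused
+  // directly instead of wrapping it in a view.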
+ Value alloc_buffer = allocations_[slice.allocation()]; + if (slice.offset() == 0 && slice.size() == slice.allocation()->size()) { + slices_.insert({slice_key, alloc_buffer}); + return alloc_buffer; + } + + // Create the view for this slice size, possible with an affine map to model + // the offset. The result is cached in the slices_ map. + // The std.view result type does not carry the static offset: this is not + // useful information. Rather, the view op must have the static offset. + auto slice_type = MemRefType::get({slice.size()}, i8_type_, {}); + + Value byte_shift = + builder_.create(alloc_buffer.getLoc(), slice.offset()); + auto slice_view = + builder_.create(alloc_buffer.getLoc(), slice_type, alloc_buffer, + byte_shift, /*sizes=*/ArrayRef{}); + slices_.insert({slice_key, slice_view}); + return slice_view; +} + +// Returns a view for the result of an instruction. +// We first get a view for the slice in the allocation, and then may need to +// create another view to adjust the slice for the shape of the instruction. +StatusOr LhloDialectEmitter::GetOrCreateView( + const HloInstruction* instr) { + const Shape& target_shape = instr->shape(); + TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice out_slice, + assignment_.GetUniqueTopLevelSlice(instr)); + Value slice_view = GetOrCreateView(out_slice); + TF_ASSIGN_OR_RETURN(Type out_type, ::xla::ConvertShapeToType( + target_shape, builder_)); + Value byte_shift = + builder_.create(builder_.getUnknownLoc(), 0); + if (slice_view.getType() != out_type) + slice_view = + builder_.create(builder_.getUnknownLoc(), out_type, slice_view, + byte_shift, /*sizes=*/ArrayRef{}); + return slice_view; +} + +Status LhloDialectEmitter::Run() { + HloComputation* computation = hlo_module_.entry_computation(); + std::string function_name = + computation->name().empty() ? "__compute" : computation->name(); + + // Create the function as () -> (), we'll compute the arguments from the + // buffer allocation and update the type then. + auto func_op = FuncOp::create(builder_.getUnknownLoc(), function_name, + builder_.getFunctionType({}, {})); + Block* block = func_op.addEntryBlock(); + + // The function signature will be composed of: + // - one memref for each of the parameters. + // - one memref for each other buffer allocation. + llvm::SmallVector args_attrs; + for (const HloInstruction* param : computation->parameter_instructions()) { + TF_ASSIGN_OR_RETURN(auto arg_type, ::xla::ConvertShapeToType( + param->shape(), builder_)); + // First map parameters to memrefs on the operation. 
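+    // For example, a computation with one f32[2,2] parameter and a single
+    // 16-byte temporary allocation yields an entry function roughly like
+    //   func @main(%arg0: memref<2x2xf32> {xla_lhlo.params = 0 : index},
+    //              %arg1: memref<16xi8> {xla_lhlo.alloc = 0 : index})
+    // (names, sizes and attribute indices depend on the actual BufferAssignment).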
+ block->addArgument(arg_type); + TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice slice, + assignment_.GetUniqueTopLevelSlice(param)); + allocations_[slice.allocation()] = block->getArguments().back(); + args_attrs.emplace_back(); + args_attrs.back().set(builder_.getIdentifier("xla_lhlo.params"), + builder_.getIndexAttr(param->parameter_number())); + } + + for (const BufferAllocation& alloc : assignment_.Allocations()) { + if (alloc.is_entry_computation_parameter()) continue; + block->addArgument(MemRefType::get({alloc.size()}, i8_type_)); + allocations_[&alloc] = block->getArguments().back(); + args_attrs.emplace_back(); + args_attrs.back().set(builder_.getIdentifier("xla_lhlo.alloc"), + builder_.getIndexAttr(alloc.index())); + if (alloc.maybe_live_out()) + args_attrs.back().set(builder_.getIdentifier("xla_lhlo.liveout"), + builder_.getBoolAttr(true)); + } + + FunctionType function_type = builder_.getFunctionType( + llvm::to_vector<8>(block->getArgumentTypes()), {}); + func_op.setType(function_type); + func_op.setAllArgAttrs(args_attrs); + + SymbolTable symbol_table(module_); + symbol_table.insert(func_op); + builder_.setInsertionPointToEnd(block); + + const ::xla::HloInstructionSequence* schedule = + assignment_.hlo_ordering().SequentialOrder(*computation); + if (!schedule) + return ::xla::Unimplemented("Missing sequential order for the computation"); + + const std::vector& ordering = schedule->instructions(); + TF_RETURN_IF_ERROR(computation->AcceptOrdered(this, ordering)); + builder_.create(builder_.getUnknownLoc()); + return Status::OK(); +} + +// Convert the MLIR `module` from HLO dialect to LHLO dialect using XLA for the +// given platform. +Status ConvertModule(ModuleOp module, StringRef platform_name) { + SymbolTable symbol_table(module); + if (!symbol_table.lookup("main")) { + return ::xla::InvalidArgument( + "conversion to HLO module failed: missing main()"); + } + HloProto hlo_proto; + TF_RETURN_WITH_CONTEXT_IF_ERROR( + ConvertMlirHloToHlo(module, &hlo_proto, + /*use_tuple_args=*/false, + /*return_tuple=*/false, + /*shape_representation_fn=*/nullptr), + "conversion to XLA HLO proto failed"); + + auto statusOrHloModule = HloModuleFromProto(hlo_proto); + TF_RETURN_WITH_CONTEXT_IF_ERROR(statusOrHloModule.status(), + "parsing HLO proto to HLO module failed"); + std::unique_ptr hlo_module = + std::move(statusOrHloModule.ValueOrDie()); + + auto platform = ::xla::se::MultiPlatformManager::PlatformWithName( + StringRefToView(platform_name)); + if (!platform.ok()) { + std::string error_msg; + llvm::raw_string_ostream os(error_msg); + os << "failed to get platform: " << platform.status().ToString() + << " (available Platform: "; + std::vector available_platforms; + (void)::xla::se::MultiPlatformManager::PlatformsWithFilter( + [&](const stream_executor::Platform* p) { + available_platforms.push_back(p->Name()); + return false; + }); + llvm::interleaveComma(available_platforms, os); + os << ")"; + return ::xla::InvalidArgument("%s", os.str().c_str()); + } + + ::xla::BackendOptions backend_options; + backend_options.set_platform(platform.ValueOrDie()); + auto backend_or_err = ::xla::Backend::CreateBackend(backend_options); + TF_RETURN_WITH_CONTEXT_IF_ERROR(backend_or_err.status(), + "failed to create XLA Backend "); + auto backend = std::move(backend_or_err.ValueOrDie()); + + // Run all HLO passes to produce an optimized module. 
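+  // The result carries both the optimized HloModule and the BufferAssignment
+  // that the LHLO emitter below consumes.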
+ auto result_or = backend->compiler()->RunHloPassesAndBufferAssignement( + std::move(hlo_module), backend->default_stream_executor(), + backend->memory_allocator()); + TF_RETURN_WITH_CONTEXT_IF_ERROR(result_or.status(), + "running XLA pass pipeline"); + std::unique_ptr optimized_hlo_module = + std::move(std::get<0>(result_or.ValueOrDie())); + std::unique_ptr assignment = + std::move(std::get<1>(result_or.ValueOrDie())); + + // Clear the module before populating it back with the result of the + // conversion. + module.getBody()->clear(); + OpBuilder builder(module); + module.ensureTerminator(module.getBodyRegion(), builder, module.getLoc()); + + TF_RETURN_WITH_CONTEXT_IF_ERROR( + HloToLhloModule(*assignment, *optimized_hlo_module, module), + "converting HLO to LHLO"); + + return Status::OK(); +} + +// This pass take a MLIR HLO module, convert it to XLA to perform the HLO +// optimization pipeline for the required platform, and then convert back to +// MLIR LHLO. +class XlaHloToLhloPass + : public PassWrapper> { + public: + XlaHloToLhloPass() = default; + XlaHloToLhloPass(const XlaHloToLhloPass&) {} + + private: + void runOnOperation() final { + ModuleOp module = getOperation(); + Status status = ConvertModule(module, platform_); + if (!status.ok()) { + module.emitError() << status.ToString(); + return signalPassFailure(); + } + } + + Option platform_{ + *this, "platform", + llvm::cl::desc("The platform to use for the XLA optimization pipeline."), + llvm::cl::init("Host")}; +}; + +} // namespace + +std::unique_ptr> createXlaHloToLhloWithXlaPass() { + return std::make_unique(); +} + +Status HloToLhloModule(const BufferAssignment& assignment, + const HloModule& hlo_module, ModuleOp module) { + return LhloDialectEmitter(assignment, hlo_module, module).Run(); +} + +static PassRegistration registration( + "xla-hlo-to-lhlo-with-xla", + "Emit LHLO from HLO using the existing XLA implementation"); + +} // namespace mlir diff --git a/tensorflow/compiler/mlir/xla/transforms/xla_hlo_to_lhlo_with_xla.h b/tensorflow/compiler/mlir/xla/transforms/xla_hlo_to_lhlo_with_xla.h new file mode 100644 index 00000000000..1018bdbf408 --- /dev/null +++ b/tensorflow/compiler/mlir/xla/transforms/xla_hlo_to_lhlo_with_xla.h @@ -0,0 +1,34 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_XLA_TRANSFORMS_XLA_HLO_TO_LHLO_WITH_XLA_H_ +#define TENSORFLOW_COMPILER_MLIR_XLA_TRANSFORMS_XLA_HLO_TO_LHLO_WITH_XLA_H_ + +#include "mlir/IR/Module.h" // from @llvm-project +#include "tensorflow/compiler/xla/service/buffer_assignment.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" + +namespace mlir { + +// Populate the MLIR `module` with the computation from the `hlo_module` using +// the provided buffer `assignment`. The returned `Status` indicates success +// or failure in the conversion. 
+tensorflow::Status HloToLhloModule(const xla::BufferAssignment& assignment, + const xla::HloModule& hlo_module, + ModuleOp module); + +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_XLA_TRANSFORMS_XLA_HLO_TO_LHLO_WITH_XLA_H_ diff --git a/tensorflow/compiler/mlir/xla/transforms/xla_legalize_to_linalg.cc b/tensorflow/compiler/mlir/xla/transforms/xla_legalize_to_linalg.cc index f9c041f2e28..2b496677d62 100644 --- a/tensorflow/compiler/mlir/xla/transforms/xla_legalize_to_linalg.cc +++ b/tensorflow/compiler/mlir/xla/transforms/xla_legalize_to_linalg.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -// This file implements logic for lowering HLO dialect to LHLO dialect. +// This file implements logic for lowering HLO/LHLO dialect to Linalg dialect. #include "absl/memory/memory.h" #include "llvm/ADT/APInt.h" @@ -31,6 +31,7 @@ limitations under the License. #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Transforms/DialectConversion.h" // from @llvm-project +#include "tensorflow/compiler/mlir/xla/ir/hlo_ops.h" #include "tensorflow/compiler/mlir/xla/ir/lhlo_ops.h" #include "tensorflow/compiler/mlir/xla/transforms/map_xla_to_scalar_op.h" #include "tensorflow/compiler/mlir/xla/transforms/rewriters.h" @@ -47,14 +48,14 @@ ArrayAttr GetNParallelLoopsAttrs(unsigned nParallelLoops, Builder* b) { return b->getArrayAttr(iteratorTypes); } +template +Value getResultValue(Operation* op) { + return isLHLO ? op->getOperand(op->getNumOperands() - 1) : op->getResult(0); +} + template ShapedType getXLAOpResultType(Operation* op) { - if (isLHLO) { - return op->getOperand(op->getNumOperands() - 1) - .getType() - .cast(); - } - return op->getResult(0).getType().cast(); + return getResultValue(op).getType().template cast(); } template @@ -83,7 +84,8 @@ class PointwiseToLinalgConverter : public OpConversionPattern { emitError(loc, "lhlo to linalg conversion expects ranked args"); return failure(); } - if (!argType.getElementType().isSignlessIntOrFloat()) { + auto elemTy = argType.getElementType(); + if (!elemTy.isSignlessIntOrFloat() && !elemTy.template isa()) { return failure(); } @@ -134,7 +136,7 @@ class PointwiseToLinalgConverter : public OpConversionPattern { rewriter.getI64IntegerAttr(bodyResultTypes.size()), // args_out rewriter.getArrayAttr(indexingMaps), GetNParallelLoopsAttrs(nloops, &rewriter), - /*doc=*/nullptr, /*fun=*/nullptr, /*library_call=*/nullptr); + /*doc=*/nullptr, /*library_call=*/nullptr); // Add a block to the region. auto* region = &linalgOp.region(); @@ -206,9 +208,7 @@ class DataMovementOpConverter : public OpConversionPattern { if (!verifyXLAOpBufferOrTensorSemantics(op)) return failure(); auto operandType = op.operand().getType().template cast(); auto resultType = getXLAOpResultType(op); - if (!verifyXLAOpBufferOrTensorSemantics(op)) return failure(); - ArrayAttr indexingMapsAttr = - static_cast(*this).getIndexingMapsAttr(op, &rewriter); + ArrayAttr indexingMapsAttr = Derived::getIndexingMapsAttr(op, &rewriter); if (!indexingMapsAttr) return failure(); OpBuilder::InsertionGuard linalgOpGuard(rewriter); @@ -218,7 +218,7 @@ class DataMovementOpConverter : public OpConversionPattern { loc, isLHLO ? 
ArrayRef{} : resultType, args, rewriter.getI64IntegerAttr(1), rewriter.getI64IntegerAttr(1), indexingMapsAttr, GetNParallelLoopsAttrs(nloops, &rewriter), - /*doc=*/nullptr, /*fun=*/nullptr, /*library_call=*/nullptr); + /*doc=*/nullptr, /*library_call=*/nullptr); auto* region = &linalgOp.region(); auto* block = rewriter.createBlock(region, region->end()); @@ -233,6 +233,44 @@ class DataMovementOpConverter : public OpConversionPattern { } }; +/// Pattern to convert BroadcastOp to Linalg ops. +template +class BroadcastConverter + : public DataMovementOpConverter, OpTy, + isLHLO> { + public: + using DataMovementOpConverter::DataMovementOpConverter; + + static ArrayAttr getIndexingMapsAttr(OpTy broadcastOp, Builder* b) { + ShapedType inputType = + broadcastOp.operand().getType().template cast(); + unsigned inputRank = inputType.getRank(); + unsigned nloops = getXLAOpResultType(broadcastOp).getRank(); + + // BroadcastOp prepends the dimensions in the `broadcast_sizes` attribute to + // the input's dimensions. + unsigned numPrependedDims = llvm::size(broadcastOp.broadcast_sizes()); + SmallVector inputDimExprs; + inputDimExprs.reserve(inputRank); + for (int i = 0; i < inputRank; ++i) { + inputDimExprs.push_back(b->getAffineDimExpr(numPrependedDims + i)); + } + + AffineMap inputMap; + MLIRContext* context = b->getContext(); + if (inputDimExprs.empty()) { + // The input is a scalar, i.e. this is a scalar broadcast op. + inputMap = AffineMap::get(nloops, /*symbolCount=*/0, context); + } else { + inputMap = + AffineMap::get(nloops, /*symbolCount=*/0, inputDimExprs, context); + } + return b->getAffineMapArrayAttr( + {inputMap, b->getMultiDimIdentityMap(nloops)}); + } +}; + template class BroadcastInDimConverter : public DataMovementOpConverter, @@ -241,61 +279,37 @@ class BroadcastInDimConverter using DataMovementOpConverter, OpTy, isLHLO>::DataMovementOpConverter; - ArrayAttr getIndexingMapsAttr(OpTy broadcastOp, Builder* b) const { + static ArrayAttr getIndexingMapsAttr(OpTy broadcastOp, Builder* b) { auto resultType = getXLAOpResultType(broadcastOp); auto operandType = broadcastOp.operand().getType().template cast(); unsigned nloops = resultType.getRank(); + // The input is a scalar, i.e. this is a scalar broadcast op. + if (operandType.getRank() == 0) { + return b->getAffineMapArrayAttr( + {AffineMap::get(nloops, /*symbolCount=*/0, b->getContext()), + b->getMultiDimIdentityMap(nloops)}); + } + auto operandShape = operandType.getShape(); SmallVector dimExprs; - AffineMap inputMap = AffineMap::get(b->getContext()); - { - dimExprs.reserve(nloops); + dimExprs.reserve(nloops); - if (broadcastOp.broadcast_dimensions()) { - for (const auto& broadcastDim : - enumerate(broadcastOp.broadcast_dimensions().getIntValues())) { - int size = broadcastDim.value().getSExtValue(); - // TODO(pifon): Add support for args with dynamic shapes for the case - // when a dimension of size 1 is broadcasted into dim of size N. - AffineExpr affineExpr = operandShape[broadcastDim.index()] == 1 - ? b->getAffineConstantExpr(0) - : b->getAffineDimExpr(size); - dimExprs.push_back(affineExpr); - } - } - if (dimExprs.empty()) { - // The input is a scalar, i.e. this is a scalar broadcast op. 
- inputMap = AffineMap::get(nloops, /*symbolCount=*/0, b->getContext()); - } else { - inputMap = AffineMap::get(nloops, /*symbolCount=*/0, dimExprs); + if (broadcastOp.broadcast_dimensions()) { + for (const auto& broadcastDim : + enumerate(broadcastOp.broadcast_dimensions().getIntValues())) { + int size = broadcastDim.value().getSExtValue(); + bool expansion_needed = operandShape[broadcastDim.index()] == 1 && + resultType.getShape()[size] != 1; + // TODO(pifon): Add support for args with dynamic shapes for the case + // when a dimension of size 1 is broadcasted into dim of size N. + dimExprs.push_back(expansion_needed ? b->getAffineConstantExpr(0) + : b->getAffineDimExpr(size)); } } return b->getAffineMapArrayAttr( - {inputMap, b->getMultiDimIdentityMap(nloops)}); - } -}; - -template -class TransposeConverter - : public DataMovementOpConverter, OpTy, - isLHLO> { - public: - using DataMovementOpConverter, OpTy, - isLHLO>::DataMovementOpConverter; - ArrayAttr getIndexingMapsAttr(OpTy op, Builder* b) const { - auto resultType = - getXLAOpResultType(op).template cast(); - auto nloops = resultType.getRank(); - SmallVector inputExprs; - inputExprs.resize(resultType.getRank()); - for (auto permutation : llvm::enumerate(op.permutation())) { - inputExprs[permutation.value().getZExtValue()] = - b->getAffineDimExpr(permutation.index()); - } - return b->getAffineMapArrayAttr( - {AffineMap::get(nloops, /*symbolCount=*/0, inputExprs), + {AffineMap::get(nloops, /*symbolCount=*/0, dimExprs, b->getContext()), b->getMultiDimIdentityMap(nloops)}); } }; @@ -313,15 +327,33 @@ class TransposeConverter /// can have indexing maps /// [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, /// d2)>] + +// TODO(ravishankarm): This pattern needs to be removed. The general reshape +// lowering hits a corner case where the following sequence of operations +// cannot be fused cause the resulting indexing map is not invertible. +// +// %r = linalg.reshape %s [affine_map<(d0, d1, d2) -> (d0, d1)>, +// affine_map<(d0, d1, d2) -> (d2)>] +// : tensor<5x5xf32> into tensor<5x1x5xf32> +// %f = linalg.generic +// {... +// indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, +// affine_map<(d0, d1, d2) -> (d0, d2)>], +// iterator_types = ["parallel", "parallel", "parallel"]} %r {..} +// : tensor<5x1x5xf32> -> tensor<5x5xf32> +// +// The resolution of this requires a canonicalization on linalg ops where the +// dims of size 1 are removed. This pattern can be removed after that. 
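For the general reshape lowering referred to above (the `ReshapeOpConverter` added further below), reassociation maps are built by greedily grouping consecutive dimensions of the higher-rank shape until their product matches the next dimension of the lower-rank shape, failing if the shapes cannot be matched that way. A standalone sketch of that grouping over plain integer shapes (`GroupReassociation` is a hypothetical helper; it is simplified and does not reproduce the exact trailing-unit-dimension handling of the real pattern):

#include <cstdint>
#include <cstdio>
#include <vector>

// Group consecutive dims of `src` (the higher-rank shape) so that the product
// of each group equals the corresponding dim of `dst` (the lower-rank shape).
// Returns an empty result if no such grouping exists.
std::vector<std::vector<int>> GroupReassociation(
    const std::vector<int64_t>& src, const std::vector<int64_t>& dst) {
  std::vector<std::vector<int>> groups(dst.size());
  size_t s = 0;
  for (size_t d = 0; d < dst.size(); ++d) {
    int64_t prod = 1;
    while (s < src.size() && prod < dst[d]) {
      prod *= src[s];
      groups[d].push_back(static_cast<int>(s++));
    }
    if (prod != dst[d]) return {};  // incompatible shapes
  }
  // Fold any trailing unit dims of `src` into the last group.
  while (!groups.empty() && s < src.size() && src[s] == 1)
    groups.back().push_back(static_cast<int>(s++));
  if (s != src.size()) return {};
  return groups;
}

int main() {
  // Collapse tensor<5x1x5xf32> into tensor<5x5xf32>.
  auto groups = GroupReassociation(/*src=*/{5, 1, 5}, /*dst=*/{5, 5});
  for (size_t d = 0; d < groups.size(); ++d) {
    std::printf("dst dim %zu <- src dims:", d);
    for (int sd : groups[d]) std::printf(" %d", sd);
    std::printf("\n");
  }
  return 0;
}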
template class ReshapeAddRemoveDimConverter : public DataMovementOpConverter, OpTy, isLHLO> { public: - using DataMovementOpConverter, - OpTy, isLHLO>::DataMovementOpConverter; + ReshapeAddRemoveDimConverter(MLIRContext* context) + : DataMovementOpConverter, + OpTy, isLHLO>(context, 100) {} - ArrayAttr getIndexingMapsAttr(OpTy op, Builder* b) const { + static ArrayAttr getIndexingMapsAttr(OpTy op, Builder* b) { auto resultType = getXLAOpResultType(op).template cast(); auto operandType = @@ -367,11 +399,111 @@ class ReshapeAddRemoveDimConverter return nullptr; inputExprs.resize(operandShape.size(), b->getAffineConstantExpr(0)); return b->getAffineMapArrayAttr( - {AffineMap::get(nloops, /*symbolCount=*/0, inputExprs), + {AffineMap::get(nloops, /*symbolCount=*/0, inputExprs, b->getContext()), b->getMultiDimIdentityMap(nloops)}); } }; +template +class TransposeConverter + : public DataMovementOpConverter, OpTy, + isLHLO> { + public: + using DataMovementOpConverter, OpTy, + isLHLO>::DataMovementOpConverter; + static ArrayAttr getIndexingMapsAttr(OpTy op, Builder* b) { + auto resultType = + getXLAOpResultType(op).template cast(); + auto nloops = resultType.getRank(); + SmallVector inputExprs; + inputExprs.resize(resultType.getRank()); + for (auto permutation : llvm::enumerate(op.permutation())) { + inputExprs[permutation.value().getZExtValue()] = + b->getAffineDimExpr(permutation.index()); + } + return b->getAffineMapArrayAttr( + {AffineMap::get(nloops, /*symbolCount=*/0, inputExprs, b->getContext()), + b->getMultiDimIdentityMap(nloops)}); + } +}; + +// Converts reshape ops that can be proven to be either a collapse of dimensions +// or expansion of dimensions of the operand. +template +class ReshapeOpConverter : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite( + OpTy reshapeOp, ArrayRef args, + ConversionPatternRewriter& rewriter) const final { + if (!verifyXLAOpBufferOrTensorSemantics(reshapeOp)) + return failure(); + ShapedType operandType = + reshapeOp.operand().getType().template cast(); + ShapedType resultType = getXLAOpResultType(reshapeOp); + + if (!operandType.hasStaticShape() || !resultType.hasStaticShape()) + return failure(); + + // TODO(ravishankarm): To make this pattern not match the pattern that + // ReshapeAddRemoveDimConverter is for, check that condition here. Remove + // this when ReshapeAddRemoveDimConverter pattern is removed. + if (ReshapeAddRemoveDimConverter::getIndexingMapsAttr( + reshapeOp, &rewriter)) + return failure(); + + // Compute the reassociation maps for the linalg operation. + ArrayRef srcShape = + (operandType.getRank() > resultType.getRank() ? operandType.getShape() + : resultType.getShape()); + ArrayRef dstShape = + (operandType.getRank() > resultType.getRank() ? resultType.getShape() + : operandType.getShape()); + unsigned currSrcDim = 0, currDstDim = 0; + SmallVector, 4> exprs(dstShape.size()); + while (currSrcDim < srcShape.size() && currDstDim < dstShape.size()) { + int64_t dstSize = dstShape[currDstDim]; + int64_t srcSize = srcShape[currSrcDim]; + while (srcSize < dstSize && currSrcDim < srcShape.size()) { + exprs[currDstDim].push_back(rewriter.getAffineDimExpr(currSrcDim++)); + srcSize *= srcShape[currSrcDim]; + } + if (srcSize == dstSize) { + exprs[currDstDim].push_back(rewriter.getAffineDimExpr(currSrcDim++)); + // If the next dim in dstShape is not 1, treat subsequent dims in + // srcShape which are 1 to be collapsed. 
+ if (currDstDim == dstShape.size() - 1 || + dstShape[currDstDim + 1] != 1) { + while (currSrcDim < srcShape.size() && srcShape[currSrcDim] == 1) { + exprs[currDstDim].push_back( + rewriter.getAffineDimExpr(currSrcDim++)); + } + } + } else { + return failure(); + } + currDstDim++; + } + if (currSrcDim != srcShape.size()) return failure(); + + SmallVector, 4> reassociationMaps; + for (auto& expr : exprs) reassociationMaps.push_back(expr); + + if (isLHLO) { + Value reshapeBuffer = rewriter.create( + reshapeOp.getLoc(), resultType, args[0], reassociationMaps); + rewriter.replaceOpWithNewOp( + reshapeOp, reshapeBuffer, args[1], /*inputPermutation =*/nullptr, + /*outputPermutation =*/nullptr); + } else { + rewriter.replaceOpWithNewOp( + reshapeOp, resultType, args[0], reassociationMaps); + } + return success(); + } +}; + class IotaConverter : public OpConversionPattern { public: using OpConversionPattern::OpConversionPattern; @@ -399,7 +531,7 @@ class IotaConverter : public OpConversionPattern { rewriter.getI64IntegerAttr(1), // args_out rewriter.getArrayAttr(indexingMaps), GetNParallelLoopsAttrs(nloops, &rewriter), - /*doc=*/nullptr, /*fun=*/nullptr, /*library_call=*/nullptr); + /*doc=*/nullptr, /*library_call=*/nullptr); // Add a block to the region. auto* region = &linalgOp.region(); @@ -441,6 +573,34 @@ class ConstConverter : public OpConversionPattern { } }; +// TODO(b/156787842): Support the lowering for dynamic shapes. +template +class ReverseConverter + : public DataMovementOpConverter, OpTy, + isLHLO> { + public: + using DataMovementOpConverter, OpTy, + isLHLO>::DataMovementOpConverter; + static ArrayAttr getIndexingMapsAttr(OpTy op, Builder* b) { + auto resultType = + getXLAOpResultType(op).template cast(); + auto nloops = resultType.getRank(); + SmallVector inputExprs; + inputExprs.reserve(nloops); + for (int i = 0; i < nloops; ++i) + inputExprs.push_back(b->getAffineDimExpr(i)); + for (auto dim : op.dimensions()) { + int i = dim.getZExtValue(); + if (resultType.isDynamicDim(i)) return {}; + int n = resultType.getShape()[i]; + inputExprs[i] = b->getAffineConstantExpr(n - 1) - inputExprs[i]; + } + return b->getAffineMapArrayAttr( + {AffineMap::get(nloops, /*symbolCount=*/0, inputExprs, b->getContext()), + b->getMultiDimIdentityMap(nloops)}); + } +}; + class SliceConverter : public OpConversionPattern { public: using OpConversionPattern::OpConversionPattern; @@ -478,7 +638,8 @@ class SliceConverter : public OpConversionPattern { void populateLHLOToLinalgConversionPattern(MLIRContext* context, OwningRewritePatternList* patterns) { // clang-format off - patterns->insert, + patterns->insert, + BroadcastInDimConverter, ConstConverter, IotaConverter, PointwiseToLinalgConverter, @@ -486,25 +647,30 @@ void populateLHLOToLinalgConversionPattern(MLIRContext* context, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, + PointwiseToLinalgConverter, PointwiseToLinalgConverter, // TODO(ataei): Remove this pattern, CopyOp is folded away. 
PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, + PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, + PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, + PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, ReshapeAddRemoveDimConverter, + ReverseConverter, ScalarPointwiseToStandardConverter, SliceConverter >(context); @@ -576,29 +742,37 @@ namespace xla_hlo { void populateHLOToLinalgConversionPattern(MLIRContext* context, OwningRewritePatternList* patterns) { - patterns->insert, - ReshapeAddRemoveDimConverter, - TransposeConverter, + patterns->insert, + BroadcastInDimConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, - PointwiseToLinalgConverter, + PointwiseToLinalgConverter, + PointwiseToLinalgConverter, PointwiseToLinalgConverter, + PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, + PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, + PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, + PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, - PointwiseToLinalgConverter>(context); + PointwiseToLinalgConverter, + ReshapeAddRemoveDimConverter, + ReshapeOpConverter, + ReverseConverter, + TransposeConverter>(context); } std::unique_ptr> createLegalizeHloToLinalgPass() { diff --git a/tensorflow/compiler/mlir/xla/type_to_shape.cc b/tensorflow/compiler/mlir/xla/type_to_shape.cc index 3b1ae934c48..9f144bb4a45 100644 --- a/tensorflow/compiler/mlir/xla/type_to_shape.cc +++ b/tensorflow/compiler/mlir/xla/type_to_shape.cc @@ -27,6 +27,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/xla/ir/hlo_ops.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/types.h" @@ -64,17 +65,18 @@ PrimitiveType TypeToPrimitiveType(mlir::Type type) { return PrimitiveType::F64; case mlir::StandardTypes::Integer: { const auto integer = type.cast(); + bool is_unsigned = integer.isUnsigned(); switch (integer.getWidth()) { case 1: return PrimitiveType::PRED; case 8: - return PrimitiveType::S8; + return is_unsigned ? PrimitiveType::U8 : PrimitiveType::S8; case 16: - return PrimitiveType::S16; + return is_unsigned ? PrimitiveType::U16 : PrimitiveType::S16; case 32: - return PrimitiveType::S32; + return is_unsigned ? PrimitiveType::U32 : PrimitiveType::S32; case 64: - return PrimitiveType::S64; + return is_unsigned ? 
PrimitiveType::U64 : PrimitiveType::S64; default: return PrimitiveType::PRIMITIVE_TYPE_INVALID; } diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD index 1ee25813320..ea4ba8dab6b 100644 --- a/tensorflow/compiler/tests/BUILD +++ b/tensorflow/compiler/tests/BUILD @@ -128,6 +128,7 @@ tf_xla_py_test( name = "adagrad_da_test", size = "small", srcs = ["adagrad_da_test.py"], + enable_mlir_bridge = True, python_version = "PY3", tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip @@ -165,6 +166,7 @@ tf_xla_py_test( srcs = ["add_n_test.py"], # TensorList ops are not implemented in the on-demand compilation model yet. disabled_backends = ["cpu_ondemand"], + enable_mlir_bridge = True, python_version = "PY3", tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip @@ -200,6 +202,7 @@ tf_xla_py_test( name = "binary_ops_test", size = "medium", srcs = ["binary_ops_test.py"], + enable_mlir_bridge = True, python_version = "PY3", shard_count = 5, tags = [ @@ -224,6 +227,7 @@ tf_xla_py_test( name = "complex_div_test", size = "medium", srcs = ["complex_div_test.py"], + enable_mlir_bridge = True, enabled_backends = [ "cpu", "gpu", @@ -448,6 +452,7 @@ tf_xla_py_test( name = "clustering_test", size = "small", srcs = ["clustering_test.py"], + enable_mlir_bridge = True, python_version = "PY3", tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip @@ -465,6 +470,7 @@ tf_xla_py_test( name = "concat_ops_test", size = "medium", srcs = ["concat_ops_test.py"], + enable_mlir_bridge = True, python_version = "PY3", tags = [ "many_xla_args", @@ -487,6 +493,7 @@ tf_xla_py_test( name = "conv2d_test", size = "medium", srcs = ["conv2d_test.py"], + enable_mlir_bridge = True, python_version = "PY3", shard_count = 10, tags = [ @@ -509,6 +516,7 @@ tf_xla_py_test( name = "conv3d_test", size = "medium", srcs = ["conv3d_test.py"], + enable_mlir_bridge = True, python_version = "PY3", shard_count = 5, tags = [ @@ -554,6 +562,7 @@ tf_xla_py_test( name = "dynamic_slice_ops_test", size = "small", srcs = ["dynamic_slice_ops_test.py"], + enable_mlir_bridge = True, python_version = "PY3", tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip @@ -570,6 +579,7 @@ tf_xla_py_test( name = "einsum_op_test", size = "medium", srcs = ["einsum_op_test.py"], + enable_mlir_bridge = True, enabled_backends = [ "cpu", "gpu", @@ -591,6 +601,7 @@ tf_xla_py_test( name = "reshape_op_test", size = "small", srcs = ["reshape_op_test.py"], + enable_mlir_bridge = True, python_version = "PY3", tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip @@ -662,6 +673,7 @@ tf_xla_py_test( name = "fifo_queue_test", size = "medium", srcs = ["fifo_queue_test.py"], + enable_mlir_bridge = True, python_version = "PY3", tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip @@ -701,6 +713,7 @@ tf_xla_py_test( name = "slice_ops_test", size = "small", srcs = ["slice_ops_test.py"], + enable_mlir_bridge = True, python_version = "PY3", tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip @@ -736,6 +749,7 @@ tf_xla_py_test( name = "function_test", size = "small", srcs = ["function_test.py"], + enable_mlir_bridge = True, python_version = "PY3", tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip @@ -880,6 +894,7 @@ tf_xla_py_test( name = "nary_ops_test", size = "small", srcs = 
["nary_ops_test.py"], + enable_mlir_bridge = True, python_version = "PY3", tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip @@ -897,6 +912,7 @@ tf_xla_py_test( name = "nullary_ops_test", size = "small", srcs = ["nullary_ops_test.py"], + enable_mlir_bridge = True, python_version = "PY3", tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip @@ -1219,6 +1235,7 @@ tf_xla_py_test( name = "stack_ops_test", size = "small", srcs = ["stack_ops_test.py"], + enable_mlir_bridge = True, python_version = "PY3", tags = [ "config-cuda-only", @@ -1279,6 +1296,7 @@ tf_xla_py_test( srcs = ["tensor_array_ops_test.py"], # TensorArray ops are not implemented in the on-demand compilation model yet. disabled_backends = ["cpu_ondemand"], + enable_mlir_bridge = True, python_version = "PY3", tags = [ "config-cuda-only", @@ -1307,6 +1325,7 @@ tf_xla_py_test( srcs = ["tensor_list_ops_test.py"], # TensorList ops are not implemented in the on-demand compilation model yet. disabled_backends = ["cpu_ondemand"], + enable_mlir_bridge = True, python_version = "PY3", tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip @@ -1325,6 +1344,7 @@ tf_xla_py_test( name = "ternary_ops_test", size = "medium", srcs = ["ternary_ops_test.py"], + enable_mlir_bridge = True, python_version = "PY3", tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip @@ -1346,26 +1366,6 @@ tf_xla_py_test( name = "unary_ops_test", size = "medium", srcs = ["unary_ops_test.py"], - python_version = "PY3", - tags = [ - "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip - ], - deps = [ - ":xla_test", - "//tensorflow/python:array_ops", - "//tensorflow/python:framework", - "//tensorflow/python:math_ops", - "//tensorflow/python:nn_ops", - "//tensorflow/python:nn_ops_gen", - "//tensorflow/python:platform_test", - ], -) - -# TODO(hinsu): Combine this test with unary_ops_test instead of replicating it. 
-tf_xla_py_test( - name = "unary_mlir_ops_test", - size = "medium", - srcs = ["unary_mlir_ops_test.py"], enable_mlir_bridge = True, python_version = "PY3", tags = [ @@ -1387,6 +1387,7 @@ tf_xla_py_test( size = "medium", srcs = ["fused_batchnorm_test.py"], python_version = "PY3", + shard_count = 5, tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip ], @@ -1467,6 +1468,7 @@ tf_xla_py_test( name = "gather_nd_op_test", size = "medium", srcs = ["gather_nd_op_test.py"], + enable_mlir_bridge = True, python_version = "PY3", tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip @@ -1519,6 +1521,7 @@ tf_xla_py_test( name = "data_format_ops_test", size = "small", srcs = ["data_format_ops_test.py"], + enable_mlir_bridge = True, python_version = "PY3", tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip @@ -1753,6 +1756,7 @@ tf_xla_py_test( name = "placeholder_test", size = "small", srcs = ["placeholder_test.py"], + enable_mlir_bridge = True, python_version = "PY3", tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip @@ -1789,6 +1793,7 @@ tf_xla_py_test( name = "xla_ops_test", size = "medium", srcs = ["xla_ops_test.py"], + enable_mlir_bridge = True, python_version = "PY3", tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip @@ -1808,6 +1813,7 @@ tf_xla_py_test( name = "conv_node_name_test", size = "medium", srcs = ["conv_node_name_test.py"], + enable_mlir_bridge = True, python_version = "PY3", shard_count = 5, tags = [ @@ -1854,6 +1860,7 @@ tf_xla_py_test( name = "special_math_test", size = "medium", srcs = ["special_math_test.py"], + enable_mlir_bridge = True, shard_count = 5, tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip @@ -1866,3 +1873,20 @@ tf_xla_py_test( "@absl_py//absl/testing:parameterized", ], ) + +tf_xla_py_test( + name = "ensure_shape_op_test", + size = "medium", + srcs = ["ensure_shape_op_test.py"], + python_version = "PY3", + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + "optonly", + ], + deps = [ + ":xla_test", + "//tensorflow/python:array_ops", + "//tensorflow/python:framework", + "//tensorflow/python:platform_test", + ], +) diff --git a/tensorflow/compiler/tests/binary_ops_test.py b/tensorflow/compiler/tests/binary_ops_test.py index 8543e8ea2be..00ed6d83e2e 100644 --- a/tensorflow/compiler/tests/binary_ops_test.py +++ b/tensorflow/compiler/tests/binary_ops_test.py @@ -26,6 +26,7 @@ import numpy as np from tensorflow.compiler.tests import xla_test from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors +from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import bitwise_ops from tensorflow.python.ops import gen_math_ops @@ -474,6 +475,7 @@ class BinaryOpsTest(xla_test.XLATestCase): expected=np.array([1 << 32, 1 << 36, 1 << 32, 1 << 36], dtype=np.int64)) + @test_util.disable_mlir_bridge("Enable tf.NextAfter Compilation") def testNextAfter(self): for dtype in self.numeric_types: if dtype in [np.float32, np.float64]: @@ -501,6 +503,8 @@ class BinaryOpsTest(xla_test.XLATestCase): expected=expected, equality_test=NextAfterEqualityTest) + @test_util.disable_mlir_bridge( + "Complex types not supported in CreateDenseElementsAttrFromLiteral") def testComplexOps(self): for dtype in self.complex_types: ctypes = 
{np.complex64: np.float32, np.complex128: np.float64} @@ -521,11 +525,19 @@ class BinaryOpsTest(xla_test.XLATestCase): self._testBinary( gen_math_ops.real_div, - np.array([3, 3j, -1.5j, -8, 2 + 3j, 2 + 4j], dtype=dtype), - np.array([2, -2, 7j, -4j, 4 - 6j, 1 + 2j], dtype=dtype), - expected=np.array( - [1.5, -1.5j, -0.2142857, -2j, (2 + 3j) / (4 - 6j), 2], - dtype=dtype)) + np.array( + [3, 3j, -1.5j, -8, 2 + 3j, 2 + 4j, 9.663546088957395e-28 + 0j], + dtype=dtype), + np.array([ + 2, -2, 7j, -4j, 4 - 6j, 1 + 2j, + 9.39511792677288e-16 - 1.529841108938729e-23j + ], + dtype=dtype), + expected=np.array([ + 1.5, -1.5j, -0.2142857, -2j, + (2 + 3j) / (4 - 6j), 2, 1.028571e-12 + 1.674859e-20j + ], + dtype=dtype)) self._testBinary( math_ops.pow, @@ -716,6 +728,8 @@ class BinaryOpsTest(xla_test.XLATestCase): for dtype in self.signed_int_types - {np.int8}: self._testRemainder(dtype) + @test_util.disable_mlir_bridge( + "F16 type is not supported in CreateDenseElementsAttrFromLiteral") def testFloatRemainder(self): for dtype in self.float_types: self._testRemainder(dtype) @@ -1210,6 +1224,8 @@ class BinaryOpsTest(xla_test.XLATestCase): [7, 7, 7, 7, 7, 7]], dtype=dtype)) + @test_util.disable_mlir_bridge( + "Requires concatenate op support in MlirHloBuilder") def testSymmetricMirrorPad(self): mirror_pad = lambda t, paddings: array_ops.pad(t, paddings, "SYMMETRIC") for dtype in self.numeric_types: @@ -1241,6 +1257,8 @@ class BinaryOpsTest(xla_test.XLATestCase): np.array([[0, 0], [0, 0]], dtype=np.int32), expected=np.array([[1, 2, 3], [4, 5, 6]], dtype=dtype)) + @test_util.disable_mlir_bridge( + "Requires concatenate op support in MlirHloBuilder") def testReflectMirrorPad(self): mirror_pad = lambda t, paddings: array_ops.pad(t, paddings, "REFLECT") for dtype in self.numeric_types: @@ -1394,6 +1412,7 @@ class BinaryOpsTest(xla_test.XLATestCase): ], equality_test=self.ListsAreClose) + @test_util.disable_mlir_bridge("TODO(b/155097657): Debug incorrect answer") def testTile(self): for dtype in self.numeric_types: self._testBinary( @@ -1551,6 +1570,8 @@ class BinaryOpsTest(xla_test.XLATestCase): np.array([2, 1, 5], dtype=np.int32), expected=np.array([2, 3, 5], dtype=np.int32)) + @test_util.disable_mlir_bridge("Error handling") + def testBroadcastArgsError(self): with self.assertRaisesWithPredicateMatch(errors.InvalidArgumentError, "Incompatible shapes"): self._testBinary(array_ops.broadcast_dynamic_shape, @@ -1558,6 +1579,8 @@ class BinaryOpsTest(xla_test.XLATestCase): np.array([4, 5, 6], dtype=np.int32), expected=None) + @test_util.disable_mlir_bridge( + "Requires BroadcastInDim method in MlirHloBuilder") def testBroadcastTo(self): for dtype in self.all_types: x = np.random.randint(0, high=100, size=[2, 3]) diff --git a/tensorflow/compiler/tests/concat_ops_test.py b/tensorflow/compiler/tests/concat_ops_test.py index 10dd2d6542c..f35ded924d5 100644 --- a/tensorflow/compiler/tests/concat_ops_test.py +++ b/tensorflow/compiler/tests/concat_ops_test.py @@ -23,6 +23,7 @@ import numpy as np from tensorflow.compiler.tests import xla_test from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes +from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import gen_array_ops from tensorflow.python.ops import gradients_impl @@ -293,6 +294,7 @@ class ConcatTest(xla_test.XLATestCase): # The purpose of this is to ensure that XLA on GPU will not run out of memory # with too many arguments. 
+ @test_util.disable_mlir_bridge("TODO(b/153895138): Debug.") def testConcatLargeNumberOfTensors(self): if "CPU" in self.device: self.skipTest("This test can time out on CPU, so we will just allow " diff --git a/tensorflow/compiler/tests/ensure_shape_op_test.py b/tensorflow/compiler/tests/ensure_shape_op_test.py new file mode 100644 index 00000000000..95de5a9c49b --- /dev/null +++ b/tensorflow/compiler/tests/ensure_shape_op_test.py @@ -0,0 +1,51 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for ensure_shape_op.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.compiler.tests import xla_test +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import errors_impl +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import check_ops +from tensorflow.python.platform import test + + +class EnsureShapeOpTest(xla_test.XLATestCase): + + def testEnsureShape(self): + with self.session() as sess: + p = array_ops.placeholder(dtypes.int32) + with self.test_scope(): + op = check_ops.ensure_shape(p, (None, 3)) + expected_out = [[0, 1, 2], [3, 4, 5], [6, 7, 8]] + self.assertAllEqual(expected_out, + sess.run(op, {p: [[0, 1, 2], [3, 4, 5], [6, 7, 8]]})) + + def testInvalidEnsureShape(self): + with self.session() as sess: + p = array_ops.placeholder(dtypes.int32) + with self.test_scope(): + op = check_ops.ensure_shape(p, (None, 3, 3)) + with self.assertRaisesRegexp(errors_impl.InvalidArgumentError, + "is not compatible with expected shape"): + sess.run(op, {p: [[0, 1, 2], [3, 4, 5], [6, 7, 8]]}) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/compiler/tests/gather_nd_op_test.py b/tensorflow/compiler/tests/gather_nd_op_test.py index d1f72b89e83..90ac515764b 100644 --- a/tensorflow/compiler/tests/gather_nd_op_test.py +++ b/tensorflow/compiler/tests/gather_nd_op_test.py @@ -22,6 +22,7 @@ import numpy as np from tensorflow.compiler.tests import xla_test from tensorflow.python.framework import errors +from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.platform import test @@ -45,6 +46,7 @@ class GatherNdTest(xla_test.XLATestCase): np.array([8, 1, 2, 3, 7, 5], dtype=dtype), np.array([[4], [4], [0]], np.int32))) + @test_util.disable_mlir_bridge("Error handling") def testEmptyIndicesAndParamsOKButJustEmptyParamsFails(self): with self.session(): params = np.ones((3, 3), dtype=np.float32) diff --git a/tensorflow/compiler/tests/image_ops_test.py b/tensorflow/compiler/tests/image_ops_test.py index b89472b8085..81779203955 100644 --- a/tensorflow/compiler/tests/image_ops_test.py +++ b/tensorflow/compiler/tests/image_ops_test.py @@ -30,7 +30,6 @@ from six.moves import xrange # pylint: disable=redefined-builtin from tensorflow.compiler.tests import 
xla_test from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops -from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import gen_image_ops from tensorflow.python.ops import image_ops @@ -979,7 +978,6 @@ class NonMaxSuppressionTest(xla_test.XLATestCase): class BatchedNonMaxSuppressionCorrectnessTest(xla_test.XLATestCase): - @test_util.with_forward_compatibility_horizons(None, [2020, 4, 21]) def testBatchedNMSFrom6(self): boxes_data = [[[0, 0, 1, 1], [3, 3, 4, 4], [0, 0.4, 1, 1.4], [0, 0.6, 1, 1.6], [0, 0.8, 1, 1.8], [0, 2, 1, 2]], @@ -1017,7 +1015,6 @@ class BatchedNonMaxSuppressionCorrectnessTest(xla_test.XLATestCase): indices_output) self.assertAllEqual([5, 4], num_valid_output) - @test_util.with_forward_compatibility_horizons(None, [2020, 4, 21]) def testBatchedNMSFrom6Max3(self): boxes_data = [[[0, 0, 1, 1], [3, 3, 4, 4], [0, 0.4, 1, 1.4], [0, 0.6, 1, 1.6], [0, 0.8, 1, 1.8], [0, 2, 1, 2]], @@ -1051,7 +1048,6 @@ class BatchedNonMaxSuppressionCorrectnessTest(xla_test.XLATestCase): self.assertAllEqual([[0, 1, 2], [0, 1, 3]], indices_output) self.assertAllEqual([3, 3], num_valid_output) - @test_util.with_forward_compatibility_horizons(None, [2020, 4, 21]) def testBatchedNMSSingleFrom6Max3(self): boxes_data = [[0, 0, 1, 1], [3, 3, 4, 4], [0, 0.4, 1, 1.4], [0, 0.6, 1, 1.6], [0, 0.8, 1, 1.8], [0, 2, 1, 2]] @@ -1082,7 +1078,6 @@ class BatchedNonMaxSuppressionCorrectnessTest(xla_test.XLATestCase): self.assertAllEqual([0, 1, 2], indices_output) self.assertAllEqual(3, num_valid_output) - @test_util.with_forward_compatibility_horizons(None, [2020, 4, 21]) def testBatchedNMSSingleFrom6NoPad(self): boxes_data = [[0, 0, 1, 1], [3, 3, 4, 4], [0, 0.4, 1, 1.4], [0, 0.6, 1, 1.6], [0, 0.8, 1, 1.8], [0, 2, 1, 2]] @@ -1112,7 +1107,6 @@ class BatchedNonMaxSuppressionCorrectnessTest(xla_test.XLATestCase): self.assertAllEqual([0, 1, 2, 4, 5], indices_output) self.assertAllEqual(5, num_valid_output) - @test_util.with_forward_compatibility_horizons(None, [2020, 4, 21]) def testBatchedNMSBatchDimsFrom6Max3(self): boxes_data = [[[[0, 0, 1, 1], [3, 3, 4, 4], [0, 0.4, 1, 1.4], [0, 0.6, 1, 1.6], [0, 0.8, 1, 1.8], [0, 2, 1, 2]], @@ -1146,7 +1140,6 @@ class BatchedNonMaxSuppressionCorrectnessTest(xla_test.XLATestCase): self.assertAllEqual([[[0, 1, 2], [0, 1, 3]]], indices_output) self.assertAllEqual([[3, 3]], num_valid_output) - @test_util.with_forward_compatibility_horizons(None, [2020, 4, 21]) def testBatchedNMSScoreThresholdFrom6Max3(self): boxes_data = [[[0, 0, 1, 1], [3, 3, 4, 4], [0, 0.4, 1, 1.4], [0, 0.6, 1, 1.6], [0, 0.8, 1, 1.8], [0, 2, 1, 2]], @@ -1182,7 +1175,6 @@ class BatchedNonMaxSuppressionCorrectnessTest(xla_test.XLATestCase): self.assertAllEqual([3, 2], num_valid_output) self.assertAllEqual([[0, 1, 2], [0, 1, invalid_index]], indices_output) - @test_util.with_forward_compatibility_horizons(None, [2020, 4, 21]) def testBatchedNMSUnsortedInputFrom6(self): boxes_data = [[[0, 2, 1, 2], [3, 3, 4, 4], [0, 0, 1, 1], [0, 0.4, 1, 1.4], [0, 0.6, 1, 1.6], [0, 0.8, 1, 1.8]], @@ -1219,7 +1211,6 @@ class BatchedNonMaxSuppressionCorrectnessTest(xla_test.XLATestCase): indices_output) self.assertAllEqual([5, 4], num_valid_output) - @test_util.with_forward_compatibility_horizons(None, [2020, 4, 21]) def testBatchedNMSNoncanonicalizedInputFrom6(self): boxes_data = [[[1, 0, 0, 1], [4, 3, 3, 4], [1, 0.4, 0, 1.4], [1, 0.6, 0, 1.6], [1, 0.8, 0, 1.8], [1, 2, 0, 2]], @@ -1257,7 +1248,6 @@ class 
BatchedNonMaxSuppressionCorrectnessTest(xla_test.XLATestCase): indices_output) self.assertAllEqual([5, 4], num_valid_output) - @test_util.with_forward_compatibility_horizons(None, [2020, 4, 21]) def testBatchedNMSScoreThresholdCanInputsFrom6Max3(self): boxes_data = [[[0, 0, 1, 1], [3, 3, 4, 4], [0, 0.4, 1, 1.4], [0, 0.6, 1, 1.6], [0, 0.8, 1, 1.8], [0, 2, 1, 2]], @@ -1293,7 +1283,6 @@ class BatchedNonMaxSuppressionCorrectnessTest(xla_test.XLATestCase): self.assertAllEqual([3, 2], num_valid_output) self.assertAllEqual([[0, 1, 2], [0, 1, invalid_index]], indices_output) - @test_util.with_forward_compatibility_horizons(None, [2020, 4, 21]) def testBatchedNMSFrom6DynamicInput(self): boxes_data = [[[0, 0, 1, 1], [3, 3, 4, 4], [0, 0.4, 1, 1.4], [0, 0.6, 1, 1.6], [0, 0.8, 1, 1.8], [0, 2, 1, 2]], diff --git a/tensorflow/compiler/tests/random_ops_test.py b/tensorflow/compiler/tests/random_ops_test.py index 52f47416ed2..2f304d0a96f 100644 --- a/tensorflow/compiler/tests/random_ops_test.py +++ b/tensorflow/compiler/tests/random_ops_test.py @@ -190,6 +190,25 @@ class RandomOpsTest(xla_test.XLATestCase): self._checkTruncatedNormalIsInRange( x, a=a, b=b, mu=mu, sigma=sigma, count=count, stat_test=stat_test) + def testParameterizedTruncatedNormalBroadcasting(self): + for dtype in self._random_types() & {np.float32, np.float64}: + with self.session(): + with self.test_scope(): + a = -1. + b = 1. + mu = 0. + sigma = 1. + count = 10000000 + x = random_ops.parameterized_truncated_normal( + shape=[1, count], + dtype=dtype, + means=mu, + stddevs=sigma, + minvals=[a], + maxvals=[b]) + self._checkTruncatedNormalIsInRange( + x, a=a, b=b, mu=mu, sigma=sigma, count=count, stat_test=True) + def testParameterizedTruncatedNormalIsInRangeCenter(self): count = 10000000 self._implParameterizedTruncatedNormalIsInRange( diff --git a/tensorflow/compiler/tests/randomized_tests.cc b/tensorflow/compiler/tests/randomized_tests.cc index 8bad4da0524..9f963110cf3 100644 --- a/tensorflow/compiler/tests/randomized_tests.cc +++ b/tensorflow/compiler/tests/randomized_tests.cc @@ -54,6 +54,7 @@ limitations under the License. #include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/common_runtime/device_factory.h" #include "tensorflow/core/common_runtime/device_mgr.h" +#include "tensorflow/core/common_runtime/graph_constructor.h" #include "tensorflow/core/framework/kernel_shape_util.h" #include "tensorflow/core/framework/node_def_builder.h" #include "tensorflow/core/framework/node_def_util.h" @@ -62,7 +63,6 @@ limitations under the License. 
#include "tensorflow/core/framework/tensor_testutil.h" #include "tensorflow/core/framework/types.pb.h" #include "tensorflow/core/graph/graph.h" -#include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/lib/bfloat16/bfloat16.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/core/status_test_util.h" diff --git a/tensorflow/compiler/tests/special_math_test.py b/tensorflow/compiler/tests/special_math_test.py index b3abc40f82d..3efaa6434be 100644 --- a/tensorflow/compiler/tests/special_math_test.py +++ b/tensorflow/compiler/tests/special_math_test.py @@ -29,6 +29,7 @@ import scipy.special as sps import six from tensorflow.compiler.tests import xla_test +from tensorflow.python.eager import def_function from tensorflow.python.framework import constant_op from tensorflow.python.ops import gen_math_ops from tensorflow.python.ops import gen_random_ops @@ -43,6 +44,16 @@ flags.DEFINE_bool('vary_seed', False, NUM_SAMPLES = int(1e3) +@def_function.function(experimental_compile=True) +def _igamma(a, x): + return math_ops.igamma(a, x) + + +@def_function.function(experimental_compile=True) +def _igammac(a, x): + return math_ops.igammac(a, x) + + # This is df/da / df/dx, where f = igamma. def implicit_reparameterization_grad(a, x): log_prob = math_ops.xlogy(a - 1., x) - math_ops.lgamma(a) - x @@ -64,13 +75,39 @@ class IgammaTest(xla_test.XLATestCase, parameterized.TestCase): # Skip Float64 test on TPU due to missing ops. def maybe_skip_test(self, dtype): - if self.device not in ['XLA_GPU', 'XLA_CPU', 'CPU'] and dtype == np.float64: + if self.device not in ['XLA_GPU', 'XLA_CPU'] and dtype == np.float64: self.skipTest( 'Skipping test because some F64 operations not supported on TPU.') + def adjust_tolerance_for_tpu(self, dtype, rtol, atol): + if self.device not in ['TPU']: + return rtol, atol + + if dtype == np.float32: + return 2e-2, 1e-7 + return 2e-4, 1e-20 + @parameterized.parameters((np.float32, 1e-2, 1e-11), (np.float64, 1e-4, 1e-30)) - def testIgammaSmallValues(self, dtype, rtol, atol): + def testLargeXSmallA(self, dtype, rtol, atol): + self.maybe_skip_test(dtype) + rtol, atol = self.adjust_tolerance_for_tpu(dtype, rtol, atol) + # Test values near zero. + x = np.random.uniform(low=100., high=200., size=[NUM_SAMPLES]).astype(dtype) + a = np.random.uniform(low=0.3, high=1., size=[NUM_SAMPLES]).astype(dtype) + + expected_values = sps.gammainc(a, x) + with self.session() as sess: + with self.test_scope(): + y = _igamma(a, x) + actual = sess.run(y) + self.assertAllClose(expected_values, actual, atol=atol, rtol=rtol) + + @parameterized.parameters((np.float32, 1e-2, 1e-11), + (np.float64, 1e-4, 1e-30)) + def testSmallValues(self, dtype, rtol, atol): + self.maybe_skip_test(dtype) + rtol, atol = self.adjust_tolerance_for_tpu(dtype, rtol, atol) # Test values near zero. 
x = np.random.uniform( low=np.finfo(dtype).tiny, high=1., size=[NUM_SAMPLES]).astype(dtype) @@ -80,12 +117,14 @@ class IgammaTest(xla_test.XLATestCase, parameterized.TestCase): expected_values = sps.gammainc(a, x) with self.session() as sess: with self.test_scope(): - actual = sess.run(math_ops.igamma(a, x)) + actual = sess.run(_igamma(a, x)) self.assertAllClose(expected_values, actual, atol=atol, rtol=rtol) @parameterized.parameters((np.float32, 1e-2, 1e-11), (np.float64, 1e-4, 1e-30)) - def testIgammaMediumValues(self, dtype, rtol, atol): + def testMediumValues(self, dtype, rtol, atol): + self.maybe_skip_test(dtype) + rtol, atol = self.adjust_tolerance_for_tpu(dtype, rtol, atol) # Test values near zero. x = np.random.uniform(low=1., high=100., size=[NUM_SAMPLES]).astype(dtype) a = np.random.uniform(low=1., high=100., size=[NUM_SAMPLES]).astype(dtype) @@ -93,11 +132,14 @@ class IgammaTest(xla_test.XLATestCase, parameterized.TestCase): expected_values = sps.gammainc(a, x) with self.session() as sess: with self.test_scope(): - actual = sess.run(math_ops.igamma(a, x)) + actual = sess.run(_igamma(a, x)) self.assertAllClose(expected_values, actual, atol=atol, rtol=rtol) @parameterized.parameters((np.float32, 2e-2, 1e-5), (np.float64, 1e-4, 1e-30)) - def testIgammaLargeValues(self, dtype, rtol, atol): + def testLargeValues(self, dtype, rtol, atol): + if self.device == 'TPU': + # TODO(b/154908275): Remove this once fixed for large a, x. + self.skipTest('Skipping test since numerically unstable on TPU.') # Test values near zero. x = np.random.uniform( low=100., high=int(1e4), size=[NUM_SAMPLES]).astype(dtype) @@ -107,13 +149,13 @@ class IgammaTest(xla_test.XLATestCase, parameterized.TestCase): expected_values = sps.gammainc(a, x) with self.session() as sess: with self.test_scope(): - actual = sess.run(math_ops.igamma(a, x)) + actual = sess.run(_igamma(a, x)) self.assertAllClose(expected_values, actual, atol=atol, rtol=rtol) # We don't check small values because the numerical gradients become quite # large. 
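The `_igamma` and `_igammac` helpers introduced above wrap the raw ops in a `tf.function` with `experimental_compile=True`, which requests XLA compilation of the wrapped computation; the test bodies then call these helpers instead of `math_ops.igamma` directly. A small hedged sketch of the same pattern outside the test harness; the input values are illustrative only.

```python
import numpy as np

from tensorflow.python.eager import def_function
from tensorflow.python.ops import math_ops


@def_function.function(experimental_compile=True)
def _igamma(a, x):
  # experimental_compile=True asks for the op to be compiled with XLA.
  return math_ops.igamma(a, x)


# Illustrative call with made-up scalars; the tests above instead draw a and x
# from random distributions and compare against scipy.special.gammainc.
print(_igamma(np.float32(2.0), np.float32(3.0)))
```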
@parameterized.parameters((np.float32, 0.09), (np.float64, 1e-7)) - def testIgammaGradMediumValues(self, dtype, tolerance): + def testGradMediumValues(self, dtype, tolerance): self.maybe_skip_test(dtype) with self.session(): with self.test_scope(): @@ -124,13 +166,13 @@ class IgammaTest(xla_test.XLATestCase, parameterized.TestCase): np.random.uniform(low=1., high=100., size=[NUM_SAMPLES]).astype(dtype)) - f = lambda b: math_ops.igamma(b, x) + f = lambda b: _igamma(b, x) max_error = gradient_checker_v2.max_error( *gradient_checker_v2.compute_gradient(f, x=[a], delta=1e-3)) self.assertLessEqual(max_error, tolerance) @parameterized.parameters((np.float32, 0.5), (np.float64, 1e-7)) - def testIgammaGradLargeValues(self, dtype, tolerance): + def testGradLargeValues(self, dtype, tolerance): self.maybe_skip_test(dtype) with self.session(): with self.test_scope(): @@ -141,7 +183,7 @@ class IgammaTest(xla_test.XLATestCase, parameterized.TestCase): np.random.uniform(low=100., high=int(1e4), size=[NUM_SAMPLES]).astype(dtype)) - f = lambda b: math_ops.igamma(b, x) + f = lambda b: _igamma(b, x) max_error = gradient_checker_v2.max_error( *gradient_checker_v2.compute_gradient(f, x=[a], delta=1e-2)) self.assertLessEqual(max_error, tolerance) @@ -150,6 +192,7 @@ class IgammaTest(xla_test.XLATestCase, parameterized.TestCase): (np.float64, 1e-4, 1e-30)) def testRandomGammaGradSmallValues(self, dtype, rtol, atol): self.maybe_skip_test(dtype) + rtol, atol = self.adjust_tolerance_for_tpu(dtype, rtol, atol) # Test values near zero. with self.session() as sess: @@ -179,6 +222,7 @@ class IgammaTest(xla_test.XLATestCase, parameterized.TestCase): (np.float64, 1e-4, 1e-30)) def testRandomGammaGradMediumValues(self, dtype, rtol, atol): self.maybe_skip_test(dtype) + rtol, atol = self.adjust_tolerance_for_tpu(dtype, rtol, atol) with self.session() as sess: with self.test_scope(): @@ -202,6 +246,98 @@ class IgammaTest(xla_test.XLATestCase, parameterized.TestCase): self.assertAllClose(actual_grad, gamma_sample_grad, atol=atol, rtol=rtol) +class IgammacTest(xla_test.XLATestCase, parameterized.TestCase): + + def setUp(self): + if flags.FLAGS.vary_seed: + entropy = os.urandom(64) + if six.PY2: + answer = int(entropy.encode('hex'), 16) + else: + answer = int.from_bytes(entropy, 'big') + np.random.seed(answer % (2**32 - 1)) + super(IgammacTest, self).setUp() + + # Skip Float64 test on TPU due to missing ops. + def maybe_skip_test(self, dtype): + if self.device not in ['XLA_GPU', 'XLA_CPU'] and dtype == np.float64: + # TODO(b/154908275): Remove this once fixed for large a, x. + self.skipTest( + 'Skipping test because some F64 operations not supported on TPU.') + + def adjust_tolerance_for_tpu(self, dtype, rtol, atol): + if self.device not in ['TPU']: + return rtol, atol + + if dtype == np.float32: + return 2e-2, 1e-7 + return 2e-4, 1e-20 + + @parameterized.parameters((np.float32, 1e-2, 1e-11), + (np.float64, 1e-4, 1e-30)) + def testLargeXSmallA(self, dtype, rtol, atol): + self.maybe_skip_test(dtype) + rtol, atol = self.adjust_tolerance_for_tpu(dtype, rtol, atol) + # Test values near zero. 
+ x = np.random.uniform(low=100., high=200., size=[NUM_SAMPLES]).astype(dtype) + a = np.random.uniform(low=0.3, high=1., size=[NUM_SAMPLES]).astype(dtype) + + expected_values = sps.gammaincc(a, x) + with self.session() as sess: + with self.test_scope(): + y = _igammac(a, x) + actual = sess.run(y) + self.assertAllClose(expected_values, actual, atol=atol, rtol=rtol) + + @parameterized.parameters((np.float32, 1e-2, 1e-11), + (np.float64, 1e-4, 1e-30)) + def testSmallValues(self, dtype, rtol, atol): + self.maybe_skip_test(dtype) + rtol, atol = self.adjust_tolerance_for_tpu(dtype, rtol, atol) + # Test values near zero. + x = np.random.uniform( + low=np.finfo(dtype).tiny, high=1., size=[NUM_SAMPLES]).astype(dtype) + a = np.random.uniform( + low=np.finfo(dtype).tiny, high=1., size=[NUM_SAMPLES]).astype(dtype) + + expected_values = sps.gammaincc(a, x) + with self.session() as sess: + with self.test_scope(): + actual = sess.run(_igammac(a, x)) + self.assertAllClose(expected_values, actual, atol=atol, rtol=rtol) + + @parameterized.parameters((np.float32, 1e-2, 1e-11), + (np.float64, 1e-4, 1e-30)) + def testMediumValues(self, dtype, rtol, atol): + self.maybe_skip_test(dtype) + rtol, atol = self.adjust_tolerance_for_tpu(dtype, rtol, atol) + # Test values near zero. + x = np.random.uniform(low=1., high=100., size=[NUM_SAMPLES]).astype(dtype) + a = np.random.uniform(low=1., high=100., size=[NUM_SAMPLES]).astype(dtype) + + expected_values = sps.gammaincc(a, x) + with self.session() as sess: + with self.test_scope(): + actual = sess.run(_igammac(a, x)) + self.assertAllClose(expected_values, actual, atol=atol, rtol=rtol) + + @parameterized.parameters((np.float32, 2e-2, 1e-5), (np.float64, 1e-4, 1e-30)) + def testLargeValues(self, dtype, rtol, atol): + if self.device == 'TPU': + self.skipTest('Skipping test since numerically unstable on TPU.') + # Test values near zero. 
+ x = np.random.uniform( + low=100., high=int(1e4), size=[NUM_SAMPLES]).astype(dtype) + a = np.random.uniform( + low=100., high=int(1e4), size=[NUM_SAMPLES]).astype(dtype) + + expected_values = sps.gammaincc(a, x) + with self.session() as sess: + with self.test_scope(): + actual = sess.run(_igammac(a, x)) + self.assertAllClose(expected_values, actual, atol=atol, rtol=rtol) + + if __name__ == '__main__': os.environ['XLA_FLAGS'] = '--xla_cpu_enable_fast_math=false' test.main() diff --git a/tensorflow/compiler/tests/ternary_ops_test.py b/tensorflow/compiler/tests/ternary_ops_test.py index 465f368db82..7bbfecff403 100644 --- a/tensorflow/compiler/tests/ternary_ops_test.py +++ b/tensorflow/compiler/tests/ternary_ops_test.py @@ -24,6 +24,7 @@ import scipy.special as sps from tensorflow.compiler.tests import xla_test from tensorflow.python.framework import dtypes +from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import gen_math_ops from tensorflow.python.ops import math_ops @@ -47,6 +48,8 @@ class TernaryOpsTest(xla_test.XLATestCase, parameterized.TestCase): {'start': 1, 'end': 2, 'num': 1}, {'start': 1, 'end': 4, 'num': 3}, {'start': 0, 'end': 41, 'num': 42}) + @test_util.disable_mlir_bridge( + 'TODO(b/156174708): Dynamic result types not supported') def testLinspace(self, start, end, num): expected = np.linspace(start, end, num, dtype=np.float32) result = self._testTernary( @@ -211,6 +214,7 @@ class TernaryOpsTest(xla_test.XLATestCase, parameterized.TestCase): upper, expected=np.minimum(np.maximum(x, lower), upper)) + @test_util.disable_mlir_bridge('Enable tf.Betainc Compilation') def testBetaincSanity(self): # This operation is only supported for float32 and float64. for dtype in self.numeric_types & {np.float32, np.float64}: @@ -230,7 +234,7 @@ class TernaryOpsTest(xla_test.XLATestCase, parameterized.TestCase): { 'sigma': 1e15, 'rtol': 1e-6, - 'atol': 1e-6 + 'atol': 1e-4 }, { 'sigma': 30, @@ -240,7 +244,7 @@ class TernaryOpsTest(xla_test.XLATestCase, parameterized.TestCase): { 'sigma': 1e-8, 'rtol': 5e-4, - 'atol': 3e-6 + 'atol': 3e-4 }, { 'sigma': 1e-16, @@ -248,6 +252,7 @@ class TernaryOpsTest(xla_test.XLATestCase, parameterized.TestCase): 'atol': 2e-4 }, ) + @test_util.disable_mlir_bridge('Enable tf.Betainc Compilation') def testBetainc(self, sigma, rtol, atol): # This operation is only supported for float32 and float64. for dtype in self.numeric_types & {np.float32, np.float64}: diff --git a/tensorflow/compiler/tests/unary_mlir_ops_test.py b/tensorflow/compiler/tests/unary_mlir_ops_test.py deleted file mode 100644 index 4238877c761..00000000000 --- a/tensorflow/compiler/tests/unary_mlir_ops_test.py +++ /dev/null @@ -1,73 +0,0 @@ -# Copyright 2020 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""Tests for XLA JIT compiler.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import numpy as np - -from tensorflow.compiler.tests import xla_test -from tensorflow.python.framework import dtypes -from tensorflow.python.ops import array_ops -from tensorflow.python.ops import math_ops -from tensorflow.python.platform import googletest - - -class UnaryOpsTest(xla_test.XLATestCase): - """Test cases for unary operators.""" - - def _assertOpOutputMatchesExpected(self, - op, - inp, - expected, - equality_test=None, - rtol=1e-3, - atol=1e-5): - """Verifies that 'op' produces 'expected' when fed input 'inp' . - - Args: - op: operator to test - inp: numpy input array to use as input to 'op'. - expected: numpy array representing the expected output of 'op'. - equality_test: either None, or a function that tests two numpy arrays for - equality. If None, self.assertAllClose is used. - rtol: relative tolerance for equality test. - atol: absolute tolerance for equality test. - """ - with self.session() as session: - with self.test_scope(): - pinp = array_ops.placeholder( - dtypes.as_dtype(inp.dtype), inp.shape, name='a') - output = op(pinp) - result = session.run(output, {pinp: inp}) - if equality_test is None: - self.assertEqual(output.dtype, expected.dtype) - self.assertAllCloseAccordingToType( - expected, result, rtol=rtol, atol=atol, bfloat16_rtol=0.03) - else: - equality_test(result, expected, rtol=rtol, atol=atol) - - def testNumericOps(self): - for dtype in self.numeric_types - {np.int8, np.uint8}: - self._assertOpOutputMatchesExpected( - math_ops.abs, - np.array([[2, -1]], dtype=dtype), - expected=np.array([[2, 1]], dtype=np.real(dtype(0)).dtype)) - - -if __name__ == '__main__': - googletest.main() diff --git a/tensorflow/compiler/tests/unary_ops_composition_test.cc b/tensorflow/compiler/tests/unary_ops_composition_test.cc index b5f18bba077..569261de094 100644 --- a/tensorflow/compiler/tests/unary_ops_composition_test.cc +++ b/tensorflow/compiler/tests/unary_ops_composition_test.cc @@ -82,9 +82,8 @@ class UnaryOpsCompositionTest : public OpsTestBase { DeviceContext* device_context = device_->tensorflow_gpu_device_info()->default_context; - TF_CHECK_OK(BlockingCopy([&](StatusCallback cb) { - device_context->CopyCPUTensorToDevice(&input_on_host, device_, input, cb); - })); + TF_CHECK_OK(device_context->CopyCPUTensorToDeviceSync(&input_on_host, + device_, input)); TF_ASSERT_OK(RunOpKernel()); @@ -94,27 +93,12 @@ class UnaryOpsCompositionTest : public OpsTestBase { Tensor* output = GetOutput(0); Tensor output_on_host(cpu_allocator, output->dtype(), output->shape()); - TF_CHECK_OK(BlockingCopy([&](StatusCallback cb) { - device_context->CopyDeviceTensorToCPU(output, "output 0", device_, - &output_on_host, cb); - })); + TF_CHECK_OK(device_context->CopyDeviceTensorToCPUSync( + output, "output 0", device_, &output_on_host)); test::ExpectClose(expected_tensor, output_on_host, /*atol=*/1e-5, /*rtol=*/1e-5); } - - private: - template - Status BlockingCopy(CopyFnTy copy_fn) { - Notification n; - Status status; - copy_fn([&](Status s) { - status = s; - n.Notify(); - }); - n.WaitForNotification(); - return status; - } }; TEST_F(UnaryOpsCompositionTest, Compose_Sqrt_Sqrt_F) { diff --git a/tensorflow/compiler/tests/unary_ops_test.py b/tensorflow/compiler/tests/unary_ops_test.py index a9f5a5e743d..3e36f67615b 100644 --- 
a/tensorflow/compiler/tests/unary_ops_test.py +++ b/tensorflow/compiler/tests/unary_ops_test.py @@ -25,6 +25,7 @@ from six.moves import xrange # pylint: disable=redefined-builtin from tensorflow.compiler.tests import xla_test from tensorflow.python.framework import dtypes +from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import bitwise_ops from tensorflow.python.ops import gen_nn_ops @@ -84,6 +85,8 @@ class UnaryOpsTest(xla_test.XLATestCase): for i in xrange(len(result)): self.assertAllClose(result[i], expected[i], rtol, atol) + @test_util.disable_mlir_bridge( + "MlirHloBuilder::Iota missing required for xla::Diag") def testAllTypeOps(self): for dtype in self.numeric_types - {np.int8, np.uint8}: self._assertOpOutputMatchesExpected( @@ -509,6 +512,11 @@ class UnaryOpsTest(xla_test.XLATestCase): ], dtype=dtype)) + @test_util.disable_mlir_bridge( + "TODO(b/153812660): Handle tf.QuantizeAndDequantize compilation") + def testQuantizeAndDequantize(self): + for dtype in self.float_types: + def quantize_and_dequantize_v2(x): return array_ops.quantize_and_dequantize_v2( x, -127, 127, signed_input=True, num_bits=8) @@ -593,6 +601,7 @@ class UnaryOpsTest(xla_test.XLATestCase): np.array([-1, -0.5, 0, 0.3], dtype=dtype), expected=np.array([-1., -0.5, 0., 0.296875], dtype=dtype)) + @test_util.disable_mlir_bridge("TODO(b/156135423): Fix ConvertSigmoidOp") def testComplexOps(self): for dtype in self.complex_types: @@ -823,6 +832,8 @@ class UnaryOpsTest(xla_test.XLATestCase): [[[1., 2.], [3., 4.]], [[5., 6.], [7., 8.]]], dtype=np.float32), expected=np.array([14., 22.], dtype=np.float32)) + @test_util.disable_mlir_bridge("TODO(b/153812660): Handle tf.Cast compilation" + ) def testCast(self): shapes = [[], [4], [2, 3], [2, 0, 4]] types = { @@ -870,6 +881,8 @@ class UnaryOpsTest(xla_test.XLATestCase): src, expected=dst) + @test_util.disable_mlir_bridge( + "TODO(b/153812660): Handle tf.Bitcast compilation") def testBitcast(self): self._assertOpOutputMatchesExpected( lambda x: array_ops.bitcast(x, dtypes.int32), @@ -893,12 +906,16 @@ class UnaryOpsTest(xla_test.XLATestCase): np.array([1, 0x100000003f800000], np.int64), expected=np.array([1, 0x100000003f800000], np.uint64)) + @test_util.disable_mlir_bridge( + "TODO(b/153812660): Handle tf.InvertPermutation compilation") def testInvertPermutation(self): self._assertOpOutputMatchesExpected( array_ops.invert_permutation, np.array([1, 2, 0], np.int32), expected=np.array([2, 0, 1], dtype=np.int32)) + @test_util.disable_mlir_bridge( + "TODO(b/153812660): Handle tf.InvertPermutation compilation") def testInvertPermutationTwiceIsNoop(self): self._assertOpOutputMatchesExpected( lambda x: array_ops.invert_permutation(array_ops.invert_permutation(x)), @@ -990,6 +1007,8 @@ class UnaryOpsTest(xla_test.XLATestCase): ], equality_test=self.ListsAreClose) + @test_util.disable_mlir_bridge( + "TODO(b/153812660): Handle tf.DepthToSpace compilation") def testDepthToSpace(self): def make_op(data_format): @@ -1042,6 +1061,8 @@ class UnaryOpsTest(xla_test.XLATestCase): [[[6, 7], [14, 15]], [[22, 23], [30, 31]]]]], dtype=dtype)) + @test_util.disable_mlir_bridge( + "TODO(b/153812660): Handle tf.SpaceToDepth compilation") def testSpaceToDepth(self): def make_op(data_format): @@ -1101,6 +1122,8 @@ class UnaryOpsTest(xla_test.XLATestCase): self._assertOpOutputMatchesExpected( nn_ops.softplus, features, expected=expected, rtol=1e-6, atol=9.1e-6) + @test_util.disable_mlir_bridge( + "bf16 type not supported in 
CreateDenseElementsAttrFromLiteral") def testSoftplus(self): for dtype in self.float_types: self._assertSoftplusMatchesExpected([[-2, 0, 8]], dtype) diff --git a/tensorflow/compiler/tests/xla_ops_test.py b/tensorflow/compiler/tests/xla_ops_test.py index 3c2fcbc0fcc..f3e915daa67 100644 --- a/tensorflow/compiler/tests/xla_ops_test.py +++ b/tensorflow/compiler/tests/xla_ops_test.py @@ -29,6 +29,7 @@ from tensorflow.python.framework import errors from tensorflow.python.framework import function from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape +from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.platform import googletest @@ -78,6 +79,7 @@ class XlaOpsNumericalTest(xla_test.XLATestCase, parameterized.TestCase): args=(v,), expected=np.tile(v, (7, 42, 1, 1))) + @test_util.disable_mlir_bridge('Dynamic result types not supported') def testShiftRightLogical(self): self._assertOpOutputMatchesExpected( xla.shift_right_logical, @@ -89,6 +91,7 @@ class XlaOpsNumericalTest(xla_test.XLATestCase, parameterized.TestCase): args=(np.array([0xFFFFFFFF, 16], dtype=np.uint32), np.uint32(4)), expected=np.array([0x0FFFFFFF, 1], dtype=np.uint32)) + @test_util.disable_mlir_bridge('Dynamic result types not supported') def testShiftRightArithmetic(self): self._assertOpOutputMatchesExpected( xla.shift_right_arithmetic, @@ -208,6 +211,7 @@ class XlaOpsNumericalTest(xla_test.XLATestCase, parameterized.TestCase): [7, 7, 7, 7, 7], [7, 2, 3, 7, 7], [7, 7, 7, 7, 7]], dtype=dtype)) + @test_util.disable_mlir_bridge('Not supported yet') def testReduce(self): for dtype in set(self.numeric_types).intersection( set([dtypes.bfloat16.as_numpy_dtype, np.float32])): @@ -258,6 +262,7 @@ class XlaOpsNumericalTest(xla_test.XLATestCase, parameterized.TestCase): args=(np.arange(12, dtype=np.int32).astype(dtype).reshape([3, 4]),), expected=np.array([0, 45, 120, 231], dtype=dtype)) + @test_util.disable_mlir_bridge('Not supported yet') def testSelectAndScatter(self): for dtype in set(self.numeric_types).intersection( set([dtypes.bfloat16.as_numpy_dtype, np.float32])): @@ -311,6 +316,7 @@ class XlaOpsNumericalTest(xla_test.XLATestCase, parameterized.TestCase): [[673, 674], [683, 684], [693, 694]]]), dtype=dtype)) + @test_util.disable_mlir_bridge('Error handling') def testDynamicSliceWithIncorrectStartIndicesShape(self): with self.session() as session: with self.test_scope(): @@ -324,6 +330,7 @@ class XlaOpsNumericalTest(xla_test.XLATestCase, parameterized.TestCase): (r'start_indices must be a vector with length equal to input rank, ' r'but input rank is 3 and start_indices has shape \[2\].*')) + @test_util.disable_mlir_bridge('Error handling') def testDynamicSliceWithIncorrectSizeIndicesShape(self): with self.session() as session: with self.test_scope(): diff --git a/tensorflow/compiler/tf2tensorrt/BUILD b/tensorflow/compiler/tf2tensorrt/BUILD index af1877a2394..356798c19bd 100644 --- a/tensorflow/compiler/tf2tensorrt/BUILD +++ b/tensorflow/compiler/tf2tensorrt/BUILD @@ -384,6 +384,7 @@ tf_cuda_library( ":utils", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", + "//tensorflow/core/common_runtime:core_cpu", "//tensorflow/core/grappler/clusters:cluster", "//tensorflow/core/grappler/optimizers:custom_graph_optimizer", "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry", @@ -487,9 +488,15 @@ cc_library( copts = tf_copts(), deps = [ "//tensorflow/core:graph", + "//tensorflow/core:lib", + 
"//tensorflow/core:lib_internal", "//tensorflow/core:lib_proto_parsing", "//tensorflow/core:protos_all_cc", + "//tensorflow/core/common_runtime:core_cpu", + "//tensorflow/core/grappler/costs:graph_properties", "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/types:optional", "@com_google_protobuf//:protobuf_headers", ], ) diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc index 278f49da71b..806d930b76f 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc @@ -32,12 +32,12 @@ limitations under the License. #include "tensorflow/core/common_runtime/gpu/gpu_id.h" #include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h" #include "tensorflow/core/common_runtime/gpu/gpu_process_state.h" +#include "tensorflow/core/common_runtime/graph_constructor.h" #include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/graph_to_functiondef.h" #include "tensorflow/core/framework/node_def_builder.h" #include "tensorflow/core/graph/algorithm.h" #include "tensorflow/core/graph/graph.h" -#include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/grappler/clusters/virtual_cluster.h" #include "tensorflow/core/grappler/costs/graph_properties.h" #include "tensorflow/core/grappler/devices.h" @@ -77,6 +77,19 @@ Status BuildNodeMap(const Graph& graph, return Status::OK(); } +EngineInfo::EngineType GetEngineType(const ConversionParams& params) { + return (params.is_dyn_op || params.use_calibration) + ? EngineInfo::EngineType::TRTDynamic + : EngineInfo::EngineType::TRTStatic; +} + +// Returns true when use_implicit_batch is false or when we are building dynamic +// engine, to allow unknown size for dimensions rather than dimension 0. +bool AllowDynamicNonBatchDimension(const ConversionParams& params) { + return !params.use_implicit_batch || + GetEngineType(params) == EngineInfo::EngineType::TRTDynamic; +} + } // namespace struct EdgePtrCompare { @@ -393,9 +406,8 @@ Status CreateTRTNode(const ConversionParams& params, for (int i = 1; i < conn.outside_shape.dims(); i++) { if (conn.outside_shape.dim_size(i) <= 0) { return errors::Internal( - "Input shapes must be fully defined when in static mode. " - "Please try is_dynamic_op=True (shape was ", - conn.outside_shape.DebugString(), ")"); + "Not fully defined input shape when in static mode which " + "should have been excluded by the segmenter. 
"); } } } @@ -645,11 +657,15 @@ Status ConvertAfterShapes(const ConversionParams& params) { segment_options.exclude_node_list.insert(node); } segment_options.minimum_segment_size = params.minimum_segment_size; + segment_options.use_implicit_batch = params.use_implicit_batch; + segment_options.allow_dynamic_non_batch_dim = + AllowDynamicNonBatchDimension(params); + segment::SegmentNodesVector initial_segments; TrtNodeValidator validator(*params.graph_properties, params.precision_mode, params.use_calibration, params.use_implicit_batch); TF_RETURN_IF_ERROR(segment::SegmentGraph( - &graph, + &graph, params.graph_properties, std::bind(&TrtNodeValidator::IsTensorRTCandidate, &validator, std::placeholders::_1), // Input validation is already done by TrtNodeValidator, so we don't @@ -686,9 +702,7 @@ Status ConvertAfterShapes(const ConversionParams& params) { continue; } curr_engine.precision_mode = params.precision_mode; - curr_engine.engine_type = ((params.is_dyn_op || params.use_calibration) - ? EngineInfo::EngineType::TRTDynamic - : EngineInfo::EngineType::TRTStatic); + curr_engine.engine_type = GetEngineType(params); curr_engine.use_calibration = params.use_calibration; curr_engine.maximum_cached_engines = params.max_cached_engines; curr_engine.allow_build_at_runtime = params.allow_build_at_runtime; @@ -764,6 +778,7 @@ Status ConvertAfterShapes(const ConversionParams& params) { } else { // Graph is not modified. LOG(WARNING) << "Cannot replace " << msg + << " reason: " << status.error_message() << " (keeping original segment)."; } if (VLOG_IS_ON(1)) { diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc index bb705812c52..e791ff9ff60 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc @@ -29,10 +29,12 @@ limitations under the License. #include "absl/memory/memory.h" #include "absl/strings/match.h" #include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" #include "absl/strings/string_view.h" #include "tensorflow/compiler/tf2tensorrt/convert/utils.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h" +#include "tensorflow/core/common_runtime/graph_constructor.h" #include "tensorflow/core/framework/node_def.pb.h" // NOLINT #include "tensorflow/core/framework/node_def_builder.h" #include "tensorflow/core/framework/tensor.pb.h" // NOLINT @@ -41,7 +43,6 @@ limitations under the License. 
#include "tensorflow/core/framework/types.h" #include "tensorflow/core/graph/algorithm.h" #include "tensorflow/core/graph/graph.h" -#include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/grappler/op_types.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" @@ -795,6 +796,19 @@ nvinfer1::Dims TRT_TensorOrWeights::GetTrtDims() const { } } +Status TRT_TensorOrWeights::GetTfType(DataType* tf_type) const { + if (is_tensor()) { + nvinfer1::DataType trt_type = tensor()->getType(); + return TrtTypeToTfType(trt_type, tf_type); + } + + if (is_weights()) { + *tf_type = weights().GetTensor().dtype(); + return Status::OK(); + } + return errors::Internal("The object is probably not initialized"); +} + string TRT_TensorOrWeights::DebugString() const { string output = "TRT_TensorOrWeights(type="; if (is_tensor()) { @@ -1456,12 +1470,13 @@ Status Converter::TransposeTensor(nvinfer1::ITensor* input_tensor, absl::string_view name, nvinfer1::ITensor** output_tensor) { const auto dims = input_tensor->getDimensions(); - - if (order_with_batch_dim.size() - 1 != size_t(dims.nbDims)) { + const int order_size = use_implicit_batch_ ? order_with_batch_dim.size() - 1 + : order_with_batch_dim.size(); + if (order_size != size_t(dims.nbDims)) { return errors::InvalidArgument( "Rank of perm for transpose does not match with that of the input."); } - if (order_with_batch_dim[0] != 0) { + if (use_implicit_batch_ && order_with_batch_dim[0] != 0) { return errors::Unimplemented( "Transpose at batch dimension is not supported."); } @@ -1472,8 +1487,13 @@ Status Converter::TransposeTensor(nvinfer1::ITensor* input_tensor, MarkQuantizationRangesAsInferrable(input_tensor, layer->getOutput(0)); nvinfer1::Permutation permutation; - for (int32_t i = 0; i < dims.nbDims; ++i) { - permutation.order[i] = order_with_batch_dim[i + 1] - 1; + if (use_implicit_batch_) { + for (int32_t i = 0; i < dims.nbDims; ++i) { + permutation.order[i] = order_with_batch_dim[i + 1] - 1; + } + } else { + std::copy(order_with_batch_dim.begin(), order_with_batch_dim.end(), + permutation.order); } VLOG(1) << "TransposeTensor permutation: " << DebugString(permutation, dims.nbDims); @@ -1894,27 +1914,48 @@ Status CheckInputsWeights( return Status::OK(); } -Status AllowDataTypes(const OpConverterParams& params, - const std::set& allowed_dtypes, - const char* dtype_attr_name = "T") { - const auto& node_def = params.node_def; +Status GetNodeDefTfType(const NodeDef& node_def, DataType* tf_type, + const char* type_attr_name) { TFAttrs attrs(node_def); - if (!attrs.count(dtype_attr_name)) { - return errors::InvalidArgument("Attribute with name ", dtype_attr_name, + if (!attrs.count(type_attr_name)) { + return errors::InvalidArgument("Attribute with name ", type_attr_name, " not found."); } - const auto op_dtype = attrs.get(dtype_attr_name); - if (!allowed_dtypes.count(op_dtype)) { - // Build string list of allowed types. 
- std::ostringstream ss; - for (auto it = allowed_dtypes.begin(); it != allowed_dtypes.end(); ++it) { - if (it != allowed_dtypes.begin()) ss << ", "; - ss << DataTypeString(*it); - } - return errors::Unimplemented("Data type ", DataTypeString(op_dtype), + *tf_type = attrs.get(type_attr_name); + return Status::OK(); +} + +Status GetInputTfType(const OpConverterParams& params, DataType* tf_type, + int pos) { + const std::vector& inputs = params.inputs; + if (inputs.size() <= pos) { + return errors::Internal("Invalid input position"); + } + + return inputs[pos].GetTfType(tf_type); +} + +constexpr const char kOutputTypeAttrName[] = "T"; + +Status GetOutputTfType(const OpConverterParams& params, DataType* tf_type) { + return GetNodeDefTfType(params.node_def, tf_type, kOutputTypeAttrName); +} + +Status AllowDataTypes(const OpConverterParams& params, + const std::set& allowed_types, + const char* type_attr_name = kOutputTypeAttrName) { + const auto& node_def = params.node_def; + DataType tf_type; + TF_RETURN_IF_ERROR(GetNodeDefTfType(node_def, &tf_type, type_attr_name)); + if (!allowed_types.count(tf_type)) { + string allowed_types_string = absl::StrJoin( + allowed_types, ", ", [](string* out, const DataType& type) { + absl::StrAppendFormat(out, "%s", DataTypeString(type)); + }); + return errors::Unimplemented("Data type ", DataTypeString(tf_type), " is not supported for ", node_def.op(), - ", must be one of [", ss.str(), "], at ", - node_def.name()); + ", must be one of [", allowed_types_string, + "], at ", node_def.name()); } return Status::OK(); } @@ -2027,6 +2068,24 @@ Status Conv2DPaddingHelper(OpConverterParams* params, const TFAttrs& attrs, return Status::OK(); } +namespace { +// Extracts the spatial dimensions from `output_sizes` and returns them as a +// vector of size 2. +std::vector GetSpatialDimsFromOutputSizes( + const TRT_TensorOrWeights& output_sizes, const int h_index, + const int w_index) { + // We use h_index and w_index instead of 1 and 2 because we haven't + // transposed output_sizes along with the input. + const TRT_ShapedWeights& weights = output_sizes.weights(); + const int output_sizes_length = weights.count(); + auto output_sizes_values = static_cast(weights.GetValues()); + // The length of output_sizes can be 2 or 4. When the length is 4, + // output_sizes represents . + return {output_sizes_values[output_sizes_length == 4 ? h_index : 0], + output_sizes_values[output_sizes_length == 4 ? w_index : 1]}; +} +} // namespace + Status ConvertConv2DHelper(OpConverterParams* params, int group, bool is_conv2d_backprop_input) { const auto& inputs = params->inputs; @@ -2125,11 +2184,8 @@ Status ConvertConv2DHelper(OpConverterParams* params, int group, // For backprop, calculate padding based on "input_sizes" input, which // actually corresponds to output size. ("input_sizes" makes sense in the // context of Conv2DBackpropInput). - // We use h_index and w_index instead of 1 and 2 because we havent - // transposed backprop_output_size along with the input. - auto output_size_weights = - static_cast(backprop_output_size.weights().GetValues()); - input_dims = {output_size_weights[h_index], output_size_weights[w_index]}; + input_dims = + GetSpatialDimsFromOutputSizes(backprop_output_size, h_index, w_index); } else { // Use 1 and 2 because tensor_dim has the dimensions of the transposed // input. @@ -2189,22 +2245,24 @@ Status ConvertConv2DHelper(OpConverterParams* params, int group, // argument output_shape and thus the TRT output shape could be wrong // in case of strides>1. 
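The new `GetSpatialDimsFromOutputSizes` helper introduced above centralizes how the `input_sizes` argument of `Conv2DBackpropInput` is read: it may carry either just the two spatial sizes or a full 4-element shape, and `h_index`/`w_index` locate H and W in the 4-element form. A hedged Python restatement of that selection logic, used by the backprop-padding code that follows; the function name and sample values are illustrative.

```python
def spatial_dims_from_output_sizes(output_sizes, h_index, w_index):
  # output_sizes holds either [H, W] or a full 4-element shape; h_index and
  # w_index pick H and W out of the 4-element form (e.g. 1 and 2 for NHWC).
  if len(output_sizes) == 4:
    return [output_sizes[h_index], output_sizes[w_index]]
  return [output_sizes[0], output_sizes[1]]


assert spatial_dims_from_output_sizes([8, 28, 28, 3], 1, 2) == [28, 28]
assert spatial_dims_from_output_sizes([28, 28], 1, 2) == [28, 28]
```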
if (is_conv2d_backprop_input) { - auto tf_output_shape = - static_cast(backprop_output_size.weights().GetValues()); + std::vector output_spatial_dims = + GetSpatialDimsFromOutputSizes(backprop_output_size, h_index, w_index); + const int output_height = output_spatial_dims[0]; + const int output_width = output_spatial_dims[1]; nvinfer1::Dims trt_output_shape = output_tensor->getDimensions(); // What determines the padding size is the difference between the given // input_sizes (tf_output_shape) and TRT computed size. - const int height_diff = tf_output_shape[h_index] - trt_output_shape.d[1]; - const int width_diff = tf_output_shape[w_index] - trt_output_shape.d[2]; + const int height_diff = output_height - trt_output_shape.d[1]; + const int width_diff = output_width - trt_output_shape.d[2]; if ((height_diff < 0) || (width_diff < 0)) { return errors::InvalidArgument( "input_sizes argument of Conv2DBackprop (i.e. output_shape argument " "of conv2d_transpose) ", "is too small for the given out_backprop argument of Conv2DBackprop " "(i.e. input argument of conv2d_transpose). Expect: ", - "(", tf_output_shape[h_index], ", ", tf_output_shape[w_index], - ") >= ", "(", trt_output_shape.d[1], ", ", trt_output_shape.d[2], - ") for op ", node_def.name()); + "(", output_height, ", ", output_width, ") >= ", "(", + trt_output_shape.d[1], ", ", trt_output_shape.d[2], ") for op ", + node_def.name()); } // Only add a padding layer if padding sizes are larger than 0 if ((height_diff > 0) || (width_diff > 0)) { @@ -2254,11 +2312,13 @@ Status ConvertTranspose(OpConverterParams* params) { // Verify the permutation. nvinfer1::ITensor* input_tensor = inputs.at(0).tensor(); - if (perm.size() - 1 != size_t(input_tensor->getDimensions().nbDims)) { + const int perm_size = + params->use_implicit_batch ? perm.size() - 1 : perm.size(); + if (perm_size != size_t(input_tensor->getDimensions().nbDims)) { return errors::InvalidArgument( "Rank of perm for transpose does not match with that of the input."); } - if (perm[0] != 0) { + if (params->use_implicit_batch && perm[0] != 0) { return errors::Unimplemented( "Transpose at batch dimension is not supported."); } @@ -2283,112 +2343,70 @@ Status ConvertTranspose(OpConverterParams* params) { Status ConvertReshape(OpConverterParams* params) { const auto& inputs = params->inputs; - const auto& node_def = params->node_def; TF_RETURN_IF_ERROR( CheckInputsWeights(*params, {{"tensor", false}, {"shape", true}})); TF_RETURN_IF_ERROR(AllowDataTypes( *params, {DataType::DT_FLOAT, DataType::DT_HALF, DataType::DT_INT32})); const TRT_TensorOrWeights& input_tensor = inputs.at(0); + + // TODO(bixia): we can't use inputs.at(1).weights().ToVector() for two + // reasons: (1) When weights.count()==0, TRT_ShapedWeights::tensor_ dtype is + // not properly set to INT32. (2) I tried a fix for the first problem, I got + // shared pointer related error in convert_nodes_test. We should fix the + // problems and switch to use inputs.at(1).weights().ToVector(), a type + // safe method to access the content of the tensor. TRT_ShapedWeights weights = inputs.at(1).weights(); if (weights.count() == 0) { return errors::Unimplemented("Reshape to shape=[] is not supported, at ", - node_def.name()); + params->node_def.name()); } - const int* weights_ptr = static_cast(weights.GetValues()); - - // Check that it doesn't change the batch dimension. 
This check is - // conservative, for example, when the first dim of the shape is -1 and input - // tensor shape is not fixed, it is still possible that the reshape doesn't - // change the batch dim, but as long as there is a possibility that it could - // change the batch dim, it reject the conversion. The parameters are: - // - // * reshape_batch_dim: the value of the first dim of the input shape constant - // * reshape_dims: all other dims of the input shape constant - // * input_batch_dim: the value of the first dim of the input tensor to - // reshape - // * input_dims: all other dims of the input tensor to reshape - // - // The validation logic is: - // - // if input_batch_dim is fixed: - // if reshape_batch_dim == input_batch_dim: - // ok - // elif reshape_batch_dim == -1 (meaning reshape_dims are fixed) and - // input_dims are fixed and - // prod(input_dims) == prod(reshape_dims) - // ok - // else: - // not ok - // elif input_dims are fixed: - // if reshape_dims are fixed and - // prod(input_dims) == prod(reshape_dims): - // ok - // else: - // not ok - // else: - // not ok - // - // Note that the following is ok no matter whether reshape_batch_dim is fixed - // or not: - // - // ``` - // input_batch_dim is not fixed && - // reshape_dims are fixed && - // prod(input_dims) == prod(reshape_dims), - // ``` - // - // because the non-batch dims of the new and old shapes match, and TF runtime - // should make sure the batch dim is not changed. + const int* output_shape_dims = static_cast(weights.GetValues()); + size_t output_shape_dims_count = weights.count(); const int input_batch_dim = input_tensor.batch_size(); - const int reshape_batch_dim = weights_ptr[0]; - const nvinfer1::Dims input_dims = input_tensor.GetTrtDims(); + const int output_batch_dim = output_shape_dims[0]; - nvinfer1::Dims reshape_dims; - reshape_dims.nbDims = weights.count() - 1; - for (int i = 1; i < weights.count(); i++) { - reshape_dims.d[i - 1] = weights_ptr[i]; + const nvinfer1::Dims input_nonbatch_dims = input_tensor.GetTrtDims(); + nvinfer1::Dims output_nonbatch_dims; + output_nonbatch_dims.nbDims = output_shape_dims_count - 1; + for (int i = 1; i < output_shape_dims_count; i++) { + output_nonbatch_dims.d[i - 1] = output_shape_dims[i]; } - // Check that it doesn't change the batch dimension according to the logic - // mentioned above. - bool reshape_may_change_batch_dim = false; - if (input_batch_dim > 0) { // Batch size is fixed. - if (reshape_batch_dim == -1) { // Other dims of the shape must be fixed. - if (!AreDimsStaticWithSameSize(input_dims, reshape_dims, - /*is_tensor=*/true)) { - reshape_may_change_batch_dim = true; - } - } else if (reshape_batch_dim != input_batch_dim) { - reshape_may_change_batch_dim = true; - } else { - // This means (input_batch_dim>0 && input_batch_dim==reshape_batch_dim), - // and TF runtime should make sure non-batch dims are matched. - } - } else if (!AreDimsStaticWithSameSize(input_dims, reshape_dims, - /*is_tensor=*/true)) { - reshape_may_change_batch_dim = true; - } VLOG(1) << "input_batch_dim=" << input_batch_dim - << ", input_dims=" << DebugString(input_dims) - << "\nreshape_batch_dim=" << reshape_batch_dim - << ", reshape_dims=" << DebugString(reshape_dims); + << ", input_nonbatch_dims=" << DebugString(input_nonbatch_dims) + << "\nresult_batch_dim=" << output_batch_dim + << ", result_nonbatch_dims=" << DebugString(output_nonbatch_dims); + + // Check whether input_batch_dim and output_batch_dim will have the same + // static value. 
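The shorter comment above replaces the long case analysis in the deleted block, but the intent is unchanged: a reshape is rejected whenever it could change the batch dimension. A hedged Python restatement of the check implemented just below, where -1 stands for an unknown dimension and "same static size" means both non-batch shapes are fully known with equal element counts; the function and variable names are illustrative.

```python
def reshape_may_change_batch_dim(input_batch_dim, input_nonbatch_dims,
                                 output_batch_dim, output_nonbatch_dims):
  def static_with_same_size(lhs, rhs):
    if -1 in lhs or -1 in rhs:
      return False
    size_lhs, size_rhs = 1, 1
    for dim in lhs:
      size_lhs *= dim
    for dim in rhs:
      size_rhs *= dim
    return size_lhs == size_rhs

  if input_batch_dim != -1 and output_batch_dim != -1:
    # Both batch dims are static: they simply have to agree.
    return input_batch_dim != output_batch_dim
  # Otherwise the reshape is only accepted when the non-batch volumes
  # provably match, so the batch dimension cannot absorb the difference.
  return not static_with_same_size(input_nonbatch_dims, output_nonbatch_dims)


assert not reshape_may_change_batch_dim(8, [2, 3], 8, [6])
assert not reshape_may_change_batch_dim(-1, [2, 3], -1, [6])
assert reshape_may_change_batch_dim(-1, [2, -1], -1, [6])
```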
+ bool reshape_may_change_batch_dim = false; + if (input_batch_dim != -1 && output_batch_dim != -1) { + reshape_may_change_batch_dim = (input_batch_dim != output_batch_dim); + } else { + reshape_may_change_batch_dim = + !AreDimsStaticWithSameSize(input_nonbatch_dims, output_nonbatch_dims, + /*is_tensor=*/true); + } if (reshape_may_change_batch_dim) { - const string msg = StrCat( - "Reshape on batch dimension is not supported, at ", node_def.name(), - ". input_batch_dim=", input_batch_dim, ", ", DebugString(input_dims), - "; reshape_batch_dim=", reshape_batch_dim, ", ", - DebugString(reshape_dims)); + const string msg = + StrCat("Reshape on batch dimension is not supported, at ", + params->node_def.name(), ". input_batch_dim=", input_batch_dim, + ", ", DebugString(input_nonbatch_dims), + "; output_batch_dim=", output_batch_dim, ", ", + DebugString(output_nonbatch_dims)); return errors::Unimplemented(msg); } - // Start conversion. + // Perform the conversion. nvinfer1::ITensor* output_tensor = nullptr; TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape( - input_tensor, reshape_dims, params->validation_only, &output_tensor)); + input_tensor, output_nonbatch_dims, params->validation_only, + &output_tensor)); if (params->validation_only) return Status::OK(); + // Record the conversion result. params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); return Status::OK(); } @@ -2430,26 +2448,19 @@ Status ConvertExpandDims(OpConverterParams* params) { } Status Converter::SqueezeTensor(nvinfer1::ITensor* input, - const std::vector& trt_axes, + std::vector* input_dims, nvinfer1::ITensor** output) { - const nvinfer1::Dims dims = input->getDimensions(); - std::vector input_dims(dims.d, dims.d + dims.nbDims); - // Mark axes to remove by setting them to 0. - for (int axis : trt_axes) { - input_dims[axis] = 0; - } - #if IS_TRT_VERSION_GE(6, 0, 0, 0) // If the remaining dimensions of a squeeze operation have dynamic sizes, we // need to use TRT ops to build the result shape for the squeeze operation. // This is because IShuffleLayer::setReshapeDimensions treats -1 as a special // value. - if (absl::c_any_of(input_dims, [](int i) { return i == -1; })) { + if (absl::c_any_of(*input_dims, [](int i) { return i == -1; })) { nvinfer1::ITensor* shape = network()->addShape(*input)->getOutput(0); std::vector concat_inputs; - for (int i = 0; i < input_dims.size(); i++) { + for (int i = 0; i < input_dims->size(); i++) { // If input dim wasn't set to 0 earlier, we include it in new shape. - if (input_dims[i] != 0) { + if (input_dims->at(i) != 0) { concat_inputs.push_back( network() ->addSlice(*shape, {1, {i}}, {1, {1}}, {1, {1}}) @@ -2469,11 +2480,12 @@ Status Converter::SqueezeTensor(nvinfer1::ITensor* input, } #endif // Remove all dims which are equal to 0. - input_dims.erase(std::remove(input_dims.begin(), input_dims.end(), 0), - input_dims.end()); + input_dims->erase(std::remove(input_dims->begin(), input_dims->end(), 0), + input_dims->end()); // Reshape tensor. 
nvinfer1::Dims new_dims; - TF_RETURN_IF_ERROR(TensorShapeArrayToTrtDims(input_dims, &new_dims)); + VLOG(2) << "input_dims" << input_dims; + TF_RETURN_IF_ERROR(TensorShapeArrayToTrtDims(*input_dims, &new_dims)); TF_RETURN_IF_ERROR(PrepareTensorForShape(TRT_TensorOrWeights(input), new_dims, /*validation_only=*/false, output)); return Status::OK(); @@ -2492,31 +2504,48 @@ Status ConvertSqueeze(OpConverterParams* params) { TFAttrs attrs(node_def); auto squeeze_dims = attrs.get>("squeeze_dims"); if (squeeze_dims.empty()) { - return errors::Unimplemented( - "Squeeze is only implemented for explicit dims, at ", node_def.name()); - } - std::vector trt_axes; - trt_axes.reserve(squeeze_dims.size()); - for (int tf_axis : squeeze_dims) { - // If the axis is valid, then convert it to TRT axis, otherwise abort - // conversion. - int trt_axis; - TF_RETURN_IF_ERROR(ConvertAxis(tf_axis, dims.nbDims, node_def.name(), - params->use_implicit_batch, &trt_axis)); - // Make sure target dimension is size 1 or unknown size (-1) - if (input_dims[trt_axis] != -1 && input_dims[trt_axis] != 1) { - return errors::InvalidArgument( - "Dimension ", tf_axis, " with size ", input_dims[trt_axis], - " cannot be squeezed because it must be size 1, at ", + if (params->use_implicit_batch || !HasStaticShape(dims)) { + return errors::Unimplemented( + "Squeeze is not implemented for empty squeeze_dims, at ", node_def.name()); + } else { + // explicit batch mode with static input shape we squeeze all singleton + // dimensions + for (int& dim : input_dims) { + if (dim == 1) { + // Mark it for removal by setting it to 0 + dim = 0; + } + } + } + } else { + std::vector trt_axes; + trt_axes.reserve(squeeze_dims.size()); + for (int tf_axis : squeeze_dims) { + // If the axis is valid, then convert it to TRT axis, otherwise abort + // conversion. + int trt_axis; + TF_RETURN_IF_ERROR(ConvertAxis(tf_axis, dims.nbDims, node_def.name(), + params->use_implicit_batch, &trt_axis)); + // Make sure target dimension is size 1 or unknown size (-1) + if (input_dims[trt_axis] != -1 && input_dims[trt_axis] != 1) { + return errors::InvalidArgument( + "Dimension ", tf_axis, " with size ", input_dims[trt_axis], + " cannot be squeezed because it must be size 1, at ", + node_def.name()); + } + trt_axes.push_back(trt_axis); + } + // Mark axes to remove by setting them to 0. + for (int axis : trt_axes) { + input_dims[axis] = 0; } - trt_axes.push_back(trt_axis); } if (params->validation_only) return Status::OK(); nvinfer1::ITensor* output_tensor = nullptr; TF_RETURN_IF_ERROR(params->converter->SqueezeTensor( - input_tensor.tensor(), trt_axes, &output_tensor)); + input_tensor.tensor(), &input_dims, &output_tensor)); params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); return Status::OK(); } @@ -4604,6 +4633,42 @@ Status ConvertUnpack(OpConverterParams* params) { return ConvertSplitHelper(params, inputs.at(0), tf_axis, num, true); } +// Supports cast fp16=>fp32 through IIdentityLayer. 
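Before the Cast converter defined just below, note the new `ConvertSqueeze` branch added above: with an empty `squeeze_dims`, explicit batch mode, and a fully static shape, every size-1 dimension is marked with 0 and then dropped by `SqueezeTensor`. A hedged Python mirror of that mark-then-remove step; the name and values are illustrative.

```python
def squeeze_all_singletons(input_dims):
  # Mark every singleton dimension for removal by setting it to 0 ...
  marked = [0 if dim == 1 else dim for dim in input_dims]
  # ... then drop the marked entries, as SqueezeTensor does after the marking.
  return [dim for dim in marked if dim != 0]


assert squeeze_all_singletons([2, 1, 3, 1]) == [2, 3]
```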
+Status ConvertCast(OpConverterParams* params) { + const NodeDef& node_def = params->node_def; + TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"x", false}})); + auto unsupport_cast_error = [&]() { + return errors::Unimplemented("Cast op: ", node_def.op(), + " not supported at: ", node_def.name()); + }; + + DataType input_type; + TF_RETURN_IF_ERROR(GetInputTfType(*params, &input_type, 0)); + if (input_type != DataType::DT_HALF) { + return unsupport_cast_error(); + } + + DataType output_type; + TF_RETURN_IF_ERROR(GetOutputTfType(*params, &output_type)); + if (output_type != DataType::DT_FLOAT) { + return unsupport_cast_error(); + } + + if (params->validation_only) return Status::OK(); + + nvinfer1::ITensor* input = params->inputs.at(0).tensor(); + nvinfer1::IIdentityLayer* layer = + params->converter->network()->addIdentity(*input); + layer->setPrecision(nvinfer1::DataType::kFLOAT); + + if (layer->getOutput(0)->getType() != nvinfer1::DataType::kFLOAT) { + return errors::Internal("IIdentityLayer doesn't work as expected"); + } + + params->outputs->push_back(TRT_TensorOrWeights(layer->getOutput(0))); + return Status::OK(); +} + Status ConvertConcat(OpConverterParams* params) { const auto& inputs = params->inputs; const auto& node_def = params->node_def; @@ -5681,6 +5746,7 @@ static void RegisterValidatableOpConverters( (*registration)["CombinedNonMaxSuppression"] = ConvertCombinedNMS; #endif (*registration)["AddN"] = ConvertAddN; + (*registration)["Cast"] = ConvertCast; (*registration)["ConcatV2"] = ConvertConcat; (*registration)["Const"] = ConvertConst; (*registration)["Conv2D"] = ConvertConv2D; diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h index 8608c8226ee..2fe8eec9675 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h @@ -294,6 +294,8 @@ class TRT_TensorOrWeights { nvinfer1::Dims GetTrtDims() const; + Status GetTfType(DataType* tf_type) const; + int batch_size() const { return batch_size_; } string DebugString() const; @@ -529,11 +531,9 @@ class Converter { // Helper function to add a squeeze op to the network. // - // The trt_axes argument lists those axes that need to be squeezed. Each axis - // in the list is numbered according to TRT convention (see ConvertAxis for - // details). - Status SqueezeTensor(nvinfer1::ITensor* input, - const std::vector& trt_axes, + // The input_dims argument stores the TRT dimensions of the input tensor, + // where the dimensions to be squeezed are replaced by 0. + Status SqueezeTensor(nvinfer1::ITensor* input, std::vector* input_dims, nvinfer1::ITensor** output); // Creates an IConstantLayer using 'weights' whose dimensions are specified by diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc index e9e3333ea38..d4badd1cc03 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc @@ -15,17 +15,25 @@ limitations under the License. 
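// Illustrative sketch (editorial addition, not part of this change): the new
// ConvertCast above handles exactly one direction, fp16 -> fp32, by routing
// the tensor through an IIdentityLayer whose output precision is forced to
// kFLOAT. The core pattern, condensed (assumes an existing
// nvinfer1::INetworkDefinition* and "third_party/tensorrt/NvInfer.h"; the
// function name is illustrative):
nvinfer1::ITensor* CastHalfToFloat(nvinfer1::INetworkDefinition* network,
                                   nvinfer1::ITensor* half_input) {
  nvinfer1::IIdentityLayer* layer = network->addIdentity(*half_input);
  // With a precision override, the identity layer acts as a type cast.
  layer->setPrecision(nvinfer1::DataType::kFLOAT);
  return layer->getOutput(0);
}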
#include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h" +#include +#include #include #include #include +#if GOOGLE_CUDA +#if GOOGLE_TENSORRT + #include #include +#include "absl/algorithm/container.h" #include "absl/strings/match.h" #include "absl/strings/numbers.h" #include "absl/strings/str_cat.h" #include "absl/strings/string_view.h" #include "absl/types/span.h" +#include "third_party/gpus/cuda/include/cuda.h" +#include "third_party/gpus/cuda/include/cuda_runtime_api.h" #include "tensorflow/cc/framework/ops.h" #include "tensorflow/cc/framework/scope.h" #include "tensorflow/cc/ops/nn_ops_internal.h" @@ -33,6 +41,8 @@ limitations under the License. #include "tensorflow/compiler/tf2tensorrt/convert/utils.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h" +#include "tensorflow/core/common_runtime/gpu/gpu_managed_allocator.h" +#include "tensorflow/core/framework/allocator.h" #include "tensorflow/core/framework/node_def.pb.h" // NOLINT #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor.pb.h" // NOLINT @@ -48,11 +58,6 @@ limitations under the License. #include "tensorflow/core/platform/test.h" #include "tensorflow/core/protobuf/config.pb.h" // NOLINT #include "tensorflow/core/public/session.h" - -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT -#include "third_party/gpus/cuda/include/cuda.h" -#include "third_party/gpus/cuda/include/cuda_runtime_api.h" #include "third_party/tensorrt/NvInfer.h" namespace tensorflow { @@ -62,7 +67,42 @@ namespace convert { using absl::StrCat; using ::testing::ElementsAre; using ::testing::ElementsAreArray; -using ::testing::NanSensitiveFloatNear; +using ::testing::Matcher; + +// TensorRT modes for testing. We define the following three modes: +// 1. Implicit batch mode: The tensors have static (known) input shape and the +// the batch dimension (first dim) is removed from the TRT tensor shape. In +// a loose notation: trt_shape = tf_shape[1:]. This is the standard mode of +// a TensorRT network definition before TensorRT 6. +// 2. Explicit batch mode: static (known) input shape, but the batch dimension +// is part of the trt tensor shape. (trt_shape = tf_shape) +// 3. Dynamic shape mode allows unknown input shapes, and requires explicit +// batch size definition (trt_shape = tf_shape). +// +// Note that the Converter only distinguishes between two modes: +// - use_implicit_batch == true, this corresponds to kImplicitBatch, +// - use_implicit_batch == false which includes both kExplicitBatch and +// kDynamicShape. +// +// For the converter, the distinction between explicit batch or dynamic shape +// mode follows from the input tensors of the network: dynamic shape input +// implies dynamic shape mode, while static shape input tensors imply explicit +// batch mode. We want to test all these modes, therefore we define the +// TrtTestMode with the following three options. +enum class TrtTestMode { + kImplicitBatch = 0, + kExplicitBatch = 1, + kDynamicShape = 2 +}; + +#if IS_TRT_VERSION_GE(6, 0, 0, 0) +constexpr std::array ValidTrtModes = { + TrtTestMode::kImplicitBatch, TrtTestMode::kExplicitBatch, + TrtTestMode::kDynamicShape}; +#else +constexpr std::array ValidTrtModes = { + TrtTestMode::kImplicitBatch}; +#endif // TODO(laigd): put this into some test utils file. 
void ExpectStatus(Status status, error::Code code = error::OK, @@ -84,30 +124,29 @@ nvinfer1::Dims GetTestDims(const std::vector& d) { return dims; } -nvinfer1::DataType TfDataTypeToTrt(DataType tf_dtype) { - switch (tf_dtype) { - case DT_FLOAT: - return nvinfer1::DataType::kFLOAT; - case DT_HALF: - return nvinfer1::DataType::kHALF; - case DT_INT32: - return nvinfer1::DataType::kINT32; - default: - QCHECK(false) << "Unexpected data type " << DataTypeString(tf_dtype); +// Prints the vector to the output stream. +template +std::ostream& operator<<(std::ostream& os, const std::vector& v) { + if (!v.empty()) { + os << '['; + std::copy(v.begin(), v.end(), std::ostream_iterator(os, ", ")); + os << "\b\b]"; } + return os; } -DataType TrtDataTypeToTf(nvinfer1::DataType trt_dtype) { - switch (trt_dtype) { - case nvinfer1::DataType::kFLOAT: - return DT_FLOAT; - case nvinfer1::DataType::kHALF: - return DT_HALF; - case nvinfer1::DataType::kINT32: - return DT_INT32; - default: - QCHECK(false) << "Unexpected data type " << static_cast(trt_dtype); - } +nvinfer1::DataType TfDataTypeToTrt(DataType tf_type) { + nvinfer1::DataType trt_type; + Status status = TfTypeToTrtType(tf_type, &trt_type); + EXPECT_EQ(status, Status::OK()); + return trt_type; +} + +DataType TrtDataTypeToTf(nvinfer1::DataType trt_type) { + DataType tf_type; + Status status = TrtTypeToTfType(trt_type, &tf_type); + EXPECT_EQ(status, Status::OK()); + return tf_type; } NodeDef MakeNodeDef(const string& name, const string& op, @@ -165,6 +204,24 @@ void ExpectTrtDimsEqualsArray(const std::vector& lhs, << " actual: " << DebugString(rhs); } +Matcher> ArrayFloatNear(const std::vector& values, + float max_abs_error = 1e-5, + bool nan_sensitive = false) { + std::vector> matchers; + matchers.reserve(values.size()); + for (const float& v : values) { + if (nan_sensitive) { + matchers.emplace_back(::testing::NanSensitiveFloatNear(v, max_abs_error)); + } else if (max_abs_error == 0) { + matchers.emplace_back(::testing::FloatEq(v)); + } else { + EXPECT_GE(max_abs_error, 0); + matchers.emplace_back(::testing::FloatNear(v, max_abs_error)); + } + } + return ElementsAreArray(matchers); +} + template void ExpectArrayNear(const std::vector& lhs, absl::Span rhs) { ASSERT_EQ(lhs.size(), rhs.size()); @@ -242,7 +299,8 @@ struct StaticCaster { }; template -std::vector CastTestVector(const std::vector& vals) { +std::vector CastTestVector( + const gtl::ArraySlice& vals) { // non-absl ok std::vector res(vals.size()); std::transform(vals.begin(), vals.end(), res.begin(), StaticCaster()); @@ -1215,10 +1273,15 @@ TEST_F(ConvertGraphDefToEngineTest, IdentityGraph) { TF_EXPECT_OK(RunConvertGraphDefToEngine(&s)); } -template -Tensor ConstructTensor(int data_size, const T& value = T()) { - std::vector values(data_size, value); - return test::AsTensor(values); +// Returns a vector of shapes from a vector of input tensors. This can be used +// to create optimization profiles. 
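// Illustrative usage (editorial addition, not part of this change) of the
// ArrayFloatNear() matcher defined above: it wraps each expected value in
// FloatNear/NanSensitiveFloatNear and is consumed through EXPECT_THAT, which
// is how the parameterized tests below compare engine outputs. Assumes
// <gtest/gtest.h>, <gmock/gmock.h> and <vector>; the test name is
// illustrative:
TEST(ArrayFloatNearUsage, MatchesWithinTolerance) {
  std::vector<float> actual = {1.0f, 2.00004f, 3.0f};
  EXPECT_THAT(actual,
              ArrayFloatNear({1.0f, 2.0f, 3.0f}, /*max_abs_error=*/1e-4));
}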
+Status GetShapeFromDataVec(DataVec input_data, + std::vector* shape_vec) { + shape_vec->reserve(input_data.size()); + std::transform(input_data.begin(), input_data.end(), + std::back_inserter(*shape_vec), + [](InputOutputData x) { return x.tensor.shape(); }); + return Status::OK(); } template @@ -1227,11 +1290,27 @@ inline absl::Span GetSpanForData(const InputOutputData& data) { return absl::Span(tensor_map.data(), tensor_map.size()); } +std::vector GetDataAsFloat(InputOutputData& data) { + if (data.tensor.dtype() == DT_FLOAT) { + auto span = GetSpanForData(data); + return std::vector(span.begin(), span.end()); + } + if (data.tensor.dtype() == DT_HALF) { + return CastTestVector( + GetSpanForData(data)); + } + if (data.tensor.dtype() == DT_INT32) { + return CastTestVector(GetSpanForData(data)); + } + LOG(FATAL) << "DataType not supported for testing " + << DataTypeString(data.tensor.dtype()); +} // Class to test various op converters, using both a TrtNodeValidator and // Converter. class OpConverterTest : public ::testing::Test { public: - OpConverterTest() : scope_(Scope::NewRootScope()) { + OpConverterTest() + : scope_(Scope::NewRootScope()), allocator_(new GpuManagedAllocator()) { QCHECK_EQ(0, cudaStreamCreate(&stream_)); Reset(); } @@ -1242,22 +1321,84 @@ class OpConverterTest : public ::testing::Test { return converter_->GetTensorOrWeights(name, output); } - void Reset() { + void Reset(TrtPrecisionMode precision_mode_to_test = TrtPrecisionMode::FP32, + TrtTestMode trt_mode = TrtTestMode::kImplicitBatch) { // Destroy existing TRT objects in a proper order. converter_.reset(nullptr); engine_.reset(nullptr); // Re-create them in proper order. converter_ = - std::move(Converter::Create(precision_mode_to_test_, + std::move(Converter::Create(precision_mode_to_test, /*use_calibration=*/false, &logger_, - /*use_implicit_batch=*/true) + /*use_implicit_batch=*/trt_mode == + TrtTestMode::kImplicitBatch) .ValueOrDie()); // Reset other related artifacts. scope_ = Scope::NewRootScope(); } + // Constructs a flat tensor with 'vals' in Unified Memory. + template + Tensor AsTensor(gtl::ArraySlice vals) { // non-absl ok + Tensor ret(allocator_.get(), DataTypeToEnum::value, + {static_cast(vals.size())}); + std::copy_n(vals.data(), vals.size(), ret.flat().data()); + return ret; + } + + // Constructs a tensor of "shape" with values "vals" in Unified Memory. + template + Tensor AsTensor(gtl::ArraySlice vals, // non-absl ok + const TensorShape& shape) { + Tensor ret(allocator_.get(), DataTypeToEnum::value, + {static_cast(vals.size())}); + CHECK(ret.CopyFrom(AsTensor(vals), shape)); + return ret; + } + + // Constructs a tensor with given values (vals). The tensor type is defined by + // the tf_dtype argument, its shape is given by input_dims. The tensor is + // constructed using the allocator of OpConverterTest in Unified Memory. 
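// Editorial note: allocator_ is a GpuManagedAllocator, so tensors built
// through these AsTensor() helpers live in CUDA unified memory and can be
// bound to the TRT engine directly; this is what lets BuildAndRun further
// down drop its explicit cudaMalloc/cudaMemcpy bookkeeping. A minimal
// stand-alone version of the pattern (illustrative name; assumes the TF
// headers already included in this file plus <algorithm> and <vector>):
Tensor MakeUnifiedMemoryFloatTensor(Allocator* allocator,
                                    const std::vector<float>& vals) {
  Tensor t(allocator, DT_FLOAT, {static_cast<int64>(vals.size())});
  std::copy_n(vals.data(), vals.size(), t.flat<float>().data());
  return t;  // Backed by managed memory: readable from both host and device.
}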
+ template + Tensor AsTensor(std::vector vals, const std::vector input_dims, + DataType tf_dtype) { + Tensor ret(allocator_.get(), tf_dtype, {static_cast(vals.size())}); + if (tf_dtype == DT_FLOAT) { + auto conv_vals = CastTestVector(vals); + std::copy_n(conv_vals.data(), conv_vals.size(), ret.flat().data()); + } else if (tf_dtype == DT_HALF) { + auto conv_vals = CastTestVector(vals); + std::copy_n(conv_vals.data(), conv_vals.size(), + ret.flat().data()); + } else if (tf_dtype == DT_INT32) { + auto conv_vals = CastTestVector(vals); + std::copy_n(conv_vals.data(), conv_vals.size(), ret.flat().data()); + } else { + LOG(FATAL) << "Cannot create tensor with type " + << DataTypeString(tf_dtype); + } + TensorShape shape; + TF_EXPECT_OK(TensorShapeUtils::MakeShape(input_dims, &shape)); + CHECK(ret.CopyFrom(ret, shape)); + return ret; + } + + // Constructs a flat tensor in Unified Memory. + template + Tensor ConstructTensor(int data_size, const T& value = T()) { + std::vector values(data_size, value); + return AsTensor(values); + } + + // Constructs a flat tensor in Unified Memory. + template + Tensor ConstructTensor(int data_size, const T& value, DataType tf_dtype) { + std::vector values(data_size, value); + return AsTensor(values, {data_size}, tf_dtype); + } + void CheckDataTypeMatches(const DataVec& datas) { for (const auto& data : datas) { const int input_index = engine_->getBindingIndex(data.name.c_str()); @@ -1271,27 +1412,35 @@ class OpConverterTest : public ::testing::Test { } } - // TODO(laigd): test fp16 and int8 support for more converters. - void BuildAndRun(const DataVec& input_data, DataVec* output_data, - TrtPrecisionMode precision_mode = TrtPrecisionMode::FP32, - const int batch_size = 1) { + Status BuildAndRun(const DataVec& input_data, DataVec* output_data, + const int batch_size = 1) { // Mark the output tensor as TRT engine output. std::vector output_info; for (const auto& data : *output_data) { output_info.push_back( {data.name, data.name, TfDataTypeToTrt(data.tensor.dtype())}); } - TF_EXPECT_OK(converter_->RenameAndMarkOutputTensors(output_info)); + TF_RETURN_IF_ERROR(converter_->RenameAndMarkOutputTensors(output_info)); // Build the TRT engine. 
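// Editorial note: BuildAndRun() now reports failures through its Status
// return value instead of ASSERT_*/TF_ASSERT_OK, which is what allows the
// parameterized tests below to expect a *runtime* error (see
// TestParamBase::runtime_status). Hypothetical call sites, mirroring the
// usage later in this file:
//
//   // Success remains the common case:
//   TF_EXPECT_OK(test->BuildAndRun(input_data, &output_data));
//
//   // ...but an expected failure can now be checked without aborting:
//   Status build_status = BuildAndRun(input_data_, &output_data, batch_size);
//   ASSERT_EQ(expected_runtime_status, build_status);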
- ASSERT_EQ(nullptr, engine_.get()); - TF_ASSERT_OK( + if (engine_.get() != nullptr) { + return errors::Internal("Engine already exists"); + } + TrtShapeOptimizationProfile profiles; + if (!converter_->use_implicit_batch()) { + // Create a single optimization profile for explicit batch mode + std::vector input_shapes; + TF_RETURN_IF_ERROR(GetShapeFromDataVec(input_data, &input_shapes)); + profiles.AddShape(input_shapes); + profiles.InitProfiles(); + } + TF_RETURN_IF_ERROR( converter_->BuildCudaEngine(&engine_, /*max_batch_size=*/batch_size, /*max_workspace_size_bytes=*/1 << 26, /*allocator=*/nullptr, /*calibrator=*/nullptr, - /*profiles=*/nullptr)); + /*profiles=*/&profiles)); CHECK_NOTNULL(engine_.get()); CheckDataTypeMatches(input_data); CheckDataTypeMatches(*output_data); @@ -1299,65 +1448,29 @@ class OpConverterTest : public ::testing::Test { const int num_bindings = input_data.size() + output_data->size(); std::vector buffers(num_bindings); - ASSERT_EQ(engine_->getNbBindings(), num_bindings); + if (engine_->getNbBindings() != num_bindings) { + return errors::Internal("Number of bindings do not match"); + } + // Since we have only 1 optimization profile (which is enabled by default) + // it is fine to create execution context directly, instead of calling + // profiles.CreateExecutionContexts() TrtUniquePtrType execution_context( engine_->createExecutionContext()); // Prepare input bindings. - TF_ASSERT_OK(SetTrtEngineInputs(engine_.get(), execution_context.get(), 0, - buffers, converter_->use_implicit_batch(), - batch_size, nullptr, &input_data)); - + TF_RETURN_IF_ERROR(SetTrtEngineInputs( + engine_.get(), execution_context.get(), 0, buffers, + converter_->use_implicit_batch(), batch_size, nullptr, &input_data)); // Prepare output bindings. - TF_ASSERT_OK(SetTrtEngineOutputs(engine_.get(), execution_context.get(), 0, - buffers, converter_->use_implicit_batch(), - batch_size, nullptr, output_data)); - - // Allocate buffers on GPU and copy data there. This is necessary because - // the test tensors are allocated in host memory, so the pointers that - // SetTrtEngin(In|Out)puts placed into buffers[] cannot be used on the GPU. - // We allocate the GPU buffers, copy the data there, and overwrite the - // addresses in the buffers array. - // - // TODO(tfeher): This step can be avoided if we allocate the Tensors in - // unified memory. - for (const auto& data : input_data) { - const int input_index = engine_->getBindingIndex(data.name.c_str()); - ASSERT_NE(-1, input_index); - ASSERT_EQ(0, cudaMalloc(&buffers[input_index], data.TotalBytes())); - ASSERT_EQ(0, cudaMemcpyAsync(buffers[input_index], data.Buffer(), - data.TotalBytes(), cudaMemcpyHostToDevice, - stream_)); - } - struct SizeAndIndex { - SizeAndIndex(int in_size, int in_index) - : size(in_size), index(in_index) {} - int size; - int index; - }; - std::vector output_infos; - for (const auto& data : *output_data) { - const int output_index = engine_->getBindingIndex(data.name.c_str()); - ASSERT_NE(-1, output_index); - output_infos.emplace_back(data.TotalBytes(), output_index); - ASSERT_EQ(0, cudaMalloc(&buffers[output_index], data.TotalBytes())); - } - + TF_RETURN_IF_ERROR(SetTrtEngineOutputs( + engine_.get(), execution_context.get(), 0, buffers, + converter_->use_implicit_batch(), batch_size, nullptr, output_data)); // Execute the TRT engine. 
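// Editorial summary (not part of this change): for explicit-batch and
// dynamic-shape engines the code above derives one optimization profile
// straight from the concrete test inputs. The essential flow, condensed to
// calls that appear in this patch (variable names illustrative):
//
//   TrtShapeOptimizationProfile profiles;
//   std::vector<TensorShape> input_shapes;
//   TF_RETURN_IF_ERROR(GetShapeFromDataVec(input_data, &input_shapes));
//   profiles.AddShape(input_shapes);  // record the concrete input shapes
//   profiles.InitProfiles();
//   // ...then pass &profiles to Converter::BuildCudaEngine().
//
// Because only that single profile exists (and profile 0 is enabled by
// default), the test can create the IExecutionContext directly afterwards.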
- TF_ASSERT_OK(TrtEnqueue(execution_context.get(), buffers, stream_, - converter_->use_implicit_batch(), batch_size)); - - for (int i = 0; i < output_infos.size(); ++i) { - const auto& output_info = output_infos[i]; - ASSERT_EQ(0, cudaMemcpyAsync(output_data->at(i).Buffer(), - buffers[output_info.index], output_info.size, - cudaMemcpyDeviceToHost, stream_)); - } + TF_RETURN_IF_ERROR(TrtEnqueue(execution_context.get(), buffers, stream_, + converter_->use_implicit_batch(), + batch_size)); cudaStreamSynchronize(stream_); - - for (int i = 0; i < num_bindings; ++i) { - ASSERT_EQ(0, cudaFree(buffers[i])); - } + return Status::OK(); } bool HasStaticShape(const nvinfer1::Dims& dims) const { @@ -1368,22 +1481,46 @@ class OpConverterTest : public ::testing::Test { return true; } - // Add ITensor for both validation and conversion. - void AddTestTensor( - const string& name, const std::vector& dims, int batch_size = 1, + bool HasStaticShape(std::vector dims) const { + return !absl::c_any_of(dims, [](int i) { return i < 0; }); + } + + // Adds ITensor for both validation and conversion, assuming explicit batch + // dimension is included in dims (ie for an NCHW tensor dims = {N, C, H, W}). + void AddTestTensorWithTFDims( + const string& name, const std::vector& dims, nvinfer1::DataType trt_dtype = nvinfer1::DataType::kFLOAT) { DataType tf_dtype = TrtDataTypeToTf(trt_dtype); ops::Placeholder::Attrs attrs; TF_EXPECT_OK(TensorShapeUtils::MakeShape(dims, &attrs.shape_)); - attrs.shape_.InsertDim(0, batch_size); + auto input = ops::Placeholder(scope_.WithOpName(name), tf_dtype, attrs); node_inputs_[name] = input.output; // Add a real ITensor for conversion conditionally. - const nvinfer1::Dims trt_dims = GetTestDims(dims); - if (HasStaticShape(trt_dims)) { + const nvinfer1::Dims trt_dims = + TensorShapeToTrtDims(attrs.shape_, converter_->use_implicit_batch()); + if (!converter_->use_implicit_batch() || HasStaticShape(trt_dims)) { + int batch_size = dims[0]; TF_EXPECT_OK( converter_->AddInputTensor(name, trt_dtype, trt_dims, batch_size)); + } + } + + // Adds ITensor for both validation and conversion. The difference compared to + // AddTestTensorWithTFDims is in the meaning of the dims parameter. To define + // a tensor with NCHW shape, here we set dims = {C,H,W} and batch_size = N. + // TODO(tfeher) remove this function once all test are updated to use the + // other version of AddTestTensor (defined by + // ParameterizedOpConverterTestBase). + void AddTestTensor( + const string& name, const std::vector& dims, int batch_size = 1, + nvinfer1::DataType trt_dtype = nvinfer1::DataType::kFLOAT) { + std::vector dims_with_batch(dims.size() + 1); + dims_with_batch[0] = batch_size; + std::copy(dims.begin(), dims.end(), dims_with_batch.begin() + 1); + AddTestTensorWithTFDims(name, dims_with_batch, trt_dtype); + if (HasStaticShape(dims)) { ASSERT_EQ(batch_size, converter_->batch_size_); } } @@ -1395,7 +1532,7 @@ class OpConverterTest : public ::testing::Test { // Add weights for validation. TensorShape shape; TF_EXPECT_OK(TensorShapeUtils::MakeShape(dims, &shape)); - Tensor t = test::AsTensor(values, shape); + Tensor t = AsTensor(values, shape); node_inputs_[name] = ops::Const(scope_.WithOpName(name), t); // Add weights for conversion. 
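// Editorial note: the two helpers above differ only in whether the batch
// dimension is part of `dims`. AddTestTensorWithTFDims takes the full TF
// shape, while the legacy AddTestTensor takes the shape without the batch
// plus a separate batch_size. Equivalent calls for an NCHW input of shape
// [2, 3, 4, 5] (values illustrative):
//
//   AddTestTensorWithTFDims("input", /*dims=*/{2, 3, 4, 5});
//   AddTestTensor("input", /*dims=*/{3, 4, 5}, /*batch_size=*/2);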
@@ -1415,6 +1552,21 @@ class OpConverterTest : public ::testing::Test { converter_->AddTensorOrWeights(name, TRT_TensorOrWeights{weights})); } + template + void AddTestWeights(const string& name, const std::vector& dims, + const std::vector& values, DataType tf_dtype) { + if (tf_dtype == DT_FLOAT) { + AddTestWeights(name, dims, CastTestVector(values)); + } else if (tf_dtype == DT_HALF) { + AddTestWeights(name, dims, CastTestVector(values)); + } else if (tf_dtype == DT_INT32) { + AddTestWeights(name, dims, CastTestVector(values)); + } else { + FAIL() << "Cannot create test weights with type " + << DataTypeString(tf_dtype); + } + } + // Test validation in validation-only mode. void RunValidation(const Node* node, error::Code expected_code = error::OK, const char* expected_msg_substr = nullptr) { @@ -1423,9 +1575,9 @@ class OpConverterTest : public ::testing::Test { grappler::GraphProperties graph_properties(item); TF_EXPECT_OK(graph_properties.InferStatically(true)); - TrtNodeValidator validator(graph_properties, precision_mode_to_test_, + TrtNodeValidator validator(graph_properties, converter_->precision_mode(), /*use_calibration=*/false, - /*use_implicit_batch=*/true); + converter_->use_implicit_batch()); ExpectStatus(validator.IsTensorRTCandidate(node), expected_code, expected_msg_substr); } @@ -1464,6 +1616,33 @@ class OpConverterTest : public ::testing::Test { } } + // Helper method to run both validation and conversion, and check the output + // shape. + void RunValidationAndConversion(const NodeDef& node_def, const Status& status, + const char* output_name, + const std::vector& exp_out_dims) { + RunValidationAndConversion(node_def, status.code(), + status.error_message().c_str(), true); + if (status.ok()) { + TRT_TensorOrWeights output; + TF_EXPECT_OK(GetTensorOrWeights(output_name, &output)); + ASSERT_TRUE(output.is_tensor()); + if (converter_->use_implicit_batch() && !exp_out_dims.empty()) { + // We only check output shape implicit batch mode. In dynamic shape + // mode we need to wait for the concrate input shapes to be defined + // (by setBindingDimensions before enqueue) before we can check + // whether the output dims are equal. + // + // TODO(tamas) enable this check in explicit_batch_mode + + // Removing batch dim + auto out_dims = + std::vector(exp_out_dims.begin() + 1, exp_out_dims.end()); + ExpectTrtDimsEqualsArray(out_dims, output.tensor()->getDimensions()); + } + } + } + // Expose quantization_ranges_ for tests std::unordered_map& quantization_ranges() { return converter_->quantization_ranges_; @@ -1474,10 +1653,6 @@ class OpConverterTest : public ::testing::Test { } std::unique_ptr converter_; - protected: - // TODO(laigd): parameterize the test and make the precision mode a parameter. - TrtPrecisionMode precision_mode_to_test_ = TrtPrecisionMode::FP32; - private: Logger logger_; TrtUniquePtrType engine_; @@ -1488,8 +1663,205 @@ class OpConverterTest : public ::testing::Test { // GraphProperties. Scope scope_; std::unordered_map node_inputs_; + std::unique_ptr allocator_; }; +// General test parameters to be used with ops that take a single input tensor. +struct TestParamBase { + // Concrete input dimensions for the test (including the batch dim) + std::vector input_dims; + + // Dimensions to define an input with PartialTensorShape. This can be used to + // define networks with dynamic input shape. It can be left empty, in that + // case AddTestTensor sets partial shapes that are appropriate to TrtTestMode. 
+ std::vector partial_input_dims; + + // Concrete (static) output dimensions, including batch size as first dim + std::vector expected_output_dims; + + // Parameter vector, has converter specific meaning. + std::vector param; + + // Expected status of conversion (with concrete error message) + Status status; + + // Expected status of BuildAndRun + Status runtime_status; +}; + +std::ostream& operator<<(std::ostream& os, const TestParamBase& p) { + os << "input_dims" << p.input_dims; + if (!p.partial_input_dims.empty()) { + os << ", partial_input_dims" << p.partial_input_dims; + } + if (!p.expected_output_dims.empty()) { + os << ", exp_out_dims" << p.expected_output_dims; + } + if (!p.param.empty()) { + os << ", param" << p.param; + } + os << ", " << p.status; + return os; +} + +// Parameterized version of OpConverterTest. We have the following parameters: +// 1. TrtTestMode: implicit batch, explicit batch, dynamic shape modes +// 2. DataType of the input TF tensors: DT_FLOAT, DT_HALF, DT_INT32 +// 3. TrtPrecisionMode argument for the Converter: FP32, FP16, INT8 +// We will introduce subclasses that will be instantiated using different +// combinations of the DataType and TrtPrecisionMode parameters. +class ParameterizedOpConverterTestBase + : public OpConverterTest, + public ::testing::WithParamInterface< + std::tuple> { + public: + ParameterizedOpConverterTestBase() + : trt_mode(std::get<0>(GetParam())), + tf_dtype(std::get<1>(GetParam())), + converter_precision(std::get<2>(GetParam())) {} + + void Reset() { + OpConverterTest::Reset(converter_precision, trt_mode); + input_data_.clear(); + } + + // Adds an input ITensor for TRT network. Also creates the corresponding TF + // tensor, and stores it in the list of inputs (input_data_). + // + // The TF tensor is always created with concrete static input shape given by + // dims. The ITensor can have static or dynamic shape based on the trt_mode + // attribute. The ITensor shape is set automatically according to the trt_mode + // parameter, unless the user overrides it with an explicit + // partial_input_shape_dims argument. + // + // Parameters: + // - name of the input node + // - dims actual dimensions of the tensor that we will use during the test + // (including explicit batch dim) + // - values initial values for the TF tensor + // - dtype data type of the tensor + // - partial_input_shape dimensions which can incude unknown shapes. This can + // be empty, in that case the partial_input_shape will be set automatically + // depending on the trt_mode argument. (This argument also includes explicit + // batch dim). + // + template + void AddTestTensor(const string& name, const std::vector& dims, + DataType tf_dtype, const std::vector& values, + const std::vector& partial_input_shape_dims = {}) { + std::vector partial_shape; + if (!partial_input_shape_dims.empty()) { + partial_shape = partial_input_shape_dims; + } else { + if (trt_mode == TrtTestMode::kDynamicShape) { + // In dynamic shape mode we make all dims unknown. + partial_shape = std::vector(dims.size(), -1); + } else { + // Use static (known) input shapes. 
+ partial_shape = dims; + } + } + AddTestTensorWithTFDims(name, partial_shape, TfDataTypeToTrt(tf_dtype)); + if (!values.empty()) { + VLOG(2) << "Adding test tensor: " << name << " " + << DataTypeString(tf_dtype); + InputOutputData data{name, AsTensor(values, dims, tf_dtype)}; + VLOG(2) << "Added tensor: " << data.name + << DataTypeString(data.tensor.dtype()); + input_data_.push_back(data); + } + } + + // Adds test tensor (same as above) but with the default tf_dtype defined by + // the test params. + void AddTestTensor(const string& name, const std::vector& dims, + const std::vector& values = {}, + const std::vector& partial_input_shape_dims = {}) { + AddTestTensor(name, dims, tf_dtype, values, + partial_input_shape_dims); + } + + // Builds and runs the converted network. Checks output tensor shape. Tests + // output values using a matcher. The network can have multiple input and + // output tensors. The inputs are defined by the input_data_ member variable. + void BuildAndRun(const string& name, + const std::vector>& expected_output_dims, + const Status& expected_runtime_status, + const std::vector>>& matcher) { + TensorShape shape; + const int n_output = expected_output_dims.size(); + ASSERT_EQ(n_output, matcher.size()); + DataVec output_data; + for (int i = 0; i < n_output; i++) { + TF_EXPECT_OK( + TensorShapeUtils::MakeShape(expected_output_dims[i], &shape)); + string out_name = (n_output == 1) ? name : StrCat(name, ":", i); + InputOutputData data{out_name, + ConstructTensor(shape.num_elements(), 0, tf_dtype)}; + output_data.push_back(data); + } + ASSERT_FALSE(input_data_.empty()); + const int batch_size = input_data_[0].tensor.shape().dim_size(0); + Status stat = + OpConverterTest::BuildAndRun(input_data_, &output_data, batch_size); + ASSERT_EQ(expected_runtime_status, stat); + if (expected_runtime_status.ok() && stat.ok()) { + for (int i = 0; i < n_output; i++) { + // Check the shape of the actual output tensors + TF_EXPECT_OK( + TensorShapeUtils::MakeShape(expected_output_dims[i], &shape)); + EXPECT_TRUE(output_data[i].tensor.shape() == shape) + << "Expected shape: " << shape.DebugString() << ", actual shape" + << output_data[i].tensor.shape().DebugString(); + EXPECT_THAT(GetDataAsFloat(output_data[i]), matcher[i]); + } + } + } + + // Runs validation and conversion. If conversion is successfull then builds + // the TRT network, executes it and checks the output. + void TestOpConverter(const string& name, const NodeDef node_def, + const std::vector& expected_output_dims, + const Status& expected_conversion_status, + const Status& expected_runtime_status, + const Matcher>& matcher) { + RunValidationAndConversion(node_def, expected_conversion_status, + name.c_str(), expected_output_dims); + if (expected_conversion_status.ok()) { + BuildAndRun(name, std::vector>({expected_output_dims}), + expected_runtime_status, + std::vector>>({matcher})); + } + } + + protected: + const TrtTestMode trt_mode; + const DataType tf_dtype; + const TrtPrecisionMode converter_precision; + DataVec input_data_; +}; + +// Op converter test in FP32 mode. While for debugging purposes it might make +// sense to run over all possible combinations, normally a subset of them +// would be sufficient: +// - All valid options to TrtTestMode (implicit, explicit, dynamic shape) +// - DataType: is the TF data type of the input tensors. This usually only +// influences the data type added by Converter::AddInputTensor. 
We test the +// valid combinations of input data types in AddAndGetInputs, therefore +// for most of the OpConverterTest its is sufficient to test for DT_FLOAT. +// - TrtPrecisionMode: valid options are FP32, FP16 and INT8. This influences +// how TRT handles the precision inside the TRT network, but should not matter +// for the TF -> TRT conversion. Therefore it should be sufficient to test +// for FP32. +class OpConverterTest1 : public ParameterizedOpConverterTestBase {}; + +// Instantiate parameter combinations to OpConverterTest1 +INSTANTIATE_TEST_CASE_P( + OpConvTestInstantiation, OpConverterTest1, + ::testing::Combine(::testing::ValuesIn(ValidTrtModes), + ::testing::Values(DT_FLOAT), + ::testing::Values(TrtPrecisionMode::FP32))); + template void CopyTensorElements(const Tensor& tensor, protobuf::RepeatedField* out) { out->Clear(); @@ -1564,13 +1936,13 @@ void TestConvertConst(OpConverterTest* test) { reset_and_test(t, true, {1}, {12}); } { - Tensor t = test::AsTensor({1, 2}); + Tensor t = test->AsTensor({1, 2}); reset_and_test(t, false, {2}, {1, 2}); reset_and_test(t, true, {2}, {1, 2}); } { Tensor t = - test::AsTensor({1, 2, 3, 4, 5, 6}, TensorShape({2, 3})); + test->AsTensor({1, 2, 3, 4, 5, 6}, TensorShape({2, 3})); reset_and_test(t, false, {2, 3}, {1, 2, 3, 4, 5, 6}); reset_and_test(t, true, {2, 3}, {1, 2, 3, 4, 5, 6}); } @@ -1578,7 +1950,7 @@ void TestConvertConst(OpConverterTest* test) { // Set all tensor elements to the same value. Such tensors are encoded // using a single element list in tensor proto. Tensor t = - test::AsTensor({1, 1, 1, 1, 1, 1}, TensorShape({2, 3})); + test->AsTensor({1, 1, 1, 1, 1, 1}, TensorShape({2, 3})); reset_and_test(t, false, {2, 3}, {1, 1, 1, 1, 1, 1}); reset_and_test(t, true, {2, 3}, {1, 1, 1, 1, 1, 1}); } @@ -1586,7 +1958,7 @@ void TestConvertConst(OpConverterTest* test) { // Set trailing tensor elements to the same value. Such tensors are // encoded by truncating all equal elements except the first one. Tensor t = - test::AsTensor({2, 2, 1, 1, 1, 1}, TensorShape({2, 3})); + test->AsTensor({2, 2, 1, 1, 1, 1}, TensorShape({2, 3})); reset_and_test(t, false, {2, 3}, {2, 2, 1, 1, 1, 1}); reset_and_test(t, true, {2, 3}, {2, 2, 1, 1, 1, 1}); } @@ -1601,10 +1973,9 @@ TEST_F(OpConverterTest, ConvertConst) { } { Reset(); - Tensor tensor = - test::AsTensor({1, std::numeric_limits::max(), 1, 1, 1, - std::numeric_limits::lowest()}, - TensorShape({2, 3})); + Tensor tensor = AsTensor({1, std::numeric_limits::max(), 1, 1, + 1, std::numeric_limits::lowest()}, + TensorShape({2, 3})); NodeDef node_def; node_def.set_name("my_const"); node_def.set_op("Const"); @@ -1628,57 +1999,62 @@ TEST_F(OpConverterTest, ConvertConst) { TestConvertConst(this); } -TEST_F(OpConverterTest, ConvertTranspose) { +TEST_P(OpConverterTest1, ConvertTranspose) { // Get the NodeDef for Transpose. Scope s = Scope::NewRootScope(); - auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); + auto input = ops::Placeholder(s.WithOpName("input"), tf_dtype); auto weights = ops::Placeholder(s.WithOpName("weights"), DT_INT32); auto transpose = ops::Transpose(s.WithOpName("my_transpose"), input, weights); const NodeDef& node_def = transpose.operation.node()->def(); - { - // Permutation is a tensor, should fail. 
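// Editorial note: OpConverterTest1, used by the TEST_P above, is instantiated
// over the cross product (TrtTestMode x DT_FLOAT x FP32), so each TEST_P body
// runs once per valid TRT mode. Widening the grid would only take another
// instantiation with a distinct prefix -- shown purely as an illustration,
// not something this patch adds:
//
//   INSTANTIATE_TEST_CASE_P(
//       OpConvTestInstantiationHalf, OpConverterTest1,
//       ::testing::Combine(::testing::ValuesIn(ValidTrtModes),
//                          ::testing::Values(DT_HALF),
//                          ::testing::Values(TrtPrecisionMode::FP16)));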
- Reset(); - AddTestTensor("input", {1, 2, 3}); - AddTestTensor("weights", {3}); - RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "The input \"perm\" for Transpose must be a constant, at my_transpose"); + std::vector test_params = { + // For the first test we leave param empty. This signals to use a + // input as weight which will be invalid + TestParamBase{{3, 1, 2, 1}, + {}, + {}, + {}, + Status(error::UNIMPLEMENTED, + "The input \"perm\" for Transpose must be a " + "constant, at my_transpose")}, + TestParamBase{{1, 1, 2, 3}, + {}, + {}, + {0, 1, 2}, + Status(error::INVALID_ARGUMENT, + "Rank of perm for transpose does not match with " + "that of the input.")}, + // Transpose batch dim + TestParamBase{ + {1, 1, 2, 3}, + {}, + {3, 2, 1, 1}, + {3, 2, 1, 0}, + (trt_mode == TrtTestMode::kImplicitBatch) + ? Status(error::UNIMPLEMENTED, + "Transpose at batch dimension is not supported") + : Status::OK()}, + TestParamBase{{1, 1, 2, 3}, {}, {1, 3, 1, 2}, {0, 3, 1, 2}}, + }; + if (trt_mode == TrtTestMode::kDynamicShape) { + // Dynamic shape tests where some shapes are known + test_params.push_back(TestParamBase{ + {1, 1, 2, 3}, {-1, 1, 2, -1}, {1, 3, 1, 2}, {0, 3, 1, 2}}); } - { - // Transpose at batch dimension, should fail. + std::vector expected_values{1, 4, 2, 5, 3, 6}; + for (auto p : test_params) { + SCOPED_TRACE(p); Reset(); - AddTestTensor("input", {1, 2, 3}); - AddTestWeights("weights", {4}, {1, 0, 2, 3}); - RunValidationAndConversion(node_def, error::UNIMPLEMENTED, - "Transpose at batch dimension is not supported"); - } - { - // Permutation rank doesn't match, should fail. - Reset(); - AddTestTensor("input", {1, 2, 3}); - AddTestWeights("weights", {3}, {0, 1, 2}); - RunValidationAndConversion( - node_def, error::INVALID_ARGUMENT, - "Rank of perm for transpose does not match with that of the input."); - } - { - // Ok. 
- Reset(); - AddTestTensor("input", {1, 2, 3}); - AddTestWeights("weights", {4}, {0, 3, 1, 2}); - RunValidationAndConversion(node_def); - TRT_TensorOrWeights output; - TF_EXPECT_OK(GetTensorOrWeights("my_transpose", &output)); - ASSERT_TRUE(output.is_tensor()); - ExpectTrtDimsEqualsArray({3, 1, 2}, output.tensor()->getDimensions()); - - const DataVec input_data{ - {"input", test::AsTensor({1, 2, 3, 4, 5, 6})}}; - DataVec output_data{{"my_transpose", ConstructTensor(6)}}; - BuildAndRun(input_data, &output_data); - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAre(1, 4, 2, 5, 3, 6)); + AddTestTensor("input", p.input_dims, {1, 2, 3, 4, 5, 6}, + p.partial_input_dims); + if (p.param.empty()) { + AddTestTensor("weights", {3}); + } else { + AddTestWeights("weights", {static_cast(p.param.size())}, + p.param); + } + TestOpConverter("my_transpose", node_def, p.expected_output_dims, p.status, + p.runtime_status, ElementsAreArray(expected_values)); } } @@ -1772,10 +2148,10 @@ TEST_F(OpConverterTest, ConvertReshape) { std::vector input_vec(TrtTensorDimsNumElements(actual_output_dims) * batch_size); std::iota(input_vec.begin(), input_vec.end(), 1); - const DataVec input_data{{"input", test::AsTensor(input_vec)}}; + const DataVec input_data{{"input", AsTensor(input_vec)}}; DataVec output_data{ {"my_reshape", ConstructTensor(input_vec.size())}}; - BuildAndRun(input_data, &output_data, TrtPrecisionMode::FP32, batch_size); + TF_EXPECT_OK(BuildAndRun(input_data, &output_data, batch_size)); EXPECT_THAT(GetSpanForData(output_data[0]), ElementsAreArray(input_vec)); } @@ -1828,9 +2204,9 @@ void TestMatMulHelper( ASSERT_TRUE(output.is_tensor()); ExpectTrtDimsEqualsArray({2}, output.tensor()->getDimensions()); - const DataVec input_data{{"input", test::AsTensor({0, 1})}}; - DataVec output_data{{"my_matmul", ConstructTensor(2)}}; - test->BuildAndRun(input_data, &output_data); + const DataVec input_data{{"input", test->AsTensor({0, 1})}}; + DataVec output_data{{"my_matmul", test->ConstructTensor(2)}}; + TF_EXPECT_OK(test->BuildAndRun(input_data, &output_data)); if (transpose_b) { EXPECT_THAT(GetSpanForData(output_data[0]), ElementsAre(1, 3)); } else { @@ -1855,9 +2231,9 @@ void TestMatMulHelper( TF_EXPECT_OK(test->GetTensorOrWeights("my_matmul", &output)); ASSERT_TRUE(output.is_tensor()); ExpectTrtDimsEqualsArray({2}, output.tensor()->getDimensions()); - const DataVec input_data{{"input", test::AsTensor({0, 1})}}; - DataVec output_data{{"my_matmul", ConstructTensor(2)}}; - test->BuildAndRun(input_data, &output_data); + const DataVec input_data{{"input", test->AsTensor({0, 1})}}; + DataVec output_data{{"my_matmul", test->ConstructTensor(2)}}; + TF_EXPECT_OK(test->BuildAndRun(input_data, &output_data)); if (transpose_b) { EXPECT_THAT(GetSpanForData(output_data[0]), ElementsAre(1, 3)); } else { @@ -1927,28 +2303,24 @@ TEST_F(OpConverterTest, ConvertMatMul) { } { // Make sure that INT8 mode uses IFullyConnectedLayer when possible. - precision_mode_to_test_ = TrtPrecisionMode::INT8; - Reset(); + Reset(TrtPrecisionMode::INT8); NodeDef node_def = get_matmul_nodedef(DT_FLOAT, false, false); AddTestTensor("input", {2, 1, 1}); AddTestWeights("weights", {2, 2}, {0, 1, 2, 3}); RunValidationAndConversion(node_def); CheckAddedLayers(this, false); CheckAddedLayers(this, true); - precision_mode_to_test_ = TrtPrecisionMode::FP32; } { // Make sure that INT8 mode doesn't try to use IFullyConnectedLayer when not // compatible. In this case we can't use FC because weights is a tensor. 
- precision_mode_to_test_ = TrtPrecisionMode::INT8; - Reset(); + Reset(TrtPrecisionMode::INT8); NodeDef node_def = get_matmul_nodedef(DT_FLOAT, false, false); AddTestTensor("input", {2, 1, 1}); AddTestTensor("weights", {2, 2}); RunValidationAndConversion(node_def); CheckAddedLayers(this, true); CheckAddedLayers(this, false); - precision_mode_to_test_ = TrtPrecisionMode::FP32; } TestMatMulHelper(this, get_matmul_nodedef, "MatMul"); } @@ -1980,15 +2352,13 @@ TEST_F(OpConverterTest, ConvertBatchMatMul) { { // Make sure that INT8 mode doesn't try to use IFullyConnectedLayer when not // compatible. In this case we can't use FC because transpose_a is true. - precision_mode_to_test_ = TrtPrecisionMode::INT8; - Reset(); + Reset(TrtPrecisionMode::INT8); NodeDef node_def = get_batch_matmul_nodedef(DT_FLOAT, true, false); AddTestTensor("input", {1, 2, 2}); AddTestWeights("weights", {2, 2}, {0, 1, 2, 3}); RunValidationAndConversion(node_def); CheckAddedLayers(this, true); CheckAddedLayers(this, false); - precision_mode_to_test_ = TrtPrecisionMode::FP32; } for (bool transpose_a : {false, true}) { @@ -2004,9 +2374,9 @@ TEST_F(OpConverterTest, ConvertBatchMatMul) { TF_EXPECT_OK(GetTensorOrWeights("my_matmul", &output)); ASSERT_TRUE(output.is_tensor()); ExpectTrtDimsEqualsArray({2, 2}, output.tensor()->getDimensions()); - const DataVec input_data{{"input", test::AsTensor({0, 1, 2, 3})}}; + const DataVec input_data{{"input", AsTensor({0, 1, 2, 3})}}; DataVec output_data{{"my_matmul", ConstructTensor(4)}}; - BuildAndRun(input_data, &output_data); + TF_EXPECT_OK(BuildAndRun(input_data, &output_data)); if (!transpose_a && !transpose_b) { EXPECT_THAT(GetSpanForData(output_data[0]), ElementsAre(3, 4, 11, 16)); @@ -2077,9 +2447,10 @@ void TestConvertBiasAdd(OpConverterTest* test) { num_input); const DataVec input_data{ - {"input", ConstructTensor(num_input, CType(0))}}; - DataVec output_data{{"my_biasadd", ConstructTensor(num_input)}}; - test->BuildAndRun(input_data, &output_data); + {"input", test->ConstructTensor(num_input, CType(0))}}; + DataVec output_data{ + {"my_biasadd", test->ConstructTensor(num_input)}}; + TF_EXPECT_OK(test->BuildAndRun(input_data, &output_data)); if (trt_input_rank == 1) { if (data_format == "NHWC") { EXPECT_THAT(GetSpanForData(output_data[0]), @@ -2147,14 +2518,14 @@ void TestBinaryOp(OpConverterTest* test, bool operand_1_is_tensor, if (operand_1_is_tensor) { input_data.push_back( {"input1", - test::AsTensor({CType(3), CType(6), CType(3), CType(6)})}); + test->AsTensor({CType(3), CType(6), CType(3), CType(6)})}); } if (operand_2_is_tensor) { input_data.push_back( {"input2", - test::AsTensor({CType(2), CType(3), CType(2), CType(3)})}); + test->AsTensor({CType(2), CType(3), CType(2), CType(3)})}); } - DataVec output_data{{"my_binary", ConstructTensor(8)}}; + DataVec output_data{{"my_binary", test->ConstructTensor(8)}}; // Check output dims. TRT_TensorOrWeights output; TF_EXPECT_OK(test->GetTensorOrWeights("my_binary", &output)); @@ -2162,10 +2533,7 @@ void TestBinaryOp(OpConverterTest* test, bool operand_1_is_tensor, ExpectTrtDimsEqualsArray({2, 2}, output.tensor()->getDimensions()); // After broadcasting first input becomes {3, 6, 3, 6} and second input // becomes {2, 3, 2, 3}. - test->BuildAndRun( - input_data, &output_data, - dtype == DT_HALF ? 
TrtPrecisionMode::FP16 : TrtPrecisionMode::FP32, - /*batch_size=*/2); + TF_EXPECT_OK(test->BuildAndRun(input_data, &output_data, /*batch_size=*/2)); if (node_def.op() == "Add") { EXPECT_THAT( GetSpanForData(output_data[0]), @@ -2287,7 +2655,7 @@ void TestAddN(OpConverterTest* test) { for (const auto name : {"inp1", "inp2", "inp3"}) { test->AddTestTensor(name, /*dims=*/{1, 2}, /*batch_size=*/2, TfDataTypeToTrt(dtype)); - input_data.push_back({name, test::AsTensor({CType(1), CType(2), + input_data.push_back({name, test->AsTensor({CType(1), CType(2), CType(3), CType(4)})}); } const NodeDef node_def = GetAddNNodeDef({"inp1", "inp2", "inp3"}, dtype); @@ -2298,11 +2666,8 @@ void TestAddN(OpConverterTest* test) { ASSERT_TRUE(output.is_tensor()); ExpectTrtDimsEqualsArray({1, 2}, output.tensor()->getDimensions()); - DataVec output_data{{"my_addn", ConstructTensor(4)}}; - test->BuildAndRun( - input_data, &output_data, - dtype == DT_HALF ? TrtPrecisionMode::FP16 : TrtPrecisionMode::FP32, - /*batch_size=*/2); + DataVec output_data{{"my_addn", test->ConstructTensor(4)}}; + TF_EXPECT_OK(test->BuildAndRun(input_data, &output_data, /*batch_size=*/2)); EXPECT_THAT(GetSpanForData(output_data[0]), ElementsAreArray(CastTestVector({3, 6, 9, 12}))); } @@ -2313,7 +2678,7 @@ void TestAddN(OpConverterTest* test) { for (const auto name : {"inp1", "inp2"}) { test->AddTestTensor(name, /*dims=*/{1, 2}, /*batch_size=*/1, TfDataTypeToTrt(dtype)); - input_data.push_back({name, test::AsTensor({CType(1), CType(2)})}); + input_data.push_back({name, test->AsTensor({CType(1), CType(2)})}); } test->AddTestWeights("inp3", /*dims=*/{1, 1, 2}, /*values=*/std::vector{CType(3), CType(4)}); @@ -2325,10 +2690,8 @@ void TestAddN(OpConverterTest* test) { ASSERT_TRUE(output.is_tensor()); ExpectTrtDimsEqualsArray({1, 2}, output.tensor()->getDimensions()); - DataVec output_data{{"my_addn", ConstructTensor(2)}}; - test->BuildAndRun( - input_data, &output_data, - dtype == DT_HALF ? TrtPrecisionMode::FP16 : TrtPrecisionMode::FP32); + DataVec output_data{{"my_addn", test->ConstructTensor(2)}}; + TF_EXPECT_OK(test->BuildAndRun(input_data, &output_data)); EXPECT_THAT(GetSpanForData(output_data[0]), ElementsAreArray(CastTestVector({5, 8}))); } @@ -2350,10 +2713,9 @@ TEST_F(OpConverterTest, ConvertAddN) { } TEST_F(OpConverterTest, ConvertQuantize) { - precision_mode_to_test_ = TrtPrecisionMode::INT8; { // FakeQuantWithMinMaxArgs attributes are empty, should fail. - Reset(); + Reset(TrtPrecisionMode::INT8); NodeDef node_def = MakeNodeDef("my_quantize", "FakeQuantWithMinMaxArgs", {"input"}); AddTestTensor("input", {1, 2, 3}); @@ -2364,7 +2726,7 @@ TEST_F(OpConverterTest, ConvertQuantize) { } { // FakeQuantWithMinMaxArgs ranges set via attributes, ok. - Reset(); + Reset(TrtPrecisionMode::INT8); Scope s = Scope::NewRootScope(); auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); auto quantize_attrs = ops::FakeQuantWithMinMaxArgs::Min(-6.0f).Max(6.0f); @@ -2382,7 +2744,7 @@ TEST_F(OpConverterTest, ConvertQuantize) { } { // FakeQuantWithMinMaxVars ranges set via inputs, ok. - Reset(); + Reset(TrtPrecisionMode::INT8); Scope s = Scope::NewRootScope(); auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); auto weights_min = ops::Placeholder(s.WithOpName("weights_min"), DT_FLOAT); @@ -2403,7 +2765,7 @@ TEST_F(OpConverterTest, ConvertQuantize) { } { // QuantizeAndDequantizeV2 ranges set via inputs, ok. 
- Reset(); + Reset(TrtPrecisionMode::INT8); Scope s = Scope::NewRootScope(); auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); auto weights_min = ops::Placeholder(s.WithOpName("weights_min"), DT_FLOAT); @@ -2424,7 +2786,7 @@ TEST_F(OpConverterTest, ConvertQuantize) { } { // QuantizeAndDequantizeV2 Range inputs are tensors, should fail. - Reset(); + Reset(TrtPrecisionMode::INT8); Scope s = Scope::NewRootScope(); auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); auto weights_min = ops::Placeholder(s.WithOpName("weights_min"), DT_FLOAT); @@ -2442,7 +2804,7 @@ TEST_F(OpConverterTest, ConvertQuantize) { } { // QuantizeAndDequantizeV3 ranges set via inputs, ok. - Reset(); + Reset(TrtPrecisionMode::INT8); Scope s = Scope::NewRootScope(); auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); auto weights_min = ops::Placeholder(s.WithOpName("weights_min"), DT_FLOAT); @@ -2491,13 +2853,11 @@ void TestConvertSquare(OpConverterTest* test) { inputs[i] = value; expected_outputs[i] = value * value; } - const DataVec input_data{{"input", test::AsTensor(inputs)}}; + const DataVec input_data{{"input", test->AsTensor(inputs)}}; // Engine outputs are converted to FP16 automatically if we set FP16 mode in // the builder. - DataVec output_data{{"my_square", ConstructTensor(num_inputs)}}; - test->BuildAndRun( - input_data, &output_data, - dtype == DT_HALF ? TrtPrecisionMode::FP16 : TrtPrecisionMode::FP32); + DataVec output_data{{"my_square", test->ConstructTensor(num_inputs)}}; + TF_EXPECT_OK(test->BuildAndRun(input_data, &output_data)); ExpectArrayNear(expected_outputs, GetSpanForData(output_data[0])); } @@ -2607,10 +2967,9 @@ TEST_F(OpConverterTest, ConvertCombinedNMS) { {"my_nms:2", ConstructTensor(2)}, {"my_nms:3", ConstructTensor(1)}, }; - const DataVec input_data{ - {"boxes", test::AsTensor({0, 0, 0.3, 0.4})}, - {"scores", test::AsTensor({0.4, 0.7, 0.3})}}; - BuildAndRun(input_data, &output_data); + const DataVec input_data{{"boxes", AsTensor({0, 0, 0.3, 0.4})}, + {"scores", AsTensor({0.4, 0.7, 0.3})}}; + TF_EXPECT_OK(BuildAndRun(input_data, &output_data)); EXPECT_THAT(GetSpanForData(output_data[0]), ElementsAre(0, 0, 0.3, 0.4, 0, 0, 0.3, 0.4)); EXPECT_THAT(GetSpanForData(output_data[1]), ElementsAre(0.7, 0.4)); @@ -2620,90 +2979,67 @@ TEST_F(OpConverterTest, ConvertCombinedNMS) { } #endif // IS_TRT_VERSION_GE(5, 1, 0, 0) -TEST_F(OpConverterTest, ConvertActivation) { +template +NodeDef CreateUnaryOp(DataType tf_dtype) { + Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("input"), tf_dtype); + return T(s.WithOpName("my_unary"), input).operation.node()->def(); +} + +constexpr float kLeakyReluAlpha = 0.2f; +template <> +NodeDef CreateUnaryOp(DataType tf_dtype) { + Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("input"), tf_dtype); + return ops::internal::LeakyRelu( + s.WithOpName("my_unary"), input, + ops::internal::LeakyRelu::Alpha(kLeakyReluAlpha)) + .operation.node() + ->def(); +} + +TEST_P(OpConverterTest1, ConvertActivation) { { // Input is weights, should fail. 
Reset(); - Scope s = Scope::NewRootScope(); - auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); - auto relu = ops::Relu(s.WithOpName("my_act"), input); - const NodeDef& node_def = relu.operation.node()->def(); + const NodeDef& node_def = CreateUnaryOp(tf_dtype); AddTestWeights("input", {1, 2, 3}, {-3, -2, -1, 0, 1, 2}); RunValidationAndConversion( node_def, error::UNIMPLEMENTED, - "The input \"input\" for Relu must be a tensor, at my_act"); + "The input \"input\" for Relu must be a tensor, at my_unary"); } - constexpr float kLeakyReluAlpha = 0.2f; constexpr float kSeluAlpha = 1.7580993408473768599402175208123f; constexpr float kSeluScale = 1.0507009873554804934193349852946f; + using OpFunc = std::function; + using ValFunc = float (*)(float); + std::map> op_map; - // Get nodedef for activation layer. - auto get_act_nodedef = [](string op_name) -> NodeDef { - Scope s = Scope::NewRootScope(); - auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); - if (op_name == "LeakyRelu") { - auto act = ops::internal::LeakyRelu( - s.WithOpName("my_act"), input, - ops::internal::LeakyRelu::Alpha(kLeakyReluAlpha)); - return act.operation.node()->def(); - } else if (op_name == "Relu") { - auto act = ops::Relu(s.WithOpName("my_act"), input); - return act.operation.node()->def(); - } else if (op_name == "Relu6") { - auto act = ops::Relu6(s.WithOpName("my_act"), input); - return act.operation.node()->def(); - } else if (op_name == "Sigmoid") { - auto act = ops::Sigmoid(s.WithOpName("my_act"), input); - return act.operation.node()->def(); - } else if (op_name == "Tanh") { - auto act = ops::Tanh(s.WithOpName("my_act"), input); - return act.operation.node()->def(); - } else if (op_name == "Elu") { - auto act = ops::Elu(s.WithOpName("my_act"), input); - return act.operation.node()->def(); - } else if (op_name == "Selu") { - auto act = ops::Selu(s.WithOpName("my_act"), input); - return act.operation.node()->def(); - } else if (op_name == "Softsign") { - auto act = ops::Softsign(s.WithOpName("my_act"), input); - return act.operation.node()->def(); - } else if (op_name == "Softplus") { - auto act = ops::Softplus(s.WithOpName("my_act"), input); - return act.operation.node()->def(); - } - EXPECT_TRUE(false); - return NodeDef(); - }; - // Get expected output for activation layer. - auto get_act_output = [](string op_name, float input) -> float { - if (op_name == "LeakyRelu") { - return (input > 0.0f) ? input : input * kLeakyReluAlpha; - } else if (op_name == "Relu") { - return (input > 0.0f) ? input : 0.0f; - } else if (op_name == "Relu6") { - return std::min(std::max(input, 0.0f), 6.0f); - } else if (op_name == "Sigmoid") { - return 1.0f / (1.0f + std::exp(-input)); - } else if (op_name == "Tanh") { - return std::tanh(input); - } else if (op_name == "Elu") { - return (input > 0.0f) ? input : std::exp(input) - 1; - } else if (op_name == "Selu") { - return (input > 0.0f) ? kSeluScale * input - : kSeluScale * kSeluAlpha * (std::exp(input) - 1); - } else if (op_name == "Softsign") { - return input / (std::abs(input) + 1); - } else if (op_name == "Softplus") { - return std::log(std::exp(input) + 1); - } - EXPECT_TRUE(false); - return 0; - }; +#define ADD_OP(name, op, compute) \ + op_map[name] = std::make_pair(CreateUnaryOp, compute) + ADD_OP("LeakyRelu", ops::internal::LeakyRelu, + [](float x) { return (x > 0.0f) ? x : x * kLeakyReluAlpha; }); + ADD_OP("Relu", ops::Relu, [](float x) { return (x > 0.0f) ? 
x : 0.0f; }); + ADD_OP("Relu6", ops::Relu6, + [](float x) { return std::min(std::max(x, 0.0f), 6.0f); }); + ADD_OP("Sigmoid", ops::Sigmoid, + [](float x) { return 1.0f / (1.0f + std::exp(-x)); }); + ADD_OP("Tanh", ops::Tanh, static_cast(std::tanh)); + ADD_OP("Elu", ops::Elu, + [](float x) { return (x > 0.0f) ? x : std::exp(x) - 1; }); + ADD_OP("Selu", ops::Selu, [](float x) { + return (x > 0.0f) ? kSeluScale * x + : kSeluScale * kSeluAlpha * (std::exp(x) - 1); + }); + ADD_OP("Softsign", ops::Softsign, + [](float x) { return x / (std::abs(x) + 1); }); + ADD_OP("Softplus", ops::Softplus, + [](float x) { return std::log(std::exp(x) + 1); }); +#undef ADD_OP // Get list of ops to test. std::vector ops_to_test; - // Add all ops supported by ConvertUnary. + // Add all ops supported by ConvertActivation. auto* map = ActivationTypeMap(); ops_to_test.reserve(map->size()); for (auto& pair : *map) { @@ -2712,16 +3048,30 @@ TEST_F(OpConverterTest, ConvertActivation) { // Add other activation ops to test. ops_to_test.push_back("Relu6"); ops_to_test.push_back("LeakyRelu"); + auto p = TestParamBase{ + {1, 1, 2, 3}, // input dims + {}, // input partial dims + {1, 1, 2, 3}, // expected output dims + }; // Ok. for (const string& op_name : ops_to_test) { + if (!op_map.count(op_name)) { + FAIL() << "Activation op test map does not contain op " << op_name; + } Reset(); - NodeDef node_def = get_act_nodedef(op_name); - AddTestTensor("input", {1, 2, 3}); - RunValidationAndConversion(node_def); + NodeDef node_def = op_map[op_name].first(tf_dtype); + const std::vector input = {-100, -2, -1, 0, 1, 88}; + AddTestTensor("input", p.input_dims, input); + + // std::exp in Softplus will overflow for input > 88 + std::vector output_values; + std::transform(input.begin(), input.end(), + std::back_inserter(output_values), op_map[op_name].second); + TestOpConverter("my_unary", node_def, p.expected_output_dims, Status::OK(), + Status::OK(), ArrayFloatNear(output_values, 0, false)); + TRT_TensorOrWeights output; - TF_EXPECT_OK(GetTensorOrWeights("my_act", &output)); - ASSERT_TRUE(output.is_tensor()); - ExpectTrtDimsEqualsArray({1, 2, 3}, output.tensor()->getDimensions()); + TF_EXPECT_OK(GetTensorOrWeights("my_unary", &output)); // Certain activations should set quantization range automatically. auto ranges = quantization_ranges(); @@ -2731,17 +3081,6 @@ TEST_F(OpConverterTest, ConvertActivation) { op_name == "Softsign") { EXPECT_EQ(ranges[output.tensor()], 1.0f); } - - // std::exp in Softplus will overflow for input > 88 - const std::vector input = {-100, -2, -1, 0, 1, 88}; - const DataVec input_data{{"input", test::AsTensor(input)}}; - DataVec output_data{{"my_act", ConstructTensor(6)}}; - BuildAndRun(input_data, &output_data); - for (int i = 0; i < input.size(); i++) { - const float expected_output = get_act_output(op_name, input[i]); - EXPECT_FLOAT_EQ(GetSpanForData(output_data[0])[i], - expected_output); - } } } @@ -2839,134 +3178,117 @@ TEST_F(OpConverterTest, ConvertExpandDims) { ExpectTrtDimsEqualsArray(ok_params[i].expected_output_dims, output.tensor()->getDimensions()); - const DataVec input_data{ - {"input", test::AsTensor({1, 2, 3, 4, 5, 6})}}; + const DataVec input_data{{"input", AsTensor({1, 2, 3, 4, 5, 6})}}; DataVec output_data{{"my_expanddims", ConstructTensor(6)}}; - BuildAndRun(input_data, &output_data); + TF_EXPECT_OK(BuildAndRun(input_data, &output_data)); EXPECT_THAT(GetSpanForData(output_data[0]), ElementsAre(1, 2, 3, 4, 5, 6)); } } -TEST_F(OpConverterTest, ConvertSqueeze) { - { - // No attrs, should fail. 
- Reset(); - Scope s = Scope::NewRootScope(); - auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); - auto squeeze = ops::Squeeze(s.WithOpName("my_squeeze"), input); - const NodeDef& node_def = squeeze.operation.node()->def(); - AddTestTensor("input", {1, 2, 3}); - RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "Squeeze is only implemented for explicit dims, at my_squeeze"); - } - +TEST_P(OpConverterTest1, ConvertSqueeze) { + const bool use_implicit_batch = (trt_mode == TrtTestMode::kImplicitBatch); // Get the NodeDef for Squeeze. - auto get_squeeze_nodedef = [](std::vector axis) -> NodeDef { + auto get_squeeze_nodedef = [](std::vector axes, + DataType tf_dtype) -> NodeDef { Scope s = Scope::NewRootScope(); - auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); - ops::Squeeze::Attrs squeeze_attrs; - squeeze_attrs.axis_ = gtl::ArraySlice(axis); // non-absl ok - auto squeeze = - ops::Squeeze(s.WithOpName("my_squeeze"), input, squeeze_attrs); - return squeeze.operation.node()->def(); + auto input = ops::Placeholder(s.WithOpName("input"), tf_dtype); + if (!axes.empty()) { + ops::Squeeze::Attrs squeeze_attrs; + squeeze_attrs.axis_ = gtl::ArraySlice(axes); // non-absl ok + auto squeeze = + ops::Squeeze(s.WithOpName("my_squeeze"), input, squeeze_attrs); + return squeeze.operation.node()->def(); + } else { + auto squeeze = ops::Squeeze(s.WithOpName("my_squeeze"), input); + return squeeze.operation.node()->def(); + } }; - - { - // Input is weights, should fail. - Reset(); - NodeDef node_def = get_squeeze_nodedef({0}); - AddTestWeights("input", {1, 2, 3}, {1, 2, 3, 4, 5, 6}); - RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "The input \"input\" for Squeeze must be a tensor, at my_squeeze"); - } - { - // Squeeze batch dim, should fail. - Reset(); - NodeDef node_def = get_squeeze_nodedef({0}); - AddTestTensor("input", {1, 2, 3}); - RunValidationAndConversion(node_def, error::UNIMPLEMENTED, - "TensorRT does not allow manipulation of the " - "batch dimension, at my_squeeze"); - } - { - // Squeeze batch dim via negative axis, should fail. - Reset(); - NodeDef node_def = get_squeeze_nodedef({-4}); - AddTestTensor("input", {1, 2, 3}); - RunValidationAndConversion(node_def, error::UNIMPLEMENTED, - "TensorRT does not allow manipulation of the " - "batch dimension, at my_squeeze"); - } - { - // Squeeze >= rank(input), should fail. - Reset(); - NodeDef node_def = get_squeeze_nodedef({4}); - AddTestTensor("input", {1, 2, 3}); - RunValidationAndConversion( - node_def, error::INVALID_ARGUMENT, - "Axis value of 4 is out of bounds, must be in range [-4, 4), at " - "my_squeeze"); - } - { - // Squeeze < -rank(input), should fail. - Reset(); - NodeDef node_def = get_squeeze_nodedef({-5}); - AddTestTensor("input", {1, 2, 3}); - RunValidationAndConversion( - node_def, error::INVALID_ARGUMENT, - "Axis value of -5 is out of bounds, must be in range [-4, 4), at " - "my_squeeze"); - } - { - // Squeeze an axis with size != 1, should fail. 
- Reset(); - NodeDef node_def = get_squeeze_nodedef({2}); - AddTestTensor("input", {1, 2, 3}); - RunValidationAndConversion( - node_def, error::INVALID_ARGUMENT, - "Dimension 2 with size 2 cannot be squeezed because it must be size 1, " - "at my_squeeze"); - } - - struct TestParams { - std::vector input_dims; - std::vector axis; - std::vector expected_output_dims; + std::vector test_params = { + TestParamBase{ + {1, 2, 1, 3}, // input dims + {}, // input partial dims + {2, 3}, // expected output dims + {}, // axis + trt_mode == TrtTestMode::kExplicitBatch + ? Status::OK() + : Status{error::UNIMPLEMENTED, + "Squeeze is not implemented for empty squeeze_dims, at " + "my_squeeze"}}, + TestParamBase{{1, 2, 1, 3}, + {}, + {2, 1, 3}, + {0}, + use_implicit_batch + ? Status{error::UNIMPLEMENTED, + "TensorRT does not allow manipulation of the " + "batch dimension, at my_squeeze"} + : Status::OK()}, + TestParamBase{{1, 2, 1, 3}, + {}, + {2, 1, 3}, + {-4}, + use_implicit_batch + ? Status{error::UNIMPLEMENTED, + "TensorRT does not allow manipulation of the " + "batch dimension, at my_squeeze"} + : Status::OK()}, + TestParamBase{ + {1, 1, 2, 3}, + {}, + {}, + {4}, + Status{error::INVALID_ARGUMENT, + "Axis value of 4 is out of bounds, must be in range [-4, 4), " + "at my_squeeze"}}, + TestParamBase{ + {1, 1, 2, 3}, + {}, + {}, + {-5}, + Status{error::INVALID_ARGUMENT, + "Axis value of -5 is out of bounds, must be in range [-4, 4), " + "at my_squeeze"}}, + TestParamBase{{1, 1, 2, 3}, {}, {1, 2, 3}, {1}}, + TestParamBase{{1, 1, 2, 3}, {}, {1, 2, 3}, {-3}}, + TestParamBase{{1, 2, 3, 1}, {}, {1, 2, 3}, {3}}, + TestParamBase{{1, 2, 3, 1}, {}, {1, 2, 3}, {-1}}, + TestParamBase{{1, 1, 2, 1, 3, 1}, {}, {1, 2, 3}, {1, 3, 5}}, + TestParamBase{{1, 1, 2, 1, 3, 1}, {}, {1, 2, 3}, {3, 1, 5}}, + TestParamBase{{1, 1, 2, 1, 3, 1}, {}, {1, 2, 3}, {-1, -3, -5}}, + TestParamBase{{1, 1, 2, 1, 3, 1}, {}, {1, 2, 3}, {1, -3, 5}}, + TestParamBase{{1, 1, 6}, {}, {1, 6}, {1}}, + TestParamBase{{1, 6, 1}, {}, {1, 6}, {2}}, }; + auto squeeze_non_singleton = TestParamBase{ + {1, 1, 2, 3}, + {}, + {}, + {2}, + Status{error::INVALID_ARGUMENT, + "Dimension 2 with size 2 cannot be squeezed because it must be " + "size 1, at my_squeeze"}}; - // Ok. - std::vector ok_params = { - TestParams{{1, 2, 3}, {1}, {2, 3}}, - TestParams{{1, 2, 3}, {-3}, {2, 3}}, - TestParams{{2, 3, 1}, {3}, {2, 3}}, - TestParams{{2, 3, 1}, {-1}, {2, 3}}, - TestParams{{1, 2, 1, 3, 1}, {1, 3, 5}, {2, 3}}, - TestParams{{1, 2, 1, 3, 1}, {3, 1, 5}, {2, 3}}, - TestParams{{1, 2, 1, 3, 1}, {-1, -3, -5}, {2, 3}}, - TestParams{{1, 2, 1, 3, 1}, {1, -3, 5}, {2, 3}}, - TestParams{{1, 6}, {1}, {6}}, - TestParams{{6, 1}, {2}, {6}}, - }; - for (int i = 0; i < ok_params.size(); ++i) { + if (trt_mode == TrtTestMode::kDynamicShape) { + // In this test we try to squeeze axis=2 which has size > 1. In dynamic + // shape mode the converter sees only -1, so it cannot catch this error. 
+ squeeze_non_singleton.status = Status::OK(); // conversion status + squeeze_non_singleton.runtime_status = + errors::InvalidArgument("Negative number of dimensions -1"); + // Dynamic shape tests with partially known input shape + test_params.push_back(TestParamBase{{2, 1, 3}, {2, -1, 3}, {2, 3}, {1}}); + test_params.push_back(TestParamBase{{2, 1, 3}, {2, 1, -1}, {2, 3}, {1}}); + } + test_params.push_back(squeeze_non_singleton); + + for (TestParamBase p : test_params) { + SCOPED_TRACE(p); Reset(); - NodeDef node_def = get_squeeze_nodedef(ok_params[i].axis); - AddTestTensor("input", ok_params[i].input_dims); - RunValidationAndConversion(node_def); - TRT_TensorOrWeights output; - TF_EXPECT_OK(GetTensorOrWeights("my_squeeze", &output)); - ASSERT_TRUE(output.is_tensor()); - ExpectTrtDimsEqualsArray(ok_params[i].expected_output_dims, - output.tensor()->getDimensions()); - - const DataVec input_data{ - {"input", test::AsTensor({1, 2, 3, 4, 5, 6})}}; - DataVec output_data{{"my_squeeze", ConstructTensor(6)}}; - BuildAndRun(input_data, &output_data); - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAre(1, 2, 3, 4, 5, 6)); + NodeDef node_def = get_squeeze_nodedef(p.param, tf_dtype); + AddTestTensor("input", p.input_dims, {1, 2, 3, 4, 5, 6}, + p.partial_input_dims); + TestOpConverter("my_squeeze", node_def, p.expected_output_dims, p.status, + p.runtime_status, ElementsAreArray({1, 2, 3, 4, 5, 6})); } } @@ -3565,11 +3887,11 @@ TEST_F(OpConverterTest, ConvertStridedSlice) { ExpectTrtDimsEqualsArray(ok_params[i].expected_output_dims, output.tensor()->getDimensions()); - const DataVec input_data{{"input", test::AsTensor(ok_input)}}; + const DataVec input_data{{"input", AsTensor(ok_input)}}; DataVec output_data{ {"my_strided_slice", ConstructTensor(ok_params[i].expected_output.size())}}; - BuildAndRun(input_data, &output_data); + TF_EXPECT_OK(BuildAndRun(input_data, &output_data)); EXPECT_THAT(GetSpanForData(output_data[0]), ElementsAreArray(ok_params[i].expected_output)); } @@ -3706,11 +4028,10 @@ TEST_F(OpConverterTest, ConvertSlice) { ExpectTrtDimsEqualsArray(ok_params[i].expected_output_dims, output.tensor()->getDimensions()); - const DataVec input_data{ - {"input", test::AsTensor({1, 2, 3, 4, 5, 6})}}; + const DataVec input_data{{"input", AsTensor({1, 2, 3, 4, 5, 6})}}; DataVec output_data{{"my_slice", ConstructTensor( ok_params[i].expected_output.size())}}; - BuildAndRun(input_data, &output_data); + TF_EXPECT_OK(BuildAndRun(input_data, &output_data)); EXPECT_THAT(GetSpanForData(output_data[0]), ElementsAreArray(ok_params[i].expected_output)); } @@ -3720,28 +4041,16 @@ TEST_F(OpConverterTest, ConvertConv2D) { // Get nodedef for Conv2D layer. 
auto get_conv2d_nodedef = [](std::vector strides = {1, 1, 1, 1}, string padding = "SAME", - string data_format = "NCHW", std::vector dilations = {1, 1, 1, 1}, - bool is_conv2d_backprop_input = false) -> NodeDef { + string data_format = "NCHW", + std::vector dilations = {1, 1, 1, 1}) -> NodeDef { Scope s = Scope::NewRootScope(); auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); auto filter = ops::Placeholder(s.WithOpName("weights"), DT_FLOAT); - if (is_conv2d_backprop_input) { - auto input_sizes = - ops::Placeholder(s.WithOpName("input_sizes"), DT_INT32); - ops::Conv2DBackpropInput::Attrs attrs = ops::Conv2DBackpropInput::Attrs() - .DataFormat(data_format) - .Dilations(dilations); - auto conv2d = - ops::Conv2DBackpropInput(s.WithOpName("my_conv2d"), input_sizes, - filter, input, strides, padding, attrs); - return conv2d.operation.node()->def(); - } else { - ops::Conv2D::Attrs attrs = - ops::Conv2D::Attrs().DataFormat(data_format).Dilations(dilations); - auto conv2d = ops::Conv2D(s.WithOpName("my_conv2d"), input, filter, - strides, padding, attrs); - return conv2d.operation.node()->def(); - } + ops::Conv2D::Attrs attrs = + ops::Conv2D::Attrs().DataFormat(data_format).Dilations(dilations); + auto conv2d = ops::Conv2D(s.WithOpName("my_conv2d"), input, filter, strides, + padding, attrs); + return conv2d.operation.node()->def(); }; { @@ -3807,19 +4116,6 @@ TEST_F(OpConverterTest, ConvertConv2D) { "Dilation rate must be 1 for batch and channel " "dimensions, at my_conv2d"); } - { - // Dilation + Conv2DBackpropInput, should fail. - Reset(); - NodeDef node_def = - get_conv2d_nodedef({1, 1, 1, 1}, "SAME", "NHWC", {1, 1, 2, 1}, true); - AddTestTensor("input", {2, 3, 1}); - AddTestWeights("weights", {3, 3, 1, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}); - AddTestWeights("input_sizes", {4}, {1, 2, 3, 1}); - RunValidationAndConversion(node_def, error::UNIMPLEMENTED, - "Dilation with Conv2DBackpropInput " - "(conv2d_transpose) is not supported, " - "at my_conv2d"); - } { // Strides is not 4D, should fail. 
Reset(); @@ -3852,7 +4148,6 @@ TEST_F(OpConverterTest, ConvertConv2D) { string padding; string data_format; std::vector dilations; - bool is_conv2d_backprop_input; std::vector expected_output_dims; std::vector expected_output; }; @@ -3868,7 +4163,6 @@ TEST_F(OpConverterTest, ConvertConv2D) { /*padding=*/"VALID", /*data_format=*/"NCHW", /*dilations=*/{1, 1, 1, 1}, - /*is_conv2d_backprop_input=*/false, /*expected_output_dims=*/{1, 2, 2}, /*expected_output=*/{1, 1, 0, 1}}, // SAME padding (Asymmetric) @@ -3880,7 +4174,6 @@ TEST_F(OpConverterTest, ConvertConv2D) { /*padding=*/"SAME", /*data_format=*/"NCHW", /*dilations=*/{1, 1, 1, 1}, - /*is_conv2d_backprop_input=*/false, /*expected_output_dims=*/{1, 2, 3}, /*expected_output=*/{1, 1, -2, 0, 1, -4}}, // SAME padding (Symmetric) @@ -3892,7 +4185,6 @@ TEST_F(OpConverterTest, ConvertConv2D) { /*padding=*/"SAME", /*data_format=*/"NCHW", /*dilations=*/{1, 1, 1, 1}, - /*is_conv2d_backprop_input=*/false, /*expected_output_dims=*/{1, 2, 3}, /*expected_output=*/{1, 2, -1, 3, 1, -3}}, // NHWC @@ -3904,7 +4196,6 @@ TEST_F(OpConverterTest, ConvertConv2D) { /*padding=*/"VALID", /*data_format=*/"NHWC", /*dilations=*/{1, 1, 1, 1}, - /*is_conv2d_backprop_input=*/false, /*expected_output_dims=*/{2, 2, 1}, /*expected_output=*/{1, 1, 0, 1}}, // Dilated @@ -3916,7 +4207,6 @@ TEST_F(OpConverterTest, ConvertConv2D) { /*padding=*/"VALID", /*data_format=*/"NCHW", /*dilations=*/{1, 1, 1, 2}, - /*is_conv2d_backprop_input=*/false, /*expected_output_dims=*/{1, 2, 1}, /*expected_output=*/{2, 1}}, // Strided @@ -3928,9 +4218,83 @@ TEST_F(OpConverterTest, ConvertConv2D) { /*padding=*/"VALID", /*data_format=*/"NCHW", /*dilations=*/{1, 1, 1, 1}, - /*is_conv2d_backprop_input=*/false, /*expected_output_dims=*/{1, 2, 2}, /*expected_output=*/{1, 0, 1, 3}}, + }; + + for (int i = 0; i < ok_params.size(); i++) { + Reset(); + NodeDef node_def = + get_conv2d_nodedef(ok_params[i].strides, ok_params[i].padding, + ok_params[i].data_format, ok_params[i].dilations); + AddTestTensor("input", ok_params[i].input_dims); + AddTestWeights("weights", ok_params[i].filter_dims, + ok_params[i].filter); + RunValidationAndConversion(node_def); + TRT_TensorOrWeights output; + TF_EXPECT_OK(GetTensorOrWeights("my_conv2d", &output)); + ASSERT_TRUE(output.is_tensor()); + ExpectTrtDimsEqualsArray(ok_params[i].expected_output_dims, + output.tensor()->getDimensions()); + + const DataVec input_data{{"input", AsTensor(ok_params[i].input)}}; + DataVec output_data{ + {"my_conv2d", + ConstructTensor(ok_params[i].expected_output.size())}}; + TF_EXPECT_OK(BuildAndRun(input_data, &output_data)); + EXPECT_THAT(GetSpanForData(output_data[0]), + ElementsAreArray(ok_params[i].expected_output)); + } +} + +TEST_F(OpConverterTest, ConvertConv2DBackpropInput) { + // Get nodedef for Conv2D layer. 
+ auto get_conv2d_backprop_input_nodedef = + [](std::vector strides = {1, 1, 1, 1}, string padding = "SAME", + string data_format = "NCHW", + std::vector dilations = {1, 1, 1, 1}) -> NodeDef { + Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); + auto filter = ops::Placeholder(s.WithOpName("weights"), DT_FLOAT); + auto input_sizes = ops::Placeholder(s.WithOpName("input_sizes"), DT_INT32); + ops::Conv2DBackpropInput::Attrs attrs = ops::Conv2DBackpropInput::Attrs() + .DataFormat(data_format) + .Dilations(dilations); + auto conv2d = ops::Conv2DBackpropInput( + s.WithOpName("my_conv2d_backprop_input"), input_sizes, filter, input, + strides, padding, attrs); + return conv2d.operation.node()->def(); + }; + + { + // Dilation + Conv2DBackpropInput, should fail. + Reset(); + NodeDef node_def = get_conv2d_backprop_input_nodedef({1, 1, 1, 1}, "SAME", + "NHWC", {1, 1, 2, 1}); + AddTestTensor("input", {2, 3, 1}); + AddTestWeights("weights", {3, 3, 1, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}); + AddTestWeights("input_sizes", {4}, {1, 2, 3, 1}); + RunValidationAndConversion(node_def, error::UNIMPLEMENTED, + "Dilation with Conv2DBackpropInput " + "(conv2d_transpose) is not supported, " + "at my_conv2d_backprop_input"); + } + + struct TestParams { + std::vector input_dims; + std::vector input; + std::vector filter_dims; + std::vector filter; + std::vector strides; + string padding; + string data_format; + std::vector dilations; + std::vector expected_output_dims; + std::vector expected_output; + }; + + // Ok. + std::vector ok_params = { // Transpose Strided TestParams{/*input_dims=*/{1, 2, 2}, /*input=*/{0, 1, 2, 3}, @@ -3940,7 +4304,6 @@ TEST_F(OpConverterTest, ConvertConv2D) { /*padding=*/"SAME", /*data_format=*/"NCHW", /*dilations=*/{1, 1, 1, 1}, - /*is_conv2d_backprop_input=*/true, /*expected_output_dims=*/{1, 2, 4}, /*expected_output=*/{0, 0, -1, 1, -2, 2, -3, 3}}, // Transpose Strided NHWC @@ -3952,7 +4315,6 @@ TEST_F(OpConverterTest, ConvertConv2D) { /*padding=*/"SAME", /*data_format=*/"NHWC", /*dilations=*/{1, 1, 1, 1}, - /*is_conv2d_backprop_input=*/true, /*expected_output_dims=*/{2, 4, 1}, /*expected_output=*/{0, 0, -1, 1, -2, 2, -3, 3}}, // Transpose Strided NHWC with VALID padding @@ -3964,41 +4326,52 @@ TEST_F(OpConverterTest, ConvertConv2D) { /*padding=*/"VALID", /*data_format=*/"NHWC", /*dilations=*/{1, 1, 1, 1}, - /*is_conv2d_backprop_input=*/true, /*expected_output_dims=*/{7, 1, 1}, /*expected_output=*/{0, 0, -1, 1, -2, 2, 0}}, - }; for (int i = 0; i < ok_params.size(); i++) { - Reset(); - NodeDef node_def = get_conv2d_nodedef( - ok_params[i].strides, ok_params[i].padding, ok_params[i].data_format, - ok_params[i].dilations, ok_params[i].is_conv2d_backprop_input); - AddTestTensor("input", ok_params[i].input_dims); - AddTestWeights("weights", ok_params[i].filter_dims, - ok_params[i].filter); - if (ok_params[i].is_conv2d_backprop_input) { - std::vector tf_input_sizes = ok_params[i].expected_output_dims; - tf_input_sizes.insert(tf_input_sizes.begin(), 1); // Add batch dimension. 
- QCHECK_EQ(4, tf_input_sizes.size()); - AddTestWeights("input_sizes", {4}, tf_input_sizes); - } - RunValidationAndConversion(node_def); - TRT_TensorOrWeights output; - TF_EXPECT_OK(GetTensorOrWeights("my_conv2d", &output)); - ASSERT_TRUE(output.is_tensor()); - ExpectTrtDimsEqualsArray(ok_params[i].expected_output_dims, - output.tensor()->getDimensions()); + for (int input_sizes_length : {2, 4}) { + Reset(); + NodeDef node_def = get_conv2d_backprop_input_nodedef( + ok_params[i].strides, ok_params[i].padding, ok_params[i].data_format, + ok_params[i].dilations); + AddTestTensor("input", ok_params[i].input_dims); + AddTestWeights("weights", ok_params[i].filter_dims, + ok_params[i].filter); - const DataVec input_data{ - {"input", test::AsTensor(ok_params[i].input)}}; - DataVec output_data{ - {"my_conv2d", - ConstructTensor(ok_params[i].expected_output.size())}}; - BuildAndRun(input_data, &output_data); - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAreArray(ok_params[i].expected_output)); + std::vector tf_input_sizes = ok_params[i].expected_output_dims; + if (input_sizes_length == 4) { + tf_input_sizes.insert(tf_input_sizes.begin(), + 1); // Add batch dimension. + QCHECK_EQ(4, tf_input_sizes.size()); + AddTestWeights("input_sizes", {4}, tf_input_sizes); + } else { + // Remove the channel dimension. + if (ok_params[i].data_format == "NHWC") { + tf_input_sizes.pop_back(); + } else { + tf_input_sizes.erase(tf_input_sizes.begin()); + } + QCHECK_EQ(2, tf_input_sizes.size()); + AddTestWeights("input_sizes", {2}, tf_input_sizes); + } + + RunValidationAndConversion(node_def); + TRT_TensorOrWeights output; + TF_EXPECT_OK(GetTensorOrWeights("my_conv2d_backprop_input", &output)); + ASSERT_TRUE(output.is_tensor()); + ExpectTrtDimsEqualsArray(ok_params[i].expected_output_dims, + output.tensor()->getDimensions()); + + const DataVec input_data{{"input", AsTensor(ok_params[i].input)}}; + DataVec output_data{ + {"my_conv2d_backprop_input", + ConstructTensor(ok_params[i].expected_output.size())}}; + TF_EXPECT_OK(BuildAndRun(input_data, &output_data)); + EXPECT_THAT(GetSpanForData(output_data[0]), + ElementsAreArray(ok_params[i].expected_output)); + } } } @@ -4323,12 +4696,11 @@ TEST_F(OpConverterTest, ConvertConv3D) { ExpectTrtDimsEqualsArray(ok_params[i].expected_output_dims, output.tensor()->getDimensions()); - const DataVec input_data{ - {"input", test::AsTensor(ok_params[i].input)}}; + const DataVec input_data{{"input", AsTensor(ok_params[i].input)}}; DataVec output_data{ {"my_conv3d", ConstructTensor(ok_params[i].expected_output.size())}}; - BuildAndRun(input_data, &output_data); + TF_EXPECT_OK(BuildAndRun(input_data, &output_data)); EXPECT_THAT(GetSpanForData(output_data[0]), ElementsAreArray(ok_params[i].expected_output)); } @@ -4511,12 +4883,11 @@ TEST_F(OpConverterTest, ConvertPool3D) { ExpectTrtDimsEqualsArray(ok_params[i].expected_output_dims, output.tensor()->getDimensions()); - const DataVec input_data{ - {"input", test::AsTensor(ok_params[i].input)}}; + const DataVec input_data{{"input", AsTensor(ok_params[i].input)}}; DataVec output_data{ {expected_node_name, ConstructTensor(ok_params[i].expected_output.size())}}; - BuildAndRun(input_data, &output_data); + TF_EXPECT_OK(BuildAndRun(input_data, &output_data)); EXPECT_THAT(GetSpanForData(output_data[0]), ElementsAreArray(ok_params[i].expected_output)); } @@ -4558,10 +4929,10 @@ TEST_F(OpConverterTest, ConvertTopK) { } const DataVec input_data{ - {"input", test::AsTensor({-9, 3, 5, 1, 6, -5, 7, 1, 0, -1})}}; + {"input", AsTensor({-9, 3, 5, 
1, 6, -5, 7, 1, 0, -1})}}; DataVec output_data{{"my_topk", ConstructTensor(4)}, {"my_topk:1", ConstructTensor(4)}}; - BuildAndRun(input_data, &output_data); + TF_EXPECT_OK(BuildAndRun(input_data, &output_data)); EXPECT_THAT(GetSpanForData(output_data[0]), ElementsAre(6, 5, 7, 1)); EXPECT_THAT(GetSpanForData(output_data[1]), @@ -4741,17 +5112,15 @@ void TestConvertGather(OpConverterTest* test) { DataVec input_data; if (ok_params[i].params_is_tensor) { - input_data = {{"params", test::AsTensor(params_input)}, - {"indices", test::AsTensor(ok_params[i].indices)}}; + input_data = {{"params", test->AsTensor(params_input)}, + {"indices", test->AsTensor(ok_params[i].indices)}}; } else { - input_data = {{"indices", test::AsTensor(ok_params[i].indices)}}; + input_data = {{"indices", test->AsTensor(ok_params[i].indices)}}; } DataVec output_data{ - {"my_gather", ConstructTensor(expected_output.size())}}; - test->BuildAndRun( - input_data, &output_data, - dtype == DT_HALF ? TrtPrecisionMode::FP16 : TrtPrecisionMode::FP32, - /*batch_size=*/expected_output_shape[0]); + {"my_gather", test->ConstructTensor(expected_output.size())}}; + TF_EXPECT_OK(test->BuildAndRun(input_data, &output_data, + /*batch_size=*/expected_output_shape[0])); EXPECT_THAT(GetSpanForData(output_data[0]), ElementsAreArray(converted_expected_output)); } @@ -4822,135 +5191,52 @@ TEST_F(OpConverterTest, ConvertGather) { TestConvertGather(this); } -TEST_F(OpConverterTest, ConvertUnary) { +NodeDef CreateCastOp(DataType tf_dtype) { + Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("input"), DT_HALF); + return ops::Cast(s.WithOpName("my_unary"), input, DT_FLOAT) + .operation.node() + ->def(); +} + +TEST_P(OpConverterTest1, ConvertUnary) { { // Input is weights, should fail. Reset(); - Scope s = Scope::NewRootScope(); - auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); - auto neg = ops::Neg(s.WithOpName("my_unary"), input); - const NodeDef& node_def = neg.operation.node()->def(); + const NodeDef node_def = CreateUnaryOp(tf_dtype); AddTestWeights("input", {1, 2, 3}, {-3, -2, -1, 0, 1, 2}); RunValidationAndConversion( node_def, error::UNIMPLEMENTED, "The input \"x\" for Neg must be a tensor, at my_unary"); } - - // Get nodedef for unary layer. 
- auto get_unary_nodedef = [](string op_name) -> NodeDef { - Scope s = Scope::NewRootScope(); - auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); - if (op_name == "Abs") { - auto unary = ops::Abs(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Acos") { - auto unary = ops::Acos(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Acosh") { - auto unary = ops::Acosh(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Asin") { - auto unary = ops::Asin(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Asinh") { - auto unary = ops::Asinh(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Atan") { - auto unary = ops::Atan(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Atanh") { - auto unary = ops::Atanh(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Ceil") { - auto unary = ops::Ceil(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Cos") { - auto unary = ops::Cos(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Cosh") { - auto unary = ops::Cosh(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Exp") { - auto unary = ops::Exp(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Floor") { - auto unary = ops::Floor(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Log") { - auto unary = ops::Log(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Neg") { - auto unary = ops::Neg(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Reciprocal") { - auto unary = ops::Reciprocal(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Rsqrt") { - auto unary = ops::Rsqrt(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Sin") { - auto unary = ops::Sin(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Sinh") { - auto unary = ops::Sinh(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Sqrt") { - auto unary = ops::Sqrt(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Tan") { - auto unary = ops::Tan(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } - EXPECT_TRUE(false); - return NodeDef(); - }; - // Get expected output for unary layer. 
- auto get_unary_output = [](string op_name, float input) -> float { - if (op_name == "Abs") { - return std::abs(input); - } else if (op_name == "Acos") { - return std::acos(input); - } else if (op_name == "Acosh") { - return std::acosh(input); - } else if (op_name == "Asin") { - return std::asin(input); - } else if (op_name == "Asinh") { - return std::asinh(input); - } else if (op_name == "Atan") { - return std::atan(input); - } else if (op_name == "Atanh") { - return std::atanh(input); - } else if (op_name == "Ceil") { - return std::ceil(input); - } else if (op_name == "Cos") { - return std::cos(input); - } else if (op_name == "Cosh") { - return std::cosh(input); - } else if (op_name == "Exp") { - return std::exp(input); - } else if (op_name == "Floor") { - return std::floor(input); - } else if (op_name == "Log") { - return std::log(input); - } else if (op_name == "Neg") { - return -input; - } else if (op_name == "Reciprocal") { - return 1.0 / input; - } else if (op_name == "Rsqrt") { - return 1.0 / std::sqrt(input); - } else if (op_name == "Sin") { - return std::sin(input); - } else if (op_name == "Sinh") { - return std::sinh(input); - } else if (op_name == "Sqrt") { - return std::sqrt(input); - } else if (op_name == "Tan") { - return std::tan(input); - } - EXPECT_TRUE(false); - return 0; - }; - + using OpFunc = std::function; + using ValFunc = float (*)(float); + std::map> op_map; +#define ADD_OP(name, op, compute) \ + op_map[name] = \ + std::make_pair(CreateUnaryOp, static_cast(compute)) + ADD_OP("Abs", ops::Abs, std::abs); + ADD_OP("Acos", ops::Acos, std::acos); + ADD_OP("Acosh", ops::Acosh, std::acosh); + ADD_OP("Asin", ops::Asin, std::asin); + ADD_OP("Asinh", ops::Asinh, std::asinh); + ADD_OP("Atan", ops::Atan, std::atan); + ADD_OP("Atanh", ops::Atanh, std::atanh); + op_map["Cast"] = std::make_pair(CreateCastOp, [](float x) { return x; }); + ADD_OP("Ceil", ops::Ceil, std::ceil); + ADD_OP("Cos", ops::Cos, std::cos); + ADD_OP("Cosh", ops::Cosh, std::cosh); + ADD_OP("Exp", ops::Exp, std::exp); + ADD_OP("Floor", ops::Floor, std::floor); + ADD_OP("Log", ops::Log, std::log); + ADD_OP("Neg", ops::Neg, [](float x) { return -x; }); + ADD_OP("Reciprocal", ops::Reciprocal, [](float x) { return 1.0f / x; }); + ADD_OP("Rsqrt", ops::Rsqrt, [](float x) { return 1.0f / std::sqrt(x); }); + ADD_OP("Sin", ops::Sin, std::sin); + ADD_OP("Sinh", ops::Sinh, std::sinh); + ADD_OP("Sqrt", ops::Sqrt, std::sqrt); + ADD_OP("Tan", ops::Tan, std::tan); +#undef ADD_OP // Get list of ops to test. std::vector ops_to_test; // Add all ops supported by ConvertUnary. @@ -4961,26 +5247,35 @@ TEST_F(OpConverterTest, ConvertUnary) { } // Add other unary ops to test. ops_to_test.push_back("Rsqrt"); - // Ok. 
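Editor's aside on the ADD_OP table introduced above: it pairs each unary op name with a NodeDef factory and a plain float reference function, so one data-driven loop replaces the long if/else chains being deleted. Below is a minimal, self-contained sketch of that table-driven pattern using only the standard library; the names (reference_fns, expected) are illustrative and not part of the patch:

#include <algorithm>
#include <cmath>
#include <functional>
#include <iostream>
#include <iterator>
#include <map>
#include <string>
#include <vector>

int main() {
  // Op name -> reference implementation, mirroring what ADD_OP fills into
  // op_map in the test.
  std::map<std::string, std::function<float(float)>> reference_fns = {
      {"Neg", [](float x) { return -x; }},
      {"Exp", [](float x) { return std::exp(x); }},
      {"Sqrt", [](float x) { return std::sqrt(x); }},
  };
  const std::vector<float> input = {-0.9f, 0.6f, 0.0f, 2.9f};
  for (const auto& entry : reference_fns) {
    std::vector<float> expected;
    std::transform(input.begin(), input.end(), std::back_inserter(expected),
                   entry.second);
    // In the real test, `expected` is what TestOpConverter compares the
    // TensorRT engine output against (via ArrayFloatNear).
    std::cout << entry.first << " -> " << expected.size() << " values\n";
  }
  return 0;
}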
+ // Prepare test parameters + auto p = TestParamBase{ + {1, 1, 2, 3}, // input dims + {}, // input partial dims + {1, 1, 2, 3}, // expected output dims + }; for (const string& op_name : ops_to_test) { + SCOPED_TRACE(op_name); Reset(); - NodeDef node_def = get_unary_nodedef(op_name); - AddTestTensor("input", {1, 2, 3}); - RunValidationAndConversion(node_def); - TRT_TensorOrWeights output; - TF_EXPECT_OK(GetTensorOrWeights("my_unary", &output)); - ASSERT_TRUE(output.is_tensor()); - ExpectTrtDimsEqualsArray({1, 2, 3}, output.tensor()->getDimensions()); - - const std::vector input = {-0.9f, 0.6f, 0.0f, -3.5f, 100.0f, 2.9f}; - const DataVec input_data{{"input", test::AsTensor(input)}}; - DataVec output_data{{"my_unary", ConstructTensor(6)}}; - BuildAndRun(input_data, &output_data); - for (int i = 0; i < input.size(); ++i) { - const float expected_output = get_unary_output(op_name, input[i]); - EXPECT_THAT(GetSpanForData(output_data[0])[i], - NanSensitiveFloatNear(expected_output, 0.0001)); + if (!op_map.count(op_name)) { + FAIL() << "Unary op test map does not contain op " << op_name; } + NodeDef node_def = op_map[op_name].first(tf_dtype); + + // TODO(bixia): we assume this test is only instantiated for DT_FLOAT for + // now. Need to find a better way to express input and output types. + // + // TODO(tfeher): improve tests by defining an expected output data type and + // check that. Currently only the shape and values of the output are + // checked. + DataType input_tf_dtype = op_name == "Cast" ? DT_HALF : tf_dtype; + + std::vector input_values{-0.9f, 0.6f, 0.0f, -3.5f, 100.0f, 2.9f}; + AddTestTensor("input", p.input_dims, input_tf_dtype, input_values); + std::vector output; + std::transform(input_values.begin(), input_values.end(), + std::back_inserter(output), op_map[op_name].second); + TestOpConverter("my_unary", node_def, p.expected_output_dims, Status::OK(), + p.runtime_status, ArrayFloatNear(output, 0.0001, true)); } } @@ -5079,14 +5374,12 @@ void TestConvertConcat(OpConverterTest* test) { for (int j = 0; j < num_inputs; ++j) { input_data.push_back( {StrCat("values_", j), - test::AsTensor(ok_params[i].input_values[j])}); + test->AsTensor(ok_params[i].input_values[j])}); } DataVec output_data{ {"my_concat", - ConstructTensor(ok_params[i].expected_output.size())}}; - test->BuildAndRun( - input_data, &output_data, - dtype == DT_HALF ? TrtPrecisionMode::FP16 : TrtPrecisionMode::FP32); + test->ConstructTensor(ok_params[i].expected_output.size())}}; + TF_EXPECT_OK(test->BuildAndRun(input_data, &output_data)); EXPECT_THAT(GetSpanForData(output_data[0]), ElementsAreArray(ok_params[i].expected_output)); } @@ -5244,16 +5537,14 @@ void TestConvertSplit(OpConverterTest* test) { outputs[j].tensor()->getDimensions()); // Create buffer to store output. output_data.push_back( - {name, - ConstructTensor(ok_params[i].expected_outputs[j].size())}); + {name, test->ConstructTensor( + ok_params[i].expected_outputs[j].size())}); } // Verify output values are correct. const DataVec input_data{ - {"value", test::AsTensor(ok_params[i].value)}}; - test->BuildAndRun( - input_data, &output_data, - dtype == DT_HALF ? 
TrtPrecisionMode::FP16 : TrtPrecisionMode::FP32); + {"value", test->AsTensor(ok_params[i].value)}}; + TF_EXPECT_OK(test->BuildAndRun(input_data, &output_data)); for (int j = 0; j < outputs.size(); ++j) { EXPECT_THAT(GetSpanForData(output_data[j]), ElementsAreArray(ok_params[i].expected_outputs[j])); @@ -5423,16 +5714,14 @@ void TestConvertUnpack(OpConverterTest* test) { outputs[j].tensor()->getDimensions()); // Create buffer to store output. output_data.push_back( - {name, - ConstructTensor(ok_params[i].expected_outputs[j].size())}); + {name, test->ConstructTensor( + ok_params[i].expected_outputs[j].size())}); } // Verify output values are correct. const DataVec input_data{ - {"value", test::AsTensor(ok_params[i].value)}}; - test->BuildAndRun( - input_data, &output_data, - dtype == DT_HALF ? TrtPrecisionMode::FP16 : TrtPrecisionMode::FP32); + {"value", test->AsTensor(ok_params[i].value)}}; + TF_EXPECT_OK(test->BuildAndRun(input_data, &output_data)); for (int j = 0; j < outputs.size(); ++j) { EXPECT_THAT(GetSpanForData(output_data[j]), ElementsAreArray(ok_params[i].expected_outputs[j])); @@ -5597,13 +5886,11 @@ void TestConvertPack(OpConverterTest* test) { DataVec input_data; for (int j = 0; j < num_inputs; ++j) { input_data.push_back({StrCat("values_", j), - test::AsTensor(params[i].input_values[j])}); + test->AsTensor(params[i].input_values[j])}); } - DataVec output_data{ - {"my_pack", ConstructTensor(params[i].expected_output.size())}}; - test->BuildAndRun( - input_data, &output_data, - dtype == DT_HALF ? TrtPrecisionMode::FP16 : TrtPrecisionMode::FP32); + DataVec output_data{{"my_pack", test->ConstructTensor( + params[i].expected_output.size())}}; + TF_EXPECT_OK(test->BuildAndRun(input_data, &output_data)); EXPECT_THAT(GetSpanForData(output_data[0]), ElementsAreArray(params[i].expected_output)); } @@ -5747,13 +6034,11 @@ void TestConvertArgMinMax(OpConverterTest* test) { output.tensor()->getDimensions()); // Create input data for tensors. const DataVec input_data{ - {"input", test::AsTensor(params[i].input_value)}}; + {"input", test->AsTensor(params[i].input_value)}}; DataVec output_data{ - {"my_arg", - ConstructTensor(params[i].expected_argmax_output.size())}}; - test->BuildAndRun( - input_data, &output_data, - dtype == DT_HALF ? TrtPrecisionMode::FP16 : TrtPrecisionMode::FP32); + {"my_arg", test->ConstructTensor( + params[i].expected_argmax_output.size())}}; + TF_EXPECT_OK(test->BuildAndRun(input_data, &output_data)); if (node_def.op() == "ArgMax") { EXPECT_THAT(GetSpanForData(output_data[0]), @@ -5849,12 +6134,10 @@ void TestConvertDepthSpaceShuffle( ExpectTrtDimsEqualsArray(params[i].expected_output_dims, output.tensor()->getDimensions()); - DataVec input_data{{"input", test::AsTensor(params[i].input_value)}}; - DataVec output_data{{"my_shuffle", ConstructTensor( + DataVec input_data{{"input", test->AsTensor(params[i].input_value)}}; + DataVec output_data{{"my_shuffle", test->ConstructTensor( params[i].expected_output.size())}}; - test->BuildAndRun( - input_data, &output_data, - dtype == DT_HALF ? 
TrtPrecisionMode::FP16 : TrtPrecisionMode::FP32); + TF_EXPECT_OK(test->BuildAndRun(input_data, &output_data)); EXPECT_THAT(GetSpanForData(output_data[0]), ElementsAreArray(params[i].expected_output)); } @@ -6127,12 +6410,10 @@ void TestConvertClipByValue(OpConverterTest* test) { EXPECT_TRUE(output.is_tensor()); ExpectTrtDimsEqualsArray(params[i].dims, output.tensor()->getDimensions()); - DataVec input_data{{"t", test::AsTensor(params[i].input_value)}}; - DataVec output_data{ - {"my_clip", ConstructTensor(params[i].expected_output.size())}}; - test->BuildAndRun( - input_data, &output_data, - dtype == DT_HALF ? TrtPrecisionMode::FP16 : TrtPrecisionMode::FP32); + DataVec input_data{{"t", test->AsTensor(params[i].input_value)}}; + DataVec output_data{{"my_clip", test->ConstructTensor( + params[i].expected_output.size())}}; + TF_EXPECT_OK(test->BuildAndRun(input_data, &output_data)); EXPECT_THAT(GetSpanForData(output_data[0]), ElementsAreArray(params[i].expected_output)); } @@ -6235,14 +6516,12 @@ void TestConvertSquaredDifference(OpConverterTest* test) { ExpectTrtDimsEqualsArray(params[i].expected_output_dims, output.tensor()->getDimensions()); - DataVec input_data{{"x", test::AsTensor(params[i].value_x)}, - {"y", test::AsTensor(params[i].value_y)}}; + DataVec input_data{{"x", test->AsTensor(params[i].value_x)}, + {"y", test->AsTensor(params[i].value_y)}}; DataVec output_data{ {"my_squared_diff", - ConstructTensor(params[i].expected_output.size())}}; - test->BuildAndRun( - input_data, &output_data, - dtype == DT_HALF ? TrtPrecisionMode::FP16 : TrtPrecisionMode::FP32); + test->ConstructTensor(params[i].expected_output.size())}}; + TF_EXPECT_OK(test->BuildAndRun(input_data, &output_data)); EXPECT_THAT(GetSpanForData(output_data[0]), ElementsAreArray(params[i].expected_output)); } @@ -6342,14 +6621,12 @@ void TestConvertResize(OpConverterTest* test) { // Create input data for tensors. const DataVec input_data{ - {"input", test::AsTensor(params[i].input_values)}}; + {"input", test->AsTensor(params[i].input_values)}}; DataVec output_data{ - {"my_resize", ConstructTensor( + {"my_resize", test->ConstructTensor( params[i].expected_nearest_output_values.size())}}; - test->BuildAndRun( - input_data, &output_data, - dtype == DT_HALF ? TrtPrecisionMode::FP16 : TrtPrecisionMode::FP32); + TF_EXPECT_OK(test->BuildAndRun(input_data, &output_data)); if (node_def.op() == "ResizeBilinear") { ExpectArrayAlmostEqual(params[i].expected_bilinear_output_values, @@ -6444,14 +6721,12 @@ void TestConvertPad(OpConverterTest* test) { // Create input data for tensors. const DataVec input_data{ - {"input", test::AsTensor(params[i].input_values)}}; + {"input", test->AsTensor(params[i].input_values)}}; DataVec output_data{ - {"my_pad", - ConstructTensor(params[i].expected_output_values.size())}}; + {"my_pad", test->ConstructTensor( + params[i].expected_output_values.size())}}; - test->BuildAndRun( - input_data, &output_data, - dtype == DT_HALF ? TrtPrecisionMode::FP16 : TrtPrecisionMode::FP32); + TF_EXPECT_OK(test->BuildAndRun(input_data, &output_data)); ExpectArrayAlmostEqual(params[i].expected_output_values, GetSpanForData(output_data[0]), CType(1e-5)); } diff --git a/tensorflow/compiler/tf2tensorrt/convert/utils.cc b/tensorflow/compiler/tf2tensorrt/convert/utils.cc index fb3ae6943d3..a4b64ec0dc5 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/utils.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/utils.cc @@ -19,6 +19,7 @@ limitations under the License. 
#include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/errors.h" namespace tensorflow { namespace tensorrt { @@ -185,6 +186,40 @@ Status TrtDimsToTensorShape(const nvinfer1::Dims trt_dims, return Status::OK(); } +Status TfTypeToTrtType(DataType tf_type, nvinfer1::DataType* trt_type) { + switch (tf_type) { + case DT_FLOAT: + *trt_type = nvinfer1::DataType::kFLOAT; + break; + case DT_HALF: + *trt_type = nvinfer1::DataType::kHALF; + break; + case DT_INT32: + *trt_type = nvinfer1::DataType::kINT32; + break; + default: + return errors::Internal("Unsupported tensorflow type"); + } + return Status::OK(); +} + +Status TrtTypeToTfType(nvinfer1::DataType trt_type, DataType* tf_type) { + switch (trt_type) { + case nvinfer1::DataType::kFLOAT: + *tf_type = DT_FLOAT; + break; + case nvinfer1::DataType::kHALF: + *tf_type = DT_HALF; + break; + case nvinfer1::DataType::kINT32: + *tf_type = DT_INT32; + break; + default: + return errors::Internal("Invalid TRT type"); + } + return Status::OK(); +} + int GetNumberOfEngineInputs(const nvinfer1::ICudaEngine* engine) { int n_bindings = engine->getNbBindings(); int n_input = 0; diff --git a/tensorflow/compiler/tf2tensorrt/convert/utils.h b/tensorflow/compiler/tf2tensorrt/convert/utils.h index 5d4cf1bb851..59eeb420134 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/utils.h +++ b/tensorflow/compiler/tf2tensorrt/convert/utils.h @@ -106,6 +106,9 @@ Status TrtDimsToTensorShape(const nvinfer1::Dims trt_dims, bool use_implicit_batch, int batch_size, TensorShape& shape); +Status TfTypeToTrtType(DataType tf_type, nvinfer1::DataType* trt_type); +Status TrtTypeToTfType(nvinfer1::DataType trt_type, DataType* tf_type); + // Returns a string that includes compile time TensorRT library version // information {Maj, Min, Patch}. string GetLinkedTensorRTVersion(); diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc index 66a1a96d96d..d9b8e198f4f 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc @@ -28,6 +28,7 @@ limitations under the License. #include "tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h" #include "tensorflow/core/common_runtime/function.h" +#include "tensorflow/core/common_runtime/graph_constructor.h" #include "tensorflow/core/common_runtime/graph_optimizer.h" #include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/graph_to_functiondef.h" @@ -35,7 +36,6 @@ limitations under the License. #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/graph/algorithm.h" -#include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/lib/core/refcount.h" #include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/lib/strings/strcat.h" @@ -569,7 +569,15 @@ void TRTEngineOp::ComputeAsync(OpKernelContext* ctx, input_concrete_shapes.push_back(ctx->input(i).shape()); } - OP_REQUIRES_OK_ASYNC(ctx, VerifyInputShapes(input_concrete_shapes), *helper); + Status verify_input_shape_status = VerifyInputShapes(input_concrete_shapes); + // TODO(bixia): Fix the segmentation. 
+ if (!verify_input_shape_status.ok()) { + LOG_FIRST_N(WARNING, 5) << "Running native segment for" << name() + << " due to failure in verifying input shapes: " + << verify_input_shape_status.error_message(); + ExecuteNativeSegment(ctx, helper); + return; + } if (!use_implicit_batch_) { if (profile_generation_mode_) { diff --git a/tensorflow/compiler/tf2tensorrt/segment/segment.cc b/tensorflow/compiler/tf2tensorrt/segment/segment.cc index 4d9dd42a53a..749335f1b09 100644 --- a/tensorflow/compiler/tf2tensorrt/segment/segment.cc +++ b/tensorflow/compiler/tf2tensorrt/segment/segment.cc @@ -21,14 +21,18 @@ limitations under the License. #include #include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" #include "tensorflow/compiler/tf2tensorrt/segment/union_find.h" +#include "tensorflow/core/common_runtime/graph_constructor.h" #include "tensorflow/core/graph/algorithm.h" #include "tensorflow/core/graph/graph.h" -#include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/gtl/flatset.h" +#include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/env_var.h" #if GOOGLE_CUDA #if GOOGLE_TENSORRT @@ -36,8 +40,11 @@ limitations under the License. namespace tensorflow { namespace tensorrt { namespace segment { +namespace { using absl::StrAppend; +using absl::StrAppendFormat; using absl::StrCat; +using absl::StrJoin; // A simple graph representation to mirror Graph. This structure // helps saving memory since segmenter modifies the graph in place, preventing @@ -240,8 +247,6 @@ struct NodePtrCompare { } }; -namespace { - // Copied from TF ReverseDFS, which only works for Graph. void StableDFS(const SimpleGraph& g, bool reverse, const std::vector& start, @@ -341,7 +346,236 @@ bool CanContractEdge(const SimpleEdge* edge, }); return !has_cycle; } -} // namespace + +// TODO(bixia): put this to a common utility file. +string TensorPropertiesToString(const OpInfo::TensorProperties& prop) { + string s = StrCat(DataTypeString(prop.dtype()), ": "); + StrAppend(&s, "["); + if (prop.shape().unknown_rank()) { + StrAppend(&s, "?"); + } else { + StrAppend(&s, StrJoin(prop.shape().dim(), ",", + [](string* out, const TensorShapeProto_Dim& d) { + StrAppendFormat(out, "%d", d.size()); + })); + } + StrAppend(&s, "]"); + return s; +} + +string TensorPropertiesToString( + const std::vector& properties) { + return StrJoin(properties, "; ", + [](string* out, const OpInfo::TensorProperties& prop) { + StrAppend(out, TensorPropertiesToString(prop)); + }); +} + +// From the given list of input properties, returns the leading shape, which is +// the shape that determines the batch size of the operation. The leading shape +// is selected from the group of input shapes with the highest rank as follows: +// . If all of those shapes have non-negative values for the batch dimension, +// the leading shape is the one with the largest value for the batch +// dimension. +// . If some or all of those shapes have negative values for the batch +// dimension, and the rest of those shapes have 1 for the batch dimension, +// the leading shape is the first of those shapes with a negative value for +// the batch dimension. +// . Otherwise, we can't determine the leading shape for the operation and +// have to exclude the operation from TRT. 
+//
+// Examples:
+// case-1: a[1,3,4] + b[2,3,4] => leading shape [2,3,4]
+// case-2: a[2,3,4] + b[scalar] => leading shape [2,3,4]
+// case-3: a[-1,3,4] + b[1,3,4] => leading shape [-1,3,4]
+// case-4: a[-1,3,4] + b[2,3,4] => no leading shape
+//
+// We have to return "no leading shape" for case-4 to exclude such an
+// operation from being translated, for this reason:
+// The actual input for "a" has to have the shape [2,3,4] for the operation
+// to be valid. On the other hand, if we translate the operation to implicit
+// batch mode, it becomes a[3,4]+b[3,4], which is valid for any input shape
+// of "a".
+//
+// This routine assumes the input program is valid. For example, we shouldn't
+// see an invalid operation like a[2,3,4] + b[3,3,4]. It also assumes that the
+// input properties are not empty and that all inputs have known shapes.
+//
+// TODO(bixia): find a way to share this knowledge with the converter.
+// TODO(bixia): investigate the use of symbolic shape analysis to improve
+// segmentation, such as by requiring the dynamic dimensions to have the same
+// negative value.
+absl::optional<const TensorShapeProto*> FindLeadingShape(
+    absl::Span<const OpInfo::TensorProperties> properties) {
+  DCHECK(!properties.empty());
+  const TensorShapeProto* result;
+  int max_batch_dim_value;
+  auto choose_shape_with_higher_rank = [&](const TensorShapeProto* s) {
+    result = s;
+    max_batch_dim_value = s->dim_size() < 1 ? 1 : s->dim(0).size();
+  };
+
+  DCHECK(!properties[0].shape().unknown_rank());
+  choose_shape_with_higher_rank(&properties[0].shape());
+
+  for (const OpInfo::TensorProperties& p : properties.subspan(1)) {
+    DCHECK(!p.shape().unknown_rank());
+    if (p.shape().dim_size() < result->dim_size()) continue;
+
+    if (p.shape().dim_size() > result->dim_size()) {
+      choose_shape_with_higher_rank(&p.shape());
+      continue;
+    }
+
+    // Among the shapes with the same rank, choose the one with a dynamic batch
+    // size. If no shapes have a dynamic batch size, choose the one with the
+    // largest size.
+    if (result->dim_size() < 1) continue;
+
+    if (p.shape().dim(0).size() < 0 || result->dim(0).size() < 0) {
+      if (p.shape().dim(0).size() < 0 && result->dim(0).size() >= 0) {
+        result = &p.shape();
+      } else {
+        max_batch_dim_value =
+            std::max(max_batch_dim_value, p.shape().dim(0).size());
+      }
+
+      continue;
+    }
+
+    if (p.shape().dim(0).size() > result->dim(0).size()) {
+      result = &p.shape();
+      max_batch_dim_value = result->dim(0).size();
+    }
+  }
+
+  if (result->dim_size() > 0 && result->dim(0).size() < 0) {
+    // dynamic batch size
+    if (max_batch_dim_value <= 1) {
+      return result;
+    } else {
+      return absl::nullopt;
+    }
+  }
+
+  return result;
+}
+
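Editor's aside: a hedged illustration of case-3 from the case-1..case-4 examples in the comment preceding FindLeadingShape, showing how the helper could be exercised in isolation. The proto setup below is the editor's sketch and is not part of the patch:

// a: [-1, 3, 4], b: [1, 3, 4] -- same rank, dynamic batch dimension on "a".
OpInfo::TensorProperties a, b;
a.mutable_shape()->add_dim()->set_size(-1);
a.mutable_shape()->add_dim()->set_size(3);
a.mutable_shape()->add_dim()->set_size(4);
b.mutable_shape()->add_dim()->set_size(1);
b.mutable_shape()->add_dim()->set_size(3);
b.mutable_shape()->add_dim()->set_size(4);
std::vector<OpInfo::TensorProperties> props = {a, b};
absl::optional<const TensorShapeProto*> leading =
    FindLeadingShape(absl::MakeSpan(props));
// Per case-3 the dynamic shape wins: leading has a value and
// (*leading)->dim(0).size() == -1. In case-4 (b with batch size 2 instead of
// 1) the result would be absl::nullopt and the op would be excluded from TRT.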
+// Returns the inputs that are relevant for determining the batch size of the
+// operation. This routine handles the following cases:
+// . Operations that support implicit broadcasting, such as Mul.
+// In this case, we need to inspect all the inputs in order to determine the
+// batch size of the operation.
+// . Special cases, such as "Conv2DBackpropInput" and "Conv3DBackpropInputV2".
+// . The batch size of an operation is determined by the first input of the
+// operation.
+absl::Span<const OpInfo::TensorProperties> GetInputsToDeterminateBatchSize(
+    const Node* node, const std::vector<OpInfo::TensorProperties>& all_inputs) {
+  // TODO(bixia): Find a way to share this knowledge with the converter.
+  static std::set<string> broadcast_supporting_ops = {
+      // ops corresponding to ConvertBinary in the converter
+      "Add",
+      "AddV2",
+      "Mul",
+      "Sub",
+      "Div",
+      "FloorDiv",
+      "RealDiv",
+      "Minimum",
+      "Maximum",
+      "Pow",
+      // other ops that need GetTrtBroadcastShape to convert
+      "BiasAdd",
+      "SquaredDifference",
+      "BatchMatMul",
+      "BatchMatMulV2",
+  };
+  const string& op = node->def().op();
+
+  if (op == "Conv2DBackpropInput" || op == "Conv3DBackpropInputV2") {
+    DCHECK_EQ(all_inputs.size(), 3);
+    return absl::MakeSpan(all_inputs).subspan(2, 1);
+  }
+
+  if (broadcast_supporting_ops.count(op)) {
+    return absl::MakeSpan(all_inputs);
+  }
+
+  // This is the common case for the operations that don't support implicit
+  // broadcasting: the first operand determines its batch size. All other
+  // cases are handled before reaching here.
+  return absl::MakeSpan(all_inputs).subspan(0, 1);
+}
+
+// Returns true if we can remove the implicit batch dimension of the
+// operation.
+//
+// In particular, if the input shape has dynamic rank or the input shape rank
+// is less than 2, we can't remove the implicit batch dimension and generate
+// a new operation for TRT translation.
+bool OperationCanBeTranslatedToImplicitBatch(
+    const grappler::GraphProperties* graph_properties, const Node* node) {
+  VLOG(3) << "process node " << node->name();
+  if (node->num_inputs() == 0) return true;
+  if (!graph_properties || !graph_properties->HasInputProperties(node->name()))
+    return false;
+
+  VLOG(3) << "input shapes "
+          << TensorPropertiesToString(
+                 graph_properties->GetInputProperties(node->name()));
+
+  const std::vector<OpInfo::TensorProperties>& all_input_properties =
+      graph_properties->GetInputProperties(node->name());
+  absl::Span<const OpInfo::TensorProperties> input_properties =
+      GetInputsToDeterminateBatchSize(node, all_input_properties);
+  if (absl::c_any_of(input_properties, [](const OpInfo::TensorProperties& p) {
+        return p.shape().unknown_rank();
+      })) {
+    return false;
+  }
+
+  absl::optional<const TensorShapeProto*> leading_shape =
+      FindLeadingShape(input_properties);
+  return leading_shape.has_value() && leading_shape.value()->dim_size() >= 2;
+}
+
+// Returns true if we can't be sure that the operand with the given properties
+// won't have negative values for non-batch dimensions.
+//
+bool HasDynamicNonBatchDimension(const OpInfo::TensorProperties& prop) {
+  const TensorShapeProto& shape = prop.shape();
+  if (shape.unknown_rank()) return true;
+
+  // Scalar is a well-specified shape, and TRT supports implicit broadcasting
+  // from scalar to other shapes.
+  if (shape.dim_size() == 0) return false;
+  for (int i = 1; i < shape.dim_size(); ++i) {
+    // The value of a dynamic dimension can be other negative values besides
+    // -1, representing the symbolic group of the dimension.
+    if (shape.dim(i).size() <= -1) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Returns true if we can't be sure that the operation won't have a dynamic
+// non-batch dimension involved. We only check the shape of the first output,
+// assuming shape inference already propagates the shapes.
+bool OperationHasDynamicNonBatchDimension(
+    const grappler::GraphProperties* graph_properties, const Node* node) {
+  VLOG(3) << "process node " << node->name();
+  // If the node doesn't have any input or output, no computation is involved.
+  if (node->num_inputs() == 0 || node->num_outputs() == 0) return false;
+
+  // If the node doesn't have output properties, return true to be conservative.
+  if (!graph_properties->HasOutputProperties(node->name())) return true;
+  VLOG(3) << "output shapes "
+          << TensorPropertiesToString(
+                 graph_properties->GetOutputProperties(node->name()));
+  return HasDynamicNonBatchDimension(
+      graph_properties->GetOutputProperties(node->name()).at(0));
+}
 
 void ContractEdge(SimpleEdge* edge, SimpleGraph* graph,
                   std::vector<const SimpleEdge*>* remove_edges) {
@@ -401,12 +635,61 @@ void ContractEdge(SimpleEdge* edge, SimpleGraph* graph,
   }
 }
 
+// Returns a batch size representation for a segment that only contains the
+// given node.
+ClusterBatchSize GetClusterBatchSizeForNode(
+    const grappler::GraphProperties* graph_properties, const Node* node,
+    bool use_implicit_batch) {
+  ClusterBatchSize cluster_batch_size;
+  if (!use_implicit_batch || !node || node->num_inputs() == 0) {
+    return cluster_batch_size;
+  }
+
+  if (!graph_properties ||
+      !graph_properties->HasInputProperties(node->name())) {
+    VLOG(3) << "doesn't have input property";
+    return cluster_batch_size.SetBatchSizeValue(-1);
+  }
+
+  const std::vector<OpInfo::TensorProperties>& input_properties =
+      graph_properties->GetInputProperties(node->name());
+  absl::optional<const TensorShapeProto*> optional_leading_shape =
+      FindLeadingShape(GetInputsToDeterminateBatchSize(node, input_properties));
+  DCHECK(optional_leading_shape.has_value());
+  const TensorShapeProto* leading_shape = optional_leading_shape.value();
+
+  DCHECK(!leading_shape->unknown_rank() && leading_shape->dim_size() >= 2);
+  return cluster_batch_size.SetBatchSizeValue(leading_shape->dim(0).size());
+}
+
+void AddSegmentForNode(const grappler::GraphProperties* graph_properties,
+                       std::vector<UnionFind<SimpleNode*>>* segments,
+                       SimpleNode* node, bool use_implicit_batch) {
+  segments->emplace_back(
+      node, GetClusterBatchSizeForNode(
+                graph_properties, node == nullptr ? nullptr : node->tf_node(),
+                use_implicit_batch));
+}
+
+}  // namespace
+
 Status SegmentGraph(const Graph* tf_graph,
+                    const grappler::GraphProperties* graph_properties,
                     const std::function<Status(const Node*)>& candidate_fn,
                     const std::function<bool(const Edge*)>& input_candidate_fn,
                     const std::function<bool(const Edge*)>& output_candidate_fn,
                     const SegmentOptions& options,
                     SegmentNodesVector* segments) {
+  if (!options.use_implicit_batch && !options.allow_dynamic_non_batch_dim) {
+    return errors::Internal(
+        "Explicit batch mode should allow dynamic non-batch dimensions");
+  }
+
+  if (!options.allow_dynamic_non_batch_dim && !graph_properties) {
+    return errors::Internal(
+        "Need graph properties to disallow dynamic non-batch dimensions");
+  }
+
   // Steps:
   // 1. run the segmentation algorithm to find all the segments, which uses
   //    candidate_fn to determine the candidates segment nodes;
@@ -422,34 +705,61 @@ Status SegmentGraph(const Graph* tf_graph,
   // for TRT.
std::unordered_set unsupported_ops; int num_unsupported_ops = 0; + + // Getting the operations blacklisted for conversion + string tftrt_op_blacklist_str; + TF_CHECK_OK( + ReadStringFromEnvVar("TF_TRT_OP_BLACKLIST", "", &tftrt_op_blacklist_str)); + + auto tftrt_op_blacklist = gtl::FlatSet{}; // non-absl ok + + for (const auto& x : str_util::Split(tftrt_op_blacklist_str, ",")) { + tftrt_op_blacklist.insert(x); + } + + // Parsing each node of the graph std::vector> node_segments; for (int i = 0; i < graph->num_node_ids(); ++i) { SimpleNode* node = graph->FindNodeId(i); - if (options.exclude_node_list.count(node->name()) != 0) { + auto exclude_node = [&](absl::string_view reason) { VLOG(1) << "Not a TF-TRT candidate, " << "(Op type: " << node->tf_node()->type_string() << "), " << "(Op name: " << node->name() << "), " - << "(Reason: excluded by segmenter option)"; + << "(Reason: " << reason << ")"; unsupported_ops.emplace(node->tf_node()->type_string()); num_unsupported_ops++; node = nullptr; + }; + if (options.exclude_node_list.count(node->name()) != 0) { + exclude_node("excluded by segmenter option"); + } else if (options.use_implicit_batch && + !OperationCanBeTranslatedToImplicitBatch(graph_properties, + node->tf_node())) { + exclude_node( + "implicit batch mode requires input shape with at least two " + "dimensions"); + } else if (!options.allow_dynamic_non_batch_dim && + OperationHasDynamicNonBatchDimension(graph_properties, + node->tf_node())) { + exclude_node("dynamic non-batch dimensions not allowed"); } else { const Status status = candidate_fn(node->tf_node()); if (!status.ok()) { - VLOG(1) << "Not a TF-TRT candidate, " - << "(Op type: " << node->tf_node()->type_string() << "), " - << "(Op name: " << node->name() << "), " - << "(Reason: " << status << ")"; - unsupported_ops.emplace(node->tf_node()->type_string()); - num_unsupported_ops++; - node = nullptr; + exclude_node(status.error_message()); + } else if (tftrt_op_blacklist.count(node->tf_node()->type_string())) { + // WARNING verbosity since the user explicitly requests this behavior. + LOG(WARNING) << "Blacklisted as TF-TRT candidate, " + << "(Op type: " << node->tf_node()->type_string() << "), " + << "(Op name: " << node->name() << ")"; + exclude_node("Blacklisted with the env var TF_TRT_OP_BLACKLIST"); } else { VLOG(2) << "Accepted as a TF-TRT candidate, " << "(Op type: " << node->tf_node()->type_string() << "), " << "(Op name: " << node->name(); } } - node_segments.emplace_back(node); + AddSegmentForNode(graph_properties, &node_segments, node, + options.use_implicit_batch); } string msg = StrCat( "There are ", num_unsupported_ops, " ops of ", unsupported_ops.size(), @@ -482,18 +792,23 @@ Status SegmentGraph(const Graph* tf_graph, return true; }); for (const SimpleNode* node : order) { - // All output nodes of 'node' have been visited... + // All output nodes of 'node' have been visited. VLOG(3) << "Trying node " << node->name() << " id=" << node->id(); - // 'node' must be a TRT candidate... + // 'node' must be a TRT candidate. if (node_segments[node->id()].Value() == nullptr) { VLOG(3) << "... not a TRT candidate"; continue; } - // Contract output edges to combine 'node' with output - // nodes. Iterate since combining two nodes may unblock other - // combining. + // Contract output edges to combine 'node' with output nodes. Repeat this + // step until no output edges can be further contracted. This is because + // contracting an output edge may unblock new edges for contracting. 
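Editor's aside on the TF_TRT_OP_BLACKLIST handling introduced near the top of this hunk: the variable is read once, split on commas, and any node whose op type appears in it is rejected as a TRT candidate with a WARNING. Below is a standalone sketch of the same parsing idea using only the standard library; the environment variable name comes from the patch, everything else here is illustrative:

#include <cstdlib>
#include <iostream>
#include <set>
#include <sstream>
#include <string>

int main() {
  // e.g. export TF_TRT_OP_BLACKLIST="Conv2D,BiasAdd"
  std::set<std::string> blacklist;
  if (const char* raw = std::getenv("TF_TRT_OP_BLACKLIST")) {
    std::stringstream ss(raw);
    std::string op;
    while (std::getline(ss, op, ',')) {
      if (!op.empty()) blacklist.insert(op);
    }
  }
  std::cout << (blacklist.count("Conv2D") ? "Conv2D is excluded from TRT"
                                          : "Conv2D may be offloaded to TRT")
            << std::endl;
  return 0;
}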
+ ClusterBatchSize expected_batch_size = + node_segments[node->id()].BatchSize(); + VLOG(3) << "batch size " << expected_batch_size; while (true) { std::set contract_edges; + // TODO(bixia): consider merging the loop to find the edges and the loop + // to contract the edges. for (const SimpleEdge* out_edge : node->out_edges()) { VLOG(3) << "... out node " << out_edge->dst()->name() << " ( " << out_edge->dst()->id() << " <- " << node->id() << " )"; @@ -501,14 +816,26 @@ Status SegmentGraph(const Graph* tf_graph, VLOG(3) << "... ... Control Edge, Skipping"; continue; } - // Out node must be TRT candidate... + // Out node must be a TRT candidate. if (node_segments[out_edge->dst()->id()].Value() == nullptr) { VLOG(3) << "... ... not a TRT candidate"; continue; } + // Out node must have compatible batch size. + ClusterBatchSize out_batch_size = + node_segments[out_edge->dst()->id()].BatchSize(); + ClusterBatchSize merged_batch_size = expected_batch_size; + if (!merged_batch_size.MergeIfCompatible(out_batch_size)) { + VLOG(3) << "... ... incompatible batch size " + << expected_batch_size.ToString() << " " + << out_batch_size.ToString(); + continue; + } if (CanContractEdge(out_edge, graph)) { - VLOG(3) << "... ... can contract"; + VLOG(3) << "... ... can contract. new batch size " + << merged_batch_size.ToString(); contract_edges.insert(out_edge); + expected_batch_size = merged_batch_size; } else { VLOG(3) << "... ... cannot contract, would form cycle"; } @@ -525,7 +852,8 @@ Status SegmentGraph(const Graph* tf_graph, VLOG(3) << "Merge " << src->name() << " <- " << dst->name() << " (" << src->id() << " <- " << dst->id(); - node_segments[src->id()].Merge(&node_segments[dst->id()]); + TF_RETURN_IF_ERROR( + node_segments[src->id()].Merge(&node_segments[dst->id()])); // Contracting the edge leaves disconnected graph edges. // Remove these from the graph and from 'contract_edges' so we @@ -539,6 +867,12 @@ Status SegmentGraph(const Graph* tf_graph, graph->RemoveEdge(r); } } + ClusterBatchSize actual_batch_size = + node_segments[node->id()].BatchSize(); + if (expected_batch_size != actual_batch_size) { + return errors::Internal( + "expected batch size is not the same as the actual batch size"); + } } } diff --git a/tensorflow/compiler/tf2tensorrt/segment/segment.h b/tensorflow/compiler/tf2tensorrt/segment/segment.h index 77c0af223c8..7295c8f0d9d 100644 --- a/tensorflow/compiler/tf2tensorrt/segment/segment.h +++ b/tensorflow/compiler/tf2tensorrt/segment/segment.h @@ -21,6 +21,7 @@ limitations under the License. #include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/grappler/costs/graph_properties.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/types.h" @@ -37,12 +38,17 @@ using SegmentNodesVector = std::vector>; struct SegmentOptions { // Segment must contain at least this many nodes. int minimum_segment_size = 2; + bool use_implicit_batch = true; + // When use_implicit_batch is false or when we are building dynamic engines, + // we allow dynamic non-batch dimensions. + bool allow_dynamic_non_batch_dim = false; std::set exclude_node_list; }; // Get the subgraphs of a graph that can be handled by TensorRT. // -// @param graph Graph of the network +// @param tf_graph Graph of the network. +// @graph_properties is the static graph properties. // @param candidate_fn A function that returns OK for a Node* if // that node can be handled by TensorRT. // @param segments Returns the TensorRT segments/subgraphs. 
Each entry @@ -50,6 +56,7 @@ struct SegmentOptions { // all the NodeDefs in that subgraph. // @return the status. Status SegmentGraph(const Graph* tf_graph, + const grappler::GraphProperties* graph_properties, const std::function& candidate_fn, const std::function& input_candidate_fn, const std::function& output_candidate_fn, diff --git a/tensorflow/compiler/tf2tensorrt/segment/segment_test.cc b/tensorflow/compiler/tf2tensorrt/segment/segment_test.cc index cb038e58126..2437481a9c4 100644 --- a/tensorflow/compiler/tf2tensorrt/segment/segment_test.cc +++ b/tensorflow/compiler/tf2tensorrt/segment/segment_test.cc @@ -42,7 +42,7 @@ class SegmentTest : public ::testing::Test { if (node_names.find(node->name()) != node_names.end()) { return Status::OK(); } - return errors::NotFound(""); + return errors::NotFound("Not a user specified candidate"); }; } @@ -60,18 +60,29 @@ class SegmentTest : public ::testing::Test { }; } - void RunTest(const Graph* graph, const std::set& candidates, + void RunTest(const Graph* graph, + const grappler::GraphProperties* graph_properties, + const std::set& candidates, const std::set& input_candidates, const std::set& output_candidates, const std::vector>& expected_segments) { SegmentNodesVector segments; - TF_EXPECT_OK(SegmentGraph(graph, MakeCandidateFn(candidates), + TF_EXPECT_OK(SegmentGraph(graph, graph_properties, + MakeCandidateFn(candidates), MakeInputEdgeCandidateFn(input_candidates), MakeOutputEdgeCandidateFn(output_candidates), - default_options_, &segments)); + segment_options_, &segments)); ValidateSegment(segments, expected_segments); } + void RunTest(const Graph* graph, const std::set& candidates, + const std::set& input_candidates, + const std::set& output_candidates, + const std::vector>& expected_segments) { + RunTest(graph, nullptr, candidates, input_candidates, output_candidates, + expected_segments); + } + void ValidateSegment(const SegmentNodesVector& segments, const std::vector>& expected_segments) { EXPECT_EQ(expected_segments.size(), segments.size()); @@ -93,7 +104,17 @@ class SegmentTest : public ::testing::Test { } } - SegmentOptions default_options_; + void DisableImplicitBatchMode() { + segment_options_.use_implicit_batch = false; + segment_options_.allow_dynamic_non_batch_dim = true; + } + + void EnableImplicitBatchModeForStaticEngine() { + segment_options_.use_implicit_batch = true; + segment_options_.allow_dynamic_non_batch_dim = false; + } + + SegmentOptions segment_options_; }; std::set operator-(const std::set& lhs, const string& rhs) { @@ -107,6 +128,7 @@ TEST_F(SegmentTest, Empty) { Graph g(OpRegistry::Global()); TF_EXPECT_OK(s.ToGraph(&g)); // Expect no segments/subgraphs. + DisableImplicitBatchMode(); RunTest(&g, {}, {}, {}, {}); } @@ -133,6 +155,7 @@ TEST_F(SegmentTest, Simple) { // All Add operations are candidates, and we expect all of them to be // collapsed into a single segment const std::set all_adds = {"add0", "add1", "add2", "add3", "add4"}; + DisableImplicitBatchMode(); RunTest(&g, all_adds, all_adds, all_adds, {all_adds}); // Make add1 not a candidate, and we expect all other Add operations to be @@ -179,6 +202,7 @@ TEST_F(SegmentTest, AvoidCycle) { // add2 is not a TRT candidate so there should be no segments generated. const std::set without_add2 = {"add0", "add1", "add3", "add4"}; + DisableImplicitBatchMode(); RunTest(&g, without_add2, without_add2, without_add2, {}); } @@ -212,6 +236,7 @@ TEST_F(SegmentTest, Multiple) { "add5", "add6", "add7", "add8"}; // Make add5 not a TRT candidate, and we expect two segments. 
auto without_add5 = all_adds - "add5"; + DisableImplicitBatchMode(); RunTest(&g, without_add5, without_add5, without_add5, {{"add0", "add1", "add2", "add3"}, {"add6", "add8"}}); @@ -258,6 +283,7 @@ TEST_F(SegmentTest, BigIfElse) { // Make add2 not a TRT candidate, and we expect 2 segments. const std::set all_adds = {"add0", "add1", "add2", "add3", "add4", "add5", "add6", "add7"}; + DisableImplicitBatchMode(); RunTest(&g, all_adds - "add2", all_adds, all_adds, {{"add0", "add1"}, {"add3", "add4", "add5", "add6", "add7"}}); } @@ -276,9 +302,221 @@ TEST_F(SegmentTest, IdentityOps) { "identity2", "identity3"}; // Identity ops are not counted as effective ops in the segment, so no segment // will be formed in this case. + DisableImplicitBatchMode(); RunTest(&g, all_identities, all_identities, all_identities, {}); } +// Testing implicit batch mode segmentation: it excludes the add-2 operation +// with a dynamic non-batch dimension. +TEST_F(SegmentTest, ExcludeAddWithDynamicNonBatchDimension) { + Scope s = Scope::NewRootScope(); + auto feed_0_shape = ops::Placeholder::Shape(PartialTensorShape({-1, 2, 3})); + auto feed_1_shape = ops::Placeholder::Shape(PartialTensorShape({-1, -1, 3})); + auto const_val = ops::Const(s, {1.0}, {}); + auto feed_0 = + ops::Placeholder(s.WithOpName("feed-1"), DT_FLOAT, feed_0_shape); + auto feed_1 = + ops::Placeholder(s.WithOpName("feed-2"), DT_FLOAT, feed_1_shape); + auto add_0 = ops::Add(s.WithOpName("add-0"), feed_0, const_val); + auto add_1 = ops::Add(s.WithOpName("add-1"), add_0, feed_0); + auto add_2 = ops::Add(s.WithOpName("add-2"), const_val, feed_1); + + grappler::GrapplerItem item; + item.fetch.push_back("add-2"); + TF_EXPECT_OK(s.ToGraphDef(&item.graph)); + + grappler::GraphProperties static_graph_properties(item); + TF_EXPECT_OK(static_graph_properties.InferStatically(true)); + + Graph g(OpRegistry::Global()); + TF_CHECK_OK( + ConvertGraphDefToGraph(GraphConstructorOptions(), item.graph, &g)); + + const std::set all_nodes = {"add-0", "add-1", "add-2"}; + EnableImplicitBatchModeForStaticEngine(); + RunTest(&g, &static_graph_properties, all_nodes, all_nodes, all_nodes, + {all_nodes - "add-2"}); +} + +// Testing implicit batch mode segmentation: It excludes the reshape operation +// with a dynamic non-batch output dimension. +// TODO(bixia): hoist the check for reshape should not change batch size from +// the converter to the segmenter and add another test case for excluding +// a reshape without dynamic dimensions involved. 
+TEST_F(SegmentTest, ExcludeReshapeWithDynamicNonBatchDimensionInOutput) { + Scope s = Scope::NewRootScope(); + auto feed_0_shape = ops::Placeholder::Shape(PartialTensorShape({-1, 2, 3})); + auto const_val = ops::Const(s, {1.0}, {}); + auto feed_0 = + ops::Placeholder(s.WithOpName("feed-1"), DT_FLOAT, feed_0_shape); + auto add_0 = ops::Add(s.WithOpName("add-0"), feed_0, const_val); + auto reshape = ops::Reshape(s.WithOpName("reshape"), add_0, Input({6, -1})); + auto add_1 = ops::Add(s.WithOpName("add-1"), reshape, const_val); + + grappler::GrapplerItem item; + item.fetch.push_back("add-1"); + TF_EXPECT_OK(s.ToGraphDef(&item.graph)); + + grappler::GraphProperties static_graph_properties(item); + TF_EXPECT_OK(static_graph_properties.InferStatically(true)); + + Graph g(OpRegistry::Global()); + TF_CHECK_OK( + ConvertGraphDefToGraph(GraphConstructorOptions(), item.graph, &g)); + + const std::set all_nodes = {"add-0", "reshape", "add-1"}; + EnableImplicitBatchModeForStaticEngine(); + RunTest(&g, &static_graph_properties, all_nodes, all_nodes, all_nodes, {}); +} + +TEST_F(SegmentTest, RankOneCannotUseImplicitBatch) { + Scope s = Scope::NewRootScope(); + auto input_0_shape = ops::Placeholder::Shape(TensorShape({3})); + auto input_1_shape = ops::Placeholder::Shape(TensorShape({3})); + auto input_0 = + ops::Placeholder(s.WithOpName("input-0"), DT_FLOAT, input_0_shape); + auto input_1 = + ops::Placeholder(s.WithOpName("input-1"), DT_FLOAT, input_1_shape); + auto const_val = ops::Const(s.WithOpName("const-scalar"), 1.0f, {}); + auto output_0 = ops::Add(s.WithOpName("output-0"), input_0, const_val); + auto output_1 = ops::Add(s.WithOpName("output-1"), input_1, const_val); + + grappler::GrapplerItem item; + item.fetch.push_back("output-0"); + item.fetch.push_back("output-1"); + TF_EXPECT_OK(s.ToGraphDef(&item.graph)); + + grappler::GraphProperties static_graph_properties(item); + TF_EXPECT_OK(static_graph_properties.InferStatically(true)); + + Graph g(OpRegistry::Global()); + TF_CHECK_OK( + ConvertGraphDefToGraph(GraphConstructorOptions(), item.graph, &g)); + + const std::set all_nodes = {"const-scalar", "output-0", "output-1"}; + EnableImplicitBatchModeForStaticEngine(); + RunTest(&g, &static_graph_properties, all_nodes, all_nodes, all_nodes, {}); +} + +TEST_F(SegmentTest, TwoChainsDiffBatchSizes) { + Scope s = Scope::NewRootScope(); + auto input_0_shape = ops::Placeholder::Shape(TensorShape({2, 3})); + auto input_1_shape = ops::Placeholder::Shape(TensorShape({5, 3})); + auto input_0 = + ops::Placeholder(s.WithOpName("input-0"), DT_FLOAT, input_0_shape); + auto input_1 = + ops::Placeholder(s.WithOpName("input-1"), DT_FLOAT, input_1_shape); + auto const_val = ops::Const(s.WithOpName("const-scalar"), 1.0f, {}); + auto output_0 = ops::Add(s.WithOpName("output-0"), input_0, const_val); + auto output_1 = ops::Add(s.WithOpName("output-1"), input_1, const_val); + + grappler::GrapplerItem item; + item.fetch.push_back("output-0"); + item.fetch.push_back("output-1"); + TF_EXPECT_OK(s.ToGraphDef(&item.graph)); + + grappler::GraphProperties static_graph_properties(item); + TF_EXPECT_OK(static_graph_properties.InferStatically(true)); + + Graph g(OpRegistry::Global()); + TF_CHECK_OK( + ConvertGraphDefToGraph(GraphConstructorOptions(), item.graph, &g)); + + const std::set all_nodes = {"const-scalar", "output-0", "output-1"}; + EnableImplicitBatchModeForStaticEngine(); + RunTest(&g, &static_graph_properties, all_nodes, all_nodes, all_nodes, + {{"output-0", "const-scalar"}}); +} + +TEST_F(SegmentTest, 
SameRankImplicitBroadcastingStaticBatchSize) { + Scope s = Scope::NewRootScope(); + auto input_0_shape = ops::Placeholder::Shape(TensorShape({2, 3, 1})); + auto input_1_shape = ops::Placeholder::Shape(TensorShape({1, 3, 4})); + auto input_2_shape = ops::Placeholder::Shape(TensorShape({2, 3, 4})); + auto input_0 = + ops::Placeholder(s.WithOpName("input-0"), DT_FLOAT, input_0_shape); + auto input_1 = + ops::Placeholder(s.WithOpName("input-1"), DT_FLOAT, input_1_shape); + auto input_2 = + ops::Placeholder(s.WithOpName("input-2"), DT_FLOAT, input_2_shape); + auto multiple = ops::Mul(s.WithOpName("multiple"), input_2, input_2); + auto output_0 = ops::Add(s.WithOpName("output-0"), input_0, multiple); + auto output_1 = ops::Add(s.WithOpName("output-1"), input_1, multiple); + + grappler::GrapplerItem item; + item.fetch.push_back("output-0"); + item.fetch.push_back("output-1"); + TF_EXPECT_OK(s.ToGraphDef(&item.graph)); + + grappler::GraphProperties static_graph_properties(item); + TF_EXPECT_OK(static_graph_properties.InferStatically(true)); + + Graph g(OpRegistry::Global()); + TF_CHECK_OK( + ConvertGraphDefToGraph(GraphConstructorOptions(), item.graph, &g)); + + const std::set all_nodes = {"multiple", "output-0", "output-1"}; + EnableImplicitBatchModeForStaticEngine(); + RunTest(&g, &static_graph_properties, all_nodes, all_nodes, all_nodes, + {all_nodes}); +} + +TEST_F(SegmentTest, SameRankImplicitBroadcastingDynamicBatchSize) { + Scope s = Scope::NewRootScope(); + auto input_0_shape = ops::Placeholder::Shape(PartialTensorShape({-1, 2})); + auto input_1_shape = ops::Placeholder::Shape(TensorShape({1, 2})); + auto input_0 = + ops::Placeholder(s.WithOpName("input-0"), DT_FLOAT, input_0_shape); + auto input_1 = + ops::Placeholder(s.WithOpName("input-1"), DT_FLOAT, input_1_shape); + auto const_val = ops::Const(s.WithOpName("const-val"), 1.0f, {1, 1}); + auto add_0 = ops::Add(s.WithOpName("add-0"), input_0, const_val); + auto output_0 = ops::Add(s.WithOpName("output-0"), input_0, add_0); + + grappler::GrapplerItem item; + item.fetch.push_back("output-0"); + TF_EXPECT_OK(s.ToGraphDef(&item.graph)); + + grappler::GraphProperties static_graph_properties(item); + TF_EXPECT_OK(static_graph_properties.InferStatically(true)); + + Graph g(OpRegistry::Global()); + TF_CHECK_OK( + ConvertGraphDefToGraph(GraphConstructorOptions(), item.graph, &g)); + + const std::set all_nodes = {"const-val", "add-0", "output-0"}; + EnableImplicitBatchModeForStaticEngine(); + RunTest(&g, &static_graph_properties, all_nodes, all_nodes, all_nodes, + {{"const-val", "add-0", "output-0"}}); +} + +TEST_F(SegmentTest, IncompatibleBatchSizes) { + Scope s = Scope::NewRootScope(); + auto input_0_shape = ops::Placeholder::Shape(PartialTensorShape({-1, 2})); + auto input_1_shape = ops::Placeholder::Shape(TensorShape({2, 2})); + auto input_0 = + ops::Placeholder(s.WithOpName("input-0"), DT_FLOAT, input_0_shape); + auto input_1 = + ops::Placeholder(s.WithOpName("input-1"), DT_FLOAT, input_1_shape); + auto const_val = ops::Const(s.WithOpName("const-val"), 1.0f, {2, 2}); + auto add_0 = ops::Add(s.WithOpName("add-0"), input_0, const_val); + auto output_0 = ops::Add(s.WithOpName("output-0"), input_0, add_0); + + grappler::GrapplerItem item; + item.fetch.push_back("output-0"); + TF_EXPECT_OK(s.ToGraphDef(&item.graph)); + + grappler::GraphProperties static_graph_properties(item); + TF_EXPECT_OK(static_graph_properties.InferStatically(true)); + + Graph g(OpRegistry::Global()); + TF_CHECK_OK( + ConvertGraphDefToGraph(GraphConstructorOptions(), 
item.graph, &g)); + + const std::set<string> all_nodes = {"const-val", "add-0", "output-0"}; + EnableImplicitBatchModeForStaticEngine(); + RunTest(&g, &static_graph_properties, all_nodes, all_nodes, all_nodes, {}); +} } // namespace test } // namespace segment } // namespace tensorrt diff --git a/tensorflow/compiler/tf2tensorrt/segment/union_find.h b/tensorflow/compiler/tf2tensorrt/segment/union_find.h index 6458ae692fd..70e83c12fca 100644 --- a/tensorflow/compiler/tf2tensorrt/segment/union_find.h +++ b/tensorflow/compiler/tf2tensorrt/segment/union_find.h @@ -16,51 +16,192 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_TF2TENSORRT_SEGMENT_UNION_FIND_H_ #define TENSORFLOW_COMPILER_TF2TENSORRT_SEGMENT_UNION_FIND_H_ +#include "absl/strings/str_format.h" +#include "absl/types/optional.h" + +#if GOOGLE_CUDA +#if GOOGLE_TENSORRT + namespace tensorflow { namespace tensorrt { namespace segment { -// Union-Find data structure. -// Each cluster has an associated value; when merging clusters we can control -// which value becomes the representative of the merged clusters. Values must be -// copyable. +// ClusterBatchSize is a data structure to record the batch size we have seen +// for a cluster during segmentation. +// +// When constructing clusters for implicit batch mode, we support clusters +// with both dynamic batch size and static batch size. We restrict nodes inside +// a cluster to either have dynamic batch size or have the same value for static +// batch size. For this reason, we use a field has_dynamic_batch_value_ to keep +// track of whether the cluster has any node with dynamic batch size. We use +// field static_batch_value_ to keep track of whether the cluster has any node +// with static batch size and what the value of the static batch size is, if any. +// Examples: +// cluster: a = a1[1,3] + a1[1,3] +// ClusterBatchSize: has_dynamic_batch_value_ = false +// static_batch_value_ = {has value, 1} +// +// cluster: b = b1[-1,3] + b2[-1, 3] +// ClusterBatchSize: has_dynamic_batch_value_ = true +// static_batch_value_ = {has no value} +// +// cluster: a = a1[1,3] + a1[1,3]; b = b1[-1,3] + b2[-1, 3] +// ClusterBatchSize: has_dynamic_batch_value_ = true +// static_batch_value_ = {has value, 1} +// +// When constructing clusters for explicit batch mode, ClusterBatchSize is +// irrelevant. +// +class ClusterBatchSize { + public: + ClusterBatchSize() + : has_dynamic_batch_value_(false), static_batch_value_(absl::nullopt) {} + + bool operator==(const ClusterBatchSize& b) { + return HasDynamicBatchValue() == b.HasDynamicBatchValue() && + static_batch_value_ == b.static_batch_value_; + } + + bool operator!=(const ClusterBatchSize& b) { return !(*this == b); } + + int GetStaticBatchValue() const { + DCHECK(HasStaticBatchValue()); + return static_batch_value_.value(); + } + + // Sets the batch size value assuming that the object doesn't have a batch + // size value yet: + // a non-negative input value representing a known batch size. + // a negative input value representing a dynamic batch size.
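// As a reference point, the two-field representation documented above can be
// modeled with a small stand-alone struct (std::optional instead of
// absl::optional; this is an illustration, not the ClusterBatchSize class
// defined here):
#include <iostream>
#include <optional>

struct BatchSizeModel {
  bool has_dynamic = false;          // any node with batch dimension -1 seen
  std::optional<int> static_value;   // unique static batch value, if any

  // Record one more node's batch dimension, as SetBatchSizeValue does.
  void Add(int batch_dim) {
    if (batch_dim < 0) has_dynamic = true;
    else static_value = batch_dim;
  }
};

int main() {
  BatchSizeModel a;          // a1[1,3] + a1[1,3]
  a.Add(1); a.Add(1);
  BatchSizeModel b;          // b1[-1,3] + b2[-1,3]
  b.Add(-1); b.Add(-1);
  // Prints "0 1" (no dynamic node, static value 1) and "1 -2" (dynamic, none).
  std::cout << a.has_dynamic << " " << a.static_value.value_or(-2) << "\n";
  std::cout << b.has_dynamic << " " << b.static_value.value_or(-2) << "\n";
}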
+ ClusterBatchSize SetBatchSizeValue(int value) { + if (value < 0) { + has_dynamic_batch_value_ = true; + return *this; + } + static_batch_value_ = value; + return *this; + } + + bool MergeIfCompatible(const ClusterBatchSize& b) { + bool is_compatible = MergeIfCompatible(b.static_batch_value_); + if (!is_compatible) return false; + + if (!HasDynamicBatchValue() && b.HasDynamicBatchValue()) { + has_dynamic_batch_value_ = true; + } + + return true; + } + + // Returns a string for the batch size value. If the object has a static + // batch size value, return a string for the value. If the object has a + // dynamic size value, return -1. Otherwise, returns -2 to represent that + // a batch size hasn't been set yet. + string ToString() const { + string s; + absl::StrAppendFormat(&s, "batch_size=(%d,%d,", HasDynamicBatchValue(), + HasStaticBatchValue()); + if (HasStaticBatchValue()) { + absl::StrAppendFormat(&s, "%d", GetStaticBatchValue()); + } + absl::StrAppend(&s, ")"); + return s; + } + + private: + bool HasStaticBatchValue() const { return static_batch_value_.has_value(); } + bool HasDynamicBatchValue() const { return has_dynamic_batch_value_; } + + private: + bool MergeIfCompatible(const absl::optional& b) { + bool is_compatible = !HasStaticBatchValue() || !b.has_value() || + GetStaticBatchValue() == b.value(); + if (!is_compatible) { + return false; + } + if (!HasStaticBatchValue() && b.has_value()) { + static_batch_value_ = b; + } + return true; + } + + private: + // To track whether the cluster has any node with dynamic batch size. + bool has_dynamic_batch_value_; + // To track whether the cluster has any node with static batch size, and the + // unique value for static batch size. + absl::optional static_batch_value_; +}; + +inline std::ostream& operator<<(std::ostream& os, + const ClusterBatchSize& batch_size) { + return os << batch_size.ToString(); +} + +// Represents a disjoint set of copyable values with type T. We use this data +// structure to construct clusters for TRTEngineOp. As such, this data structure +// has a field to record the batch size for the current cluster and merges the +// corresponding batch sizes when merging two clusters. Most of the methods in +// this class are side-effecting as they also compress the path from the object +// to the parent of its containing set. template class UnionFind { public: UnionFind() : size_(1), parent_(nullptr) {} - explicit UnionFind(const T& v) : size_(1), parent_(nullptr), value_(v) {} + explicit UnionFind(const T& v, ClusterBatchSize batch_size) + : size_(1), + cluster_batch_size_(batch_size), + parent_(nullptr), + value_(v) {} - // Returns the number of elements in a cluster. + // Returns the number of elements in the cluster and compresses the path from + // this object to the root of the cluster. int Size() { return FindRoot()->size_; } - // Merges this cluster with 'other'. This cluster's value becomes - // the value of the merged cluster; the value of 'other' is ignored. - void Merge(UnionFind* other); + // Returns the batch size of the cluster and compress the path from this + // object to the root object. + ClusterBatchSize BatchSize() { return FindRoot()->cluster_batch_size_; } - // Each cluster has an associated value. Retrieves the value associated - // with this cluster. + // Merges this cluster with 'other'. This cluster's size_ is updated to + // the size of the merged cluster; the size_ of 'other' becomes inaccessible + // as only the size_ of the root object is accessible. 
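// A stand-alone reminder of the union-find mechanics documented above: FindRoot
// compresses the path, and per-cluster data (here just the size) stays accurate
// only at the root after a merge. Simplified; this is not the template defined
// below and it omits the batch-size bookkeeping.
#include <iostream>
#include <vector>

struct DSU {
  std::vector<int> parent, size;
  explicit DSU(int n) : parent(n), size(n, 1) {
    for (int i = 0; i < n; ++i) parent[i] = i;
  }
  int FindRoot(int x) {
    if (parent[x] != x) parent[x] = FindRoot(parent[x]);  // path compression
    return parent[x];
  }
  void Merge(int a, int b) {
    a = FindRoot(a); b = FindRoot(b);
    if (a == b) return;
    parent[b] = a;   // b's tree hangs under a; only a's size is kept current
    size[a] += size[b];
  }
};

int main() {
  DSU dsu(4);
  dsu.Merge(0, 1);
  dsu.Merge(1, 2);
  std::cout << dsu.size[dsu.FindRoot(2)] << "\n";  // 3
}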
+ Status Merge(UnionFind* other); + + // Retrieves the value for the root of the cluster. T& ParentValue() { return FindRoot()->value_; } - // Get the original value of this node. + // Returns the value for the object. T& Value() { return value_; } private: - // Finds the root element of the cluster. Performs path compression. + // Returns the root object for the cluster and compresses the path from this + // object to the root object. UnionFind* FindRoot(); int size_; + ClusterBatchSize cluster_batch_size_; UnionFind* parent_; T value_; }; template -void UnionFind::Merge(UnionFind* other) { +Status UnionFind::Merge(UnionFind* other) { UnionFind* a = FindRoot(); UnionFind* b = other->FindRoot(); - if (a == b) return; + if (a == b) return Status::OK(); + ClusterBatchSize batch_size = a->cluster_batch_size_; + bool merged = batch_size.MergeIfCompatible(other->cluster_batch_size_); + if (!merged) { + return errors::Internal("trying to merge incompatible cluster."); + } + + a->cluster_batch_size_ = batch_size; b->parent_ = a; a->size_ += b->size_; + return Status::OK(); } template @@ -76,4 +217,7 @@ UnionFind* UnionFind::FindRoot() { } // namespace tensorrt } // namespace tensorflow +#endif // GOOGLE_TENSORRT +#endif // GOOGLE_CUDA + #endif // TENSORFLOW_COMPILER_TF2TENSORRT_SEGMENT_UNION_FIND_H_ diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD index a5332385994..55341c0a01f 100644 --- a/tensorflow/compiler/tf2xla/BUILD +++ b/tensorflow/compiler/tf2xla/BUILD @@ -81,7 +81,7 @@ tf_portable_proto_library( name = "portable_tf2xla_proto", config_string = "allow_all:true", header_outs = ["//tensorflow/compiler/tf2xla/tf2xla.proto.h"], - portable_deps = ["//tensorflow/core:portable_proto_lib_full_runtime"], + portable_deps = ["//tensorflow/core:portable_proto_lib"], proto_deps = [ ":tf2xla_proto", "//tensorflow/core:protos_all", @@ -182,6 +182,7 @@ cc_library( "//tensorflow/core:protos_all_cc", "@com_google_absl//absl/strings", "@llvm-project//mlir:IR", + "@llvm-project//mlir:Shape", "@llvm-project//mlir:StandardOps", ], ) @@ -703,12 +704,8 @@ cc_library( deps = [ "//tensorflow/compiler/mlir:mlir_graph_optimization_pass", "//tensorflow/compiler/mlir/tensorflow", - "//tensorflow/compiler/mlir/tensorflow:convert_graphdef", - "//tensorflow/compiler/mlir/tensorflow:device_util", - "//tensorflow/compiler/mlir/tensorflow:dump_mlir_util", - "//tensorflow/compiler/mlir/tensorflow:mlir_roundtrip_flags", "//tensorflow/core:core_cpu", - "@com_google_absl//absl/container:flat_hash_set", + "//tensorflow/core:lib", "@llvm-project//llvm:support", ], alwayslink = 1, diff --git a/tensorflow/compiler/tf2xla/cpu_function_runtime_test.cc b/tensorflow/compiler/tf2xla/cpu_function_runtime_test.cc index f06665dad56..8dede16c332 100644 --- a/tensorflow/compiler/tf2xla/cpu_function_runtime_test.cc +++ b/tensorflow/compiler/tf2xla/cpu_function_runtime_test.cc @@ -29,6 +29,8 @@ TEST(XlaCompiledCpuFunctionTest, AlignmentValue) { // generated code, on the relation: buffer_size >= 16 ? 2 * sizeof(void*) : 8 // So any value that we choose must abide by that constraint as well. 
EXPECT_EQ(xla::cpu_function_runtime::kAlign, Allocator::kAllocatorAlignment); + EXPECT_LE(xla::cpu_function_runtime::kMinAlign, + Allocator::kAllocatorAlignment); } std::vector SizesToBufferInfos(const intptr_t* sizes, size_t n) { diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc index 033dae2292d..2fcfd20f49f 100644 --- a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc +++ b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc @@ -30,13 +30,13 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/tf2xla_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/core/common_runtime/function.h" +#include "tensorflow/core/common_runtime/graph_constructor.h" #include "tensorflow/core/common_runtime/graph_optimizer.h" #include "tensorflow/core/common_runtime/process_function_library_runtime.h" #include "tensorflow/core/framework/graph_to_functiondef.h" #include "tensorflow/core/framework/node_def_builder.h" #include "tensorflow/core/graph/algorithm.h" #include "tensorflow/core/graph/control_flow.h" -#include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/graph/node_builder.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/gtl/cleanup.h" diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc index 8702adf43a7..8f53d227249 100644 --- a/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc +++ b/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc @@ -25,10 +25,10 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/test_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/core/common_runtime/function.h" +#include "tensorflow/core/common_runtime/graph_constructor.h" #include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/node_def_util.h" #include "tensorflow/core/framework/op.h" -#include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/graph/graph_def_builder.h" #include "tensorflow/core/graph/validate.h" #include "tensorflow/core/lib/core/status_test_util.h" diff --git a/tensorflow/compiler/tf2xla/graph_compiler.cc b/tensorflow/compiler/tf2xla/graph_compiler.cc index eadd05fcee0..b6e84eabe8d 100644 --- a/tensorflow/compiler/tf2xla/graph_compiler.cc +++ b/tensorflow/compiler/tf2xla/graph_compiler.cc @@ -18,6 +18,7 @@ limitations under the License. #include #include #include + #include "tensorflow/compiler/tf2xla/const_analysis.h" #include "tensorflow/compiler/tf2xla/literal_util.h" #include "tensorflow/compiler/tf2xla/shape_util.h" @@ -32,6 +33,7 @@ limitations under the License. #include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/common_runtime/executor.h" #include "tensorflow/core/common_runtime/function.h" +#include "tensorflow/core/common_runtime/graph_constructor.h" #include "tensorflow/core/common_runtime/graph_optimizer.h" #include "tensorflow/core/framework/attr_value.pb.h" #include "tensorflow/core/framework/attr_value_util.h" @@ -39,7 +41,6 @@ limitations under the License. 
#include "tensorflow/core/framework/node_def_util.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/graph/algorithm.h" -#include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/graph/node_builder.h" #include "tensorflow/core/graph/validate.h" #include "tensorflow/core/lib/core/errors.h" diff --git a/tensorflow/compiler/tf2xla/graph_compiler_util.cc b/tensorflow/compiler/tf2xla/graph_compiler_util.cc index 814ebe39e6d..a9385e05564 100644 --- a/tensorflow/compiler/tf2xla/graph_compiler_util.cc +++ b/tensorflow/compiler/tf2xla/graph_compiler_util.cc @@ -24,13 +24,13 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/functionalize_control_flow.h" #include "tensorflow/compiler/tf2xla/tf2xla_util.h" #include "tensorflow/core/common_runtime/function.h" +#include "tensorflow/core/common_runtime/graph_constructor.h" #include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/framework/graph_def_util.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/versions.pb.h" #include "tensorflow/core/graph/algorithm.h" -#include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/graph/node_builder.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/util/dump_graph.h" @@ -49,10 +49,12 @@ typedef std::unordered_map NodeMap; // Each feed id identifies the positional output of some node, which may consist // of multiple edges. AddPlaceholdersForFeeds has already replaced each fed // tensor with a placeholder. For each feed tensor, replaces all edges so they -// point from a new _Arg node instead. +// point from a new _Arg node instead. The newly created _Arg nodes are added to +// `arg_nodes`. Status AddArgNodes(Graph* graph, const NodeMap& node_map, const protobuf::RepeatedPtrField& feeds, - const std::unordered_map& feed_remapping) { + const std::unordered_map& feed_remapping, + std::unordered_set* arg_nodes) { for (int arg_index = 0; arg_index < feeds.size(); ++arg_index) { const tf2xla::Feed& feed = feeds[arg_index]; // All feeds have been replaced by placeholders. @@ -86,6 +88,7 @@ Status AddArgNodes(Graph* graph, const NodeMap& node_map, .Attr(kShapeAttr, TensorShape(feed.shape())) .Attr(kDebugNameAttr, feed.name()) .Finalize(graph, &arg_node)); + arg_nodes->insert(arg_node); // Collects out-edges from the feed node that have a matching edge index; // these will be replaced with edges from the arg node instead. @@ -149,13 +152,13 @@ Status RewriteAndPruneGraph( for (Node* n : graph->nodes()) { node_map[n->name()] = n; } + std::unordered_set nodes_to_keep; + TF_RETURN_IF_ERROR(AddArgNodes(graph, node_map, config.feed(), feed_remapping, + &nodes_to_keep)); TF_RETURN_IF_ERROR( - AddArgNodes(graph, node_map, config.feed(), feed_remapping)); - std::unordered_set retval_nodes; - TF_RETURN_IF_ERROR( - AddRetvalNodes(graph, node_map, config.fetch(), &retval_nodes)); + AddRetvalNodes(graph, node_map, config.fetch(), &nodes_to_keep)); VLOG(2) << "Post rewrite: " << DumpGraphToFile("tf2xla_post_rewrite", *graph); - PruneForReverseReachability(graph, std::move(retval_nodes)); + PruneForReverseReachability(graph, std::move(nodes_to_keep)); FixupSourceAndSinkEdges(graph); VLOG(2) << "Post prune: " << DumpGraphToFile("tfcompile_post_prune", *graph); // Sanity-check, to make sure the feeds and fetches still exist post-pruning. 
@@ -277,8 +280,16 @@ Status InitGraph(const GraphDef& graph_def, const tf2xla::Config& config, // Prune the GraphDef first so that unknown ops that we aren't compiling get // filtered out. GraphDef second_copy_def; + // Add the placeholder nodes as "fetches" in prune_config, such that they will + // be preserved in PruneGraphDefInto. + auto prune_config = config; + for (const auto& entry : feed_remapping) { + auto ph = prune_config.add_fetch(); + *ph->mutable_id()->mutable_node_name() = entry.second; + ph->mutable_id()->set_output_index(0); + } TF_RETURN_IF_ERROR( - PruneGraphDefInto(config, first_copy_def, &second_copy_def)); + PruneGraphDefInto(prune_config, first_copy_def, &second_copy_def)); TF_RETURN_IF_ERROR(AddDefaultAttrsToGraphDef( &second_copy_def, *g->op_registry(), /*node_offset=*/0)); diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD index dbb420b14fd..bfdfe38305b 100644 --- a/tensorflow/compiler/tf2xla/kernels/BUILD +++ b/tensorflow/compiler/tf2xla/kernels/BUILD @@ -39,6 +39,7 @@ tf_kernel_library( "elu_op.cc", "elu_op.h", "empty_op.cc", + "ensure_shape_op.cc", "extract_image_patches_op.cc", "fake_param_op.cc", "fake_quantize_ops.cc", @@ -102,6 +103,7 @@ tf_kernel_library( "spacetodepth_op.cc", "sparse_to_dense_op.cc", "split_op.cc", + "spmd_manual_sharding_ops.cc", "stack_ops.cc", "stateful_random_ops.cc", "stateless_random_ops.cc", diff --git a/tensorflow/compiler/tf2xla/kernels/categorical_op.cc b/tensorflow/compiler/tf2xla/kernels/categorical_op.cc index dad310911a0..7e8d3d7002a 100644 --- a/tensorflow/compiler/tf2xla/kernels/categorical_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/categorical_op.cc @@ -109,7 +109,7 @@ class CategoricalOp : public XlaOpKernel { /*axis=*/class_dimension); } else { argmax = xla::ArgMax(softmax_entries, xla_output_type, - /*axis=*/class_dimension); + /*axis=*/class_dimension, /*stable=*/true); } if (num_samples == 1) { diff --git a/tensorflow/compiler/tf2xla/kernels/dynamic_slice_ops.cc b/tensorflow/compiler/tf2xla/kernels/dynamic_slice_ops.cc index bb2c0d9ddb8..5dbc083368c 100644 --- a/tensorflow/compiler/tf2xla/kernels/dynamic_slice_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/dynamic_slice_ops.cc @@ -28,6 +28,15 @@ limitations under the License. 
namespace tensorflow { namespace { +absl::InlinedVector SliceVector(xla::XlaOp input, int64 rank) { + absl::InlinedVector scalar_indices; + scalar_indices.reserve(rank); + for (int i = 0; i < rank; i++) + scalar_indices.push_back( + xla::Reshape(xla::Slice(input, {i}, {i + 1}, {1}), {})); + return scalar_indices; +} + class DynamicUpdateSliceOp : public XlaOpKernel { public: explicit DynamicUpdateSliceOp(OpKernelConstruction* context) @@ -41,21 +50,23 @@ class DynamicUpdateSliceOp : public XlaOpKernel { const TensorShape update_shape = ctx->InputShape("update"); const TensorShape index_shape = ctx->InputShape("indices"); + int64 rank = input_shape.dims(); OP_REQUIRES( ctx, TensorShapeUtils::IsVector(index_shape) && - index_shape.num_elements() == input_shape.dims(), + index_shape.num_elements() == rank, errors::InvalidArgument("index must be a vector with length equal to " "the number of input dimensions")); OP_REQUIRES( - ctx, input_shape.dims() == update_shape.dims(), + ctx, rank == update_shape.dims(), errors::InvalidArgument("input and update must have the same rank," " input shape is ", input_shape.DebugString(), "; update shape is ", update_shape.DebugString())); + xla::XlaOp indices = ctx->Input("indices"); xla::XlaOp result = xla::DynamicUpdateSlice( - ctx->Input("input"), ctx->Input("update"), ctx->Input("indices")); + ctx->Input("input"), ctx->Input("update"), SliceVector(indices, rank)); ctx->SetOutput(0, result); } }; @@ -76,17 +87,18 @@ class DynamicSliceOp : public XlaOpKernel { const TensorShape start_indices_shape = ctx->InputShape("start_indices"); const TensorShape size_indices_shape = ctx->InputShape("size_indices"); + int64 rank = input_shape.dims(); OP_REQUIRES(ctx, TensorShapeUtils::IsVector(start_indices_shape) && - start_indices_shape.num_elements() == input_shape.dims(), + start_indices_shape.num_elements() == rank, errors::InvalidArgument( "start_indices must be a vector with length equal to " "input rank, but input rank is ", - input_shape.dims(), " and start_indices has shape ", + rank, " and start_indices has shape ", start_indices_shape.DebugString())); OP_REQUIRES(ctx, TensorShapeUtils::IsVector(size_indices_shape) && - size_indices_shape.num_elements() == input_shape.dims(), + size_indices_shape.num_elements() == rank, errors::InvalidArgument( "size_indices must be a vector with length equal to " "input rank, but input rank is ", @@ -96,8 +108,10 @@ class DynamicSliceOp : public XlaOpKernel { std::vector size_indices; OP_REQUIRES_OK( ctx, ctx->ConstantInputAsIntVector("size_indices", &size_indices)); + + xla::XlaOp start_indices = ctx->Input("start_indices"); xla::XlaOp result = xla::DynamicSlice( - ctx->Input("input"), ctx->Input("start_indices"), size_indices); + ctx->Input("input"), SliceVector(start_indices, rank), size_indices); ctx->SetOutput(0, result); } }; diff --git a/tensorflow/compiler/tf2xla/kernels/ensure_shape_op.cc b/tensorflow/compiler/tf2xla/kernels/ensure_shape_op.cc new file mode 100644 index 00000000000..8221327d36f --- /dev/null +++ b/tensorflow/compiler/tf2xla/kernels/ensure_shape_op.cc @@ -0,0 +1,59 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// XLA-specific ensure_shape Op. + +#include "tensorflow/compiler/tf2xla/type_util.h" +#include "tensorflow/compiler/tf2xla/xla_helpers.h" +#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/literal.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor.h" + +namespace tensorflow { +namespace { + +class EnsureShapeOp : public XlaOpKernel { + public: + explicit EnsureShapeOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("shape", &expected_shape_)); + } + + void Compile(XlaOpKernelContext* ctx) override { + const TensorShape shape = ctx->InputShape(0); + + // Validate the shape. + OP_REQUIRES( + ctx, expected_shape_.IsCompatibleWith(shape), + errors::InvalidArgument("Shape of tensor ", this->def().input(0), " ", + shape.DebugString(), + " is not compatible with expected shape ", + expected_shape_.DebugString(), ".")); + + // If the shape is compatible, output the tensor. + ctx->SetOutput(0, ctx->Input(0)); + } + + private: + PartialTensorShape expected_shape_; +}; + +REGISTER_XLA_OP(Name("EnsureShape"), EnsureShapeOp); + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/fake_param_op.cc b/tensorflow/compiler/tf2xla/kernels/fake_param_op.cc index ec3463bd58f..ba9e406312d 100644 --- a/tensorflow/compiler/tf2xla/kernels/fake_param_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/fake_param_op.cc @@ -20,6 +20,7 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/lib/constants.h" #include "tensorflow/core/framework/kernel_def_builder.h" +#include "tensorflow/core/framework/tensor_shape.h" namespace tensorflow { @@ -29,7 +30,8 @@ class XlaFakeParamOp : public XlaOpKernel { public: explicit XlaFakeParamOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { DataType dtype; - TensorShape tensor_shape; + // Tensor shape can be unknown.
+ PartialTensorShape tensor_shape; OP_REQUIRES_OK(ctx, ctx->GetAttr("dtype", &dtype)); OP_REQUIRES_OK(ctx, ctx->GetAttr("shape", &tensor_shape)); OP_REQUIRES_OK(ctx, TensorShapeToXLAShape(dtype, tensor_shape, &shape_)); diff --git a/tensorflow/compiler/tf2xla/kernels/index_ops.cc b/tensorflow/compiler/tf2xla/kernels/index_ops.cc index 219dc738eaa..31637d9d8a0 100644 --- a/tensorflow/compiler/tf2xla/kernels/index_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/index_ops.cc @@ -77,7 +77,7 @@ void XlaArgMinMaxOp::Compile(XlaOpKernelContext* ctx) { if (is_gpu_) { output = xla::ArgMaxTwoPass(input, index_xla_type, axis); } else { - output = xla::ArgMax(input, index_xla_type, axis); + output = xla::ArgMax(input, index_xla_type, axis, /*stable=*/true); } } @@ -86,8 +86,7 @@ void XlaArgMinMaxOp::Compile(XlaOpKernelContext* ctx) { XlaArgMaxOp::XlaArgMaxOp(OpKernelConstruction* ctx) : XlaArgMinMaxOp(ctx, /*is_min=*/false) {} -REGISTER_XLA_OP(Name("ArgMax") - .CompileTimeConstantInput("dimension"), +REGISTER_XLA_OP(Name("ArgMax").CompileTimeConstantInput("dimension"), XlaArgMaxOp); namespace { diff --git a/tensorflow/compiler/tf2xla/kernels/random_ops.cc b/tensorflow/compiler/tf2xla/kernels/random_ops.cc index 1ccf0b4b125..3acb1d3359b 100644 --- a/tensorflow/compiler/tf2xla/kernels/random_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/random_ops.cc @@ -18,6 +18,7 @@ limitations under the License. // TODO(misard,phawkins): add tests. #include "tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h" +#include "tensorflow/compiler/tf2xla/lib/broadcast.h" #include "tensorflow/compiler/tf2xla/lib/random.h" #include "tensorflow/compiler/tf2xla/lib/util.h" #include "tensorflow/compiler/tf2xla/shape_util.h" @@ -337,13 +338,20 @@ class ParameterizedTruncatedNormalOp : public XlaOpKernel { "reproducible behavior is desired."; xla::XlaOp uniform = xla::RngUniform(min_positive, one, xla_shape); - xla::XlaOp means = ctx->Input(1); - xla::XlaOp stddevs = ctx->Input(2); - xla::XlaOp minvals = ctx->Input(3); - xla::XlaOp maxvals = ctx->Input(4); + auto result = b->ReportErrorOrReturn([&]() -> xla::StatusOr { + TF_ASSIGN_OR_RETURN(xla::XlaOp means, + BroadcastTo(ctx->Input(1), shape.dim_sizes())); + TF_ASSIGN_OR_RETURN(xla::XlaOp stddevs, + BroadcastTo(ctx->Input(2), shape.dim_sizes())); + TF_ASSIGN_OR_RETURN(xla::XlaOp minvals, + BroadcastTo(ctx->Input(3), shape.dim_sizes())); + TF_ASSIGN_OR_RETURN(xla::XlaOp maxvals, + BroadcastTo(ctx->Input(4), shape.dim_sizes())); + return ParameterizedTruncatedNormal(uniform, means, stddevs, minvals, + maxvals); + }); - ctx->SetOutput(0, ParameterizedTruncatedNormal(uniform, means, stddevs, - minvals, maxvals)); + ctx->SetOutput(0, result); } }; diff --git a/tensorflow/compiler/tf2xla/kernels/shape_op.cc b/tensorflow/compiler/tf2xla/kernels/shape_op.cc index 5d2b08f424c..85917af6a65 100644 --- a/tensorflow/compiler/tf2xla/kernels/shape_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/shape_op.cc @@ -274,10 +274,23 @@ class ZerosLikeOp : public XlaOpKernel { auto list_shape_or = ctx->builder()->GetShape(list); OP_REQUIRES_OK(ctx, list_shape_or.status()); + const xla::Shape& list_shape = list_shape_or.ValueOrDie(); + std::vector> list_dynamic_dims; + list_dynamic_dims.reserve(list_shape.tuple_shapes_size() - 1); + for (int64 i = 0; i < list_shape.tuple_shapes_size() - 1; ++i) { + // Set dynamic dimension size to 0 for initialization value. 
+ std::vector dynamic_dims; + const xla::Shape& shape = list_shape.tuple_shapes(i); + auto sub_element = xla::GetTupleElement(list, i); + for (int64 dim = 0; dim < shape.dimensions_size(); ++dim) { + dynamic_dims.push_back(xla::GetDimensionSize(sub_element, dim)); + } + list_dynamic_dims.push_back(dynamic_dims); + } xla::XlaOp new_list; OP_REQUIRES_OK( - ctx, CreateZerosTensorListWithShape( - ctx->builder(), list_shape_or.ValueOrDie(), &new_list)); + ctx, CreateZerosTensorListWithShape(ctx->builder(), list_shape, + list_dynamic_dims, &new_list)); xla::XlaOp push_index; OP_REQUIRES_OK(ctx, GetTensorListPushIndex(list, &push_index)); @@ -287,10 +300,20 @@ class ZerosLikeOp : public XlaOpKernel { SetTensorListPushIndex(new_list, push_index, &result)); ctx->SetTensorListOutput(0, result); } else { - const TensorShape input_shape = ctx->InputShape(0); - auto zero = XlaHelpers::Zero(ctx->builder(), input_type(0)); - ctx->SetOutput(0, xla::Broadcast(zero, input_shape.dim_sizes())); + xla::XlaOp input = ctx->Input(0); + auto input_shape = ctx->InputXlaShape(0).ValueOrDie(); + auto result = xla::Broadcast(zero, input_shape.dimensions()); + + // Setting up dynamic dimensions of the broadcast. + for (int64 i = 0; i < input_shape.dimensions_size(); ++i) { + if (input_shape.is_dynamic_dimension(i)) { + xla::XlaOp input_dynamic_dim = xla::GetDimensionSize(input, i); + result = xla::SetDimensionSize(result, input_dynamic_dim, i); + } + } + + ctx->SetOutput(0, result); } } }; diff --git a/tensorflow/compiler/tf2xla/kernels/slice_op.cc b/tensorflow/compiler/tf2xla/kernels/slice_op.cc index 17d0b87edda..7f274c6b00f 100644 --- a/tensorflow/compiler/tf2xla/kernels/slice_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/slice_op.cc @@ -42,19 +42,17 @@ class SliceOp : public XlaOpKernel { const TensorShape begin_tensor_shape = ctx->InputShape(1); const TensorShape size_tensor_shape = ctx->InputShape(2); + const int input_dims = input_shape.dims(); OP_REQUIRES( ctx, TensorShapeUtils::IsVector(begin_tensor_shape) && TensorShapeUtils::IsVector(size_tensor_shape) && - begin_tensor_shape.num_elements() == input_shape.dims() && - size_tensor_shape.num_elements() == input_shape.dims(), + begin_tensor_shape.num_elements() == input_dims && + size_tensor_shape.num_elements() == input_dims, errors::InvalidArgument( "Expected begin and size arguments to be 1-D tensors of size ", - input_shape.dims(), ", but got shapes ", - begin_tensor_shape.DebugString(), " and ", - size_tensor_shape.DebugString(), " instead.")); - - const int input_dims = input_shape.dims(); + input_dims, ", but got shapes ", begin_tensor_shape.DebugString(), + " and ", size_tensor_shape.DebugString(), " instead.")); std::vector begin; std::vector size; @@ -129,7 +127,15 @@ class SliceOp : public XlaOpKernel { input_shape.dim_size(i), "], but ", "got ", size[i])); } - ctx->SetOutput(0, xla::DynamicSlice(ctx->Input(0), ctx->Input(1), size)); + + absl::InlinedVector scalar_indices; + scalar_indices.reserve(input_dims); + xla::XlaOp begin = ctx->Input("begin"); + for (int i = 0; i < input_dims; i++) + scalar_indices.push_back( + xla::Reshape(xla::Slice(begin, {i}, {i + 1}, {1}), {})); + + ctx->SetOutput(0, xla::DynamicSlice(ctx->Input(0), scalar_indices, size)); } } }; diff --git a/tensorflow/compiler/tf2xla/kernels/spmd_manual_sharding_ops.cc b/tensorflow/compiler/tf2xla/kernels/spmd_manual_sharding_ops.cc new file mode 100644 index 00000000000..cd28fe8fa3f --- /dev/null +++ b/tensorflow/compiler/tf2xla/kernels/spmd_manual_sharding_ops.cc @@ -0,0 +1,147 @@ 
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/tf2xla/shape_util.h" +#include "tensorflow/compiler/tf2xla/xla_helpers.h" +#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/util.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/framework/op_kernel.h" + +namespace tensorflow { +namespace { + +class XlaSpmdFullToShardShapeOp : public XlaOpKernel { + public: + explicit XlaSpmdFullToShardShapeOp(OpKernelConstruction* ctx) + : XlaOpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("manual_sharding", &manual_sharding_str_)); + } + + ~XlaSpmdFullToShardShapeOp() override = default; + + void Compile(XlaOpKernelContext* ctx) override { + xla::XlaOp input = ctx->Input(0); + auto input_shape_or = ctx->InputXlaShape(0); + OP_REQUIRES_OK(ctx, input_shape_or.status()); + xla::OpSharding sharding; + if (!sharding.ParseFromString(manual_sharding_str_)) { + OP_REQUIRES_OK(ctx, + xla::InvalidArgument("manual_sharding attribute was not a " + "valid encoded xla::OpSharding " + "proto.")); + } + auto output_shape = input_shape_or.ValueOrDie(); + int64 rank = output_shape.rank(); + if (sharding.type() == xla::OpSharding::OTHER) { + for (int64 i = 0; i < rank; ++i) { + int64 partitions_i = sharding.tile_assignment_dimensions(i); + if (partitions_i == 1) continue; + int64 dim_size = + xla::CeilOfRatio(output_shape.dimensions(i), partitions_i); + output_shape.set_dimensions(i, dim_size); + } + } + xla::XlaOp input_annotation; + { + // Annotate the full-shape input with the manual sharding. + xla::XlaScopedShardingAssignment assign_sharding(ctx->builder(), + sharding); + input_annotation = + xla::CustomCall(ctx->builder(), /*call_target_name=*/"Sharding", + {input}, input_shape_or.ValueOrDie()); + } + + { + // Annotate the shard-shape output with replicated sharding, so that the + // partitioner will leave it as is. 
+ xla::OpSharding replicated; + replicated.set_type(xla::OpSharding::REPLICATED); + xla::XlaScopedShardingAssignment assign_sharding(ctx->builder(), + replicated); + auto output = xla::CustomCall(ctx->builder(), + /*call_target_name=*/"SPMDFullToShardShape", + {input_annotation}, output_shape); + ctx->SetOutput(0, output); + } + } + + private: + string manual_sharding_str_; + TF_DISALLOW_COPY_AND_ASSIGN(XlaSpmdFullToShardShapeOp); +}; + +class XlaSpmdShardToFullShapeOp : public XlaOpKernel { + public: + explicit XlaSpmdShardToFullShapeOp(OpKernelConstruction* ctx) + : XlaOpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("full_shape", &full_shape_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("manual_sharding", &manual_sharding_str_)); + } + + ~XlaSpmdShardToFullShapeOp() override = default; + + void Compile(XlaOpKernelContext* ctx) override { + xla::XlaOp input = ctx->Input(0); + auto input_shape_or = ctx->InputXlaShape(0); + OP_REQUIRES_OK(ctx, input_shape_or.status()); + auto output_shape = TensorShapeToXLAShape( + input_shape_or.ValueOrDie().element_type(), full_shape_); + + xla::OpSharding sharding; + if (!sharding.ParseFromString(manual_sharding_str_)) { + OP_REQUIRES_OK(ctx, + xla::InvalidArgument("manual_sharding attribute was not a " + "valid encoded xla::OpSharding " + "proto.")); + } + xla::XlaOp input_annotation; + { + // Annotate the shard-shape input with replicated sharding, so that the + // partitioner will leave it as is. + xla::OpSharding replicated; + replicated.set_type(xla::OpSharding::REPLICATED); + xla::XlaScopedShardingAssignment assign_sharding(ctx->builder(), + replicated); + input_annotation = + xla::CustomCall(ctx->builder(), /*call_target_name=*/"Sharding", + {input}, input_shape_or.ValueOrDie()); + } + + { + // Annotate the full-shape output with the manual sharding. 
+ xla::XlaScopedShardingAssignment assign_sharding(ctx->builder(), + sharding); + ctx->SetOutput( + 0, xla::CustomCall(ctx->builder(), + /*call_target_name=*/"SPMDShardToFullShape", + {input_annotation}, output_shape)); + } + } + + private: + TensorShape full_shape_; + string manual_sharding_str_; + TF_DISALLOW_COPY_AND_ASSIGN(XlaSpmdShardToFullShapeOp); +}; + +REGISTER_XLA_OP(Name("XlaSpmdFullToShardShape"), XlaSpmdFullToShardShapeOp); +REGISTER_XLA_OP(Name("XlaSpmdShardToFullShape"), XlaSpmdShardToFullShapeOp); + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc b/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc index 9093175af75..2684c982600 100644 --- a/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc @@ -202,7 +202,7 @@ class StridedSliceOp : public XlaOpKernel { ctx, output_elements == input_elements_sliced, errors::InvalidArgument( "The number of output elements ", output_elements, - " has to equal to number of input elements that are sliced ", + " has to equal to number of input elements that are sliced ", input_elements_sliced, " when input indices are not constant.")); for (int64 i = 0; i < ctx->InputShape("begin").dims(); ++i) { diff --git a/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc b/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc index 4af3d4233dd..fa5a96ca6bd 100644 --- a/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc @@ -44,6 +44,36 @@ namespace tensorflow { namespace { +// GetTensorListDynamicDims collects the dynamic dimensions that a tensorlist +// may carry and returns them in a 2D vector: int64[ElementSize][DimSize]. If a +// dimension is static, a constant dimension is returned. +xla::StatusOr>> GetTensorListDynamicDims( + XlaOpKernelContext* ctx, const xla::Shape& element_shape, + const xla::Shape& list_shape, int64 num_elements) { + std::vector dynamic_sizes; + ctx->set_dynamic_dimension_is_minus_one(true); + // The multiplier can be a dynamic value. + TF_RETURN_IF_ERROR(ctx->ConstantInputAsIntVector(0, &dynamic_sizes)); + std::vector> list_dynamic_dims; + // Set dynamic dimension size to 0 for initialization value. + std::vector dynamic_dims; + // Leading dim is a static dimension. + dynamic_dims.push_back(xla::ConstantR0(ctx->builder(), num_elements)); + for (int64 dim = 0; dim < element_shape.dimensions_size(); ++dim) { + if (ctx->is_dynamic_dimension(dynamic_sizes[dim])) { + auto dynamic_dim_size = xla::Slice(ctx->Input(0), {dim}, {dim + 1}, {1}); + dynamic_dim_size = xla::Reshape(dynamic_dim_size, {}); + dynamic_dim_size = xla::ConvertElementType(dynamic_dim_size, xla::S32); + dynamic_dims.push_back(dynamic_dim_size); + } else { + dynamic_dims.push_back( + xla::ConstantR0(ctx->builder(), dynamic_sizes[dim])); + } + } + list_dynamic_dims.push_back(dynamic_dims); + return list_dynamic_dims; +} + class TensorListLengthOp : public XlaOpKernel { public: explicit TensorListLengthOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} @@ -124,10 +154,14 @@ class TensorListReserveOp : public XlaOpKernel { xla::Shape list_shape; OP_REQUIRES_OK(ctx, GetTensorListShapeFromElementShape( element_shape, num_elements, &list_shape)); - + // Set up dynamic dimension sizes to create the zero tensor. 
+ auto list_dynamic_dims_or = GetTensorListDynamicDims( + ctx, element_shape, list_shape, num_elements); + OP_REQUIRES_OK(ctx, list_dynamic_dims_or.status()); xla::XlaOp new_list; OP_REQUIRES_OK(ctx, CreateZerosTensorListWithShape( - ctx->builder(), list_shape, &new_list)); + ctx->builder(), list_shape, + list_dynamic_dims_or.ValueOrDie(), &new_list)); xla::XlaOp result; OP_REQUIRES_OK( ctx, @@ -185,10 +219,16 @@ class EmptyTensorListOp : public XlaOpKernel { xla::Shape list_shape; OP_REQUIRES_OK(ctx, GetTensorListShapeFromElementShape( element_shape, max_num_elements, &list_shape)); + // Set up dynamic dimension sizes to create the zero tensor. + auto list_dynamic_dims_or = GetTensorListDynamicDims( + ctx, element_shape, list_shape, max_num_elements); + OP_REQUIRES_OK(ctx, list_dynamic_dims_or.status()); xla::XlaOp result; OP_REQUIRES_OK(ctx, CreateZerosTensorListWithShape( - ctx->builder(), list_shape, &result)); + ctx->builder(), list_shape, + list_dynamic_dims_or.ValueOrDie(), &result)); + ctx->SetTensorListOutput(0, result); return; } diff --git a/tensorflow/compiler/tf2xla/kernels/tensor_list_utils.cc b/tensorflow/compiler/tf2xla/kernels/tensor_list_utils.cc index 6020b002f10..aa71e4d4364 100644 --- a/tensorflow/compiler/tf2xla/kernels/tensor_list_utils.cc +++ b/tensorflow/compiler/tf2xla/kernels/tensor_list_utils.cc @@ -16,6 +16,7 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/kernels/tensor_list_utils.h" #include "tensorflow/compiler/tf2xla/shape_util.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/shape.h" #include "tensorflow/compiler/xla/shape_util.h" @@ -247,19 +248,29 @@ Status GetTensorListShapeFromElementShape(const xla::Shape& element_shape, return Status::OK(); } -Status CreateZerosTensorListWithShape(xla::XlaBuilder* b, - const xla::Shape& list_shape, - xla::XlaOp* list) { +Status CreateZerosTensorListWithShape( + xla::XlaBuilder* b, const xla::Shape& list_shape, + const std::vector>& dynamic_dims, + xla::XlaOp* list) { int tuple_size = xla::ShapeUtil::TupleElementCount(list_shape); std::vector elements; - for (int i = 0; i < tuple_size; i++) { + TF_RET_CHECK(dynamic_dims.size() == tuple_size - 1); + for (int i = 0; i < tuple_size - 1; i++) { const xla::Shape& shape = xla::ShapeUtil::GetTupleElementShape(list_shape, i); xla::XlaOp zero = xla::ConstantLiteral(b, xla::LiteralUtil::Zero(shape.element_type())); xla::XlaOp zeros = xla::Broadcast(zero, shape.dimensions()); + TF_RET_CHECK(dynamic_dims[i].size() == shape.dimensions_size()); + for (int64 dim = 0; dim < shape.dimensions_size(); ++dim) { + zeros = xla::SetDimensionSize(zeros, dynamic_dims[i][dim], dim); + } elements.push_back(zeros); } + // List size (last item) has to be S32. 
+ TF_RET_CHECK(xla::ShapeUtil::GetTupleElementShape(list_shape, tuple_size - 1) + .element_type() == xla::S32); + elements.push_back(xla::ConstantLiteral(b, xla::LiteralUtil::Zero(xla::S32))); *list = xla::Tuple(b, elements); return Status::OK(); } @@ -272,12 +283,12 @@ Status GetInitializedTensorListForElement(xla::XlaOp list, xla::XlaOp element, xla::XlaBuilder* b = list.builder(); xla::Shape list_shape; + TF_ASSIGN_OR_RETURN(xla::Shape element_shape, b->GetShape(element)); + if (element_is_tensor_list) { - TF_ASSIGN_OR_RETURN(xla::Shape element_shape, b->GetShape(element)); TF_RETURN_IF_ERROR(GetTensorListShapeFromElementTensorListShape( element_shape, leading_dim, &list_shape)); } else { - TF_ASSIGN_OR_RETURN(xla::Shape element_shape, b->GetShape(element)); TF_RETURN_IF_ERROR(GetTensorListShapeFromElementShape( element_shape, leading_dim, &list_shape)); } @@ -295,7 +306,27 @@ Status GetInitializedTensorListForElement(xla::XlaOp list, xla::XlaOp element, *initialized_list = list; return Status::OK(); } else { - return CreateZerosTensorListWithShape(b, list_shape, initialized_list); + // Prepare dynamic dimension dimensions for zero tensor list. The dynamic + // sizes are created by reading the dynamic dimension size of sub-elements. + std::vector> list_dynamic_dims; + for (int64 i = 0; i < list_shape.tuple_shapes_size() - 1; ++i) { + std::vector dynamic_dims; + const xla::Shape& shape = list_shape.tuple_shapes(i); + // Leading dim is a static dimension. + dynamic_dims.push_back(xla::ConstantR0(b, leading_dim)); + xla::XlaOp sub_element; + if (element_is_tensor_list) { + sub_element = xla::GetTupleElement(element, i); + } else { + sub_element = element; + } + for (int64 dim = 0; dim < shape.dimensions_size() - 1; ++dim) { + dynamic_dims.push_back(xla::GetDimensionSize(sub_element, dim)); + } + list_dynamic_dims.push_back(dynamic_dims); + } + return CreateZerosTensorListWithShape(b, list_shape, list_dynamic_dims, + initialized_list); } } @@ -473,7 +504,13 @@ Status ExecuteTensorListGetItem(xla::XlaOp list, xla::XlaOp index, xla::XlaOp list_part = xla::GetTupleElement(list, 0); xla::XlaOp read = xla::DynamicSlice(list_part, start_indices, slice_shape); - + for (int64 i = 0; i < buffer_shape.dimensions_size(); ++i) { + if (buffer_shape.is_dynamic_dimension(i)) { + auto buffer = xla::GetTupleElement(list, 0); + auto gds = xla::GetDimensionSize(buffer, i); + read = xla::SetDimensionSize(read, gds, i); + } + } slice_shape.erase(slice_shape.begin()); *result = xla::Reshape(read, slice_shape); return Status::OK(); diff --git a/tensorflow/compiler/tf2xla/kernels/tensor_list_utils.h b/tensorflow/compiler/tf2xla/kernels/tensor_list_utils.h index 7fac2d9dbab..ef3c8badf71 100644 --- a/tensorflow/compiler/tf2xla/kernels/tensor_list_utils.h +++ b/tensorflow/compiler/tf2xla/kernels/tensor_list_utils.h @@ -74,9 +74,9 @@ Status GetTensorListShapeFromElementShape(const xla::Shape& element_shape, xla::Shape* tensor_list_shape); // Returns a TensorList filled by zeros with the given shape. -Status CreateZerosTensorListWithShape(xla::XlaBuilder* b, - const xla::Shape& list_shape, - xla::XlaOp* list); +Status CreateZerosTensorListWithShape( + xla::XlaBuilder* b, const xla::Shape& list_shape, + const std::vector>& dynamic_dims, xla::XlaOp* list); // If the TensorList is initialized, check that its shape matches element shape; // If the TensorList is uninitialized, initialize it with the element shape. 
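// As context for the tensor-list changes above: the list is laid out as a
// tuple of per-element buffers plus a trailing S32 counter, and the zero
// initializer now also receives one size operand per dimension so dynamic
// dimensions can be set. A rough stand-alone model of that layout (plain
// vectors instead of xla::XlaOp values; illustrative only):
#include <cstdint>
#include <iostream>
#include <vector>

struct FakeTensorList {
  // One flat zero-filled buffer per tuple element, sized from the given dims.
  std::vector<std::vector<float>> buffers;
  int32_t push_index = 0;  // models the trailing S32 element of the tuple
};

FakeTensorList CreateZeros(const std::vector<std::vector<int>>& dims_per_elem) {
  FakeTensorList list;
  for (const auto& dims : dims_per_elem) {
    int64_t total = 1;
    for (int d : dims) total *= d;  // dynamic dims arrive as concrete sizes here
    list.buffers.emplace_back(total, 0.0f);
  }
  return list;
}

int main() {
  // e.g. a list reserved for 2 elements of shape [3, 4].
  FakeTensorList list = CreateZeros({{2, 3, 4}});
  std::cout << list.buffers[0].size() << " " << list.push_index << "\n";  // 24 0
}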
diff --git a/tensorflow/compiler/tf2xla/kernels/while_op.cc b/tensorflow/compiler/tf2xla/kernels/while_op.cc index 21568a196ba..fe7a5898011 100644 --- a/tensorflow/compiler/tf2xla/kernels/while_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/while_op.cc @@ -510,8 +510,25 @@ void XlaWhileOp::Compile(XlaOpKernelContext* ctx) { // first compilation and the body/cond was recompiled with the updated // shape/datatype of the list. if (input_shape != list_shape) { - OP_REQUIRES_OK(ctx, CreateZerosTensorListWithShape( - ctx->builder(), list_shape, &inputs[i])); + // Prepare dynamic dimensions for element shapes. + std::vector<std::vector<xla::XlaOp>> list_dynamic_dims; + for (int64 i = 0; i < list_shape.tuple_shapes_size() - 1; ++i) { + // Set dynamic dimension size to 0 for the initialization value. + std::vector<xla::XlaOp> dynamic_dims; + const xla::Shape& shape = list_shape.tuple_shapes(i); + for (int64 dim = 0; dim < shape.dimensions_size(); ++dim) { + int32 dim_size = shape.dimensions(dim); + if (shape.is_dynamic_dimension(dim)) { + dim_size = 0; + } + dynamic_dims.push_back( + xla::ConstantR0<int32>(ctx->builder(), dim_size)); + } + list_dynamic_dims.push_back(dynamic_dims); + } + OP_REQUIRES_OK( + ctx, CreateZerosTensorListWithShape(ctx->builder(), list_shape, + list_dynamic_dims, &inputs[i])); } else { inputs[i] = ctx->Input(input_num); } diff --git a/tensorflow/compiler/tf2xla/mlir_bridge_pass.cc b/tensorflow/compiler/tf2xla/mlir_bridge_pass.cc index 6d0d569724f..c398e5f129e 100644 --- a/tensorflow/compiler/tf2xla/mlir_bridge_pass.cc +++ b/tensorflow/compiler/tf2xla/mlir_bridge_pass.cc @@ -18,10 +18,18 @@ limitations under the License. #include #include "tensorflow/compiler/mlir/tensorflow/transforms/bridge.h" +#include "tensorflow/core/lib/monitoring/gauge.h" #include "tensorflow/core/public/session_options.h" namespace tensorflow { +auto* mlir_bridge_gauge_v1 = monitoring::Gauge<bool, 0>::New( + "/tensorflow/config/experimental/enable_mlir_bridge_gauge_v1", + "Tracks usage of the MLIR-based TF2XLA bridge among TF1 models"); +auto* mlir_bridge_gauge_v2 = monitoring::Gauge<bool, 0>::New( + "/tensorflow/config/experimental/enable_mlir_bridge_gauge_v2", + "Tracks usage of the MLIR-based TF2XLA bridge among TF2 models"); + // This runs the first phase of the "bridge", transforming the graph in a form // that can be executed with delegation of some computations to an accelerator.
// This builds on the model of XLA where a subset of the graph is encapsulated @@ -31,11 +39,13 @@ namespace tensorflow { Status MlirBridgePass::Run(const ConfigProto& config_proto, mlir::ModuleOp module) { if (!config_proto.experimental().enable_mlir_bridge()) { - VLOG(1) << "Skipping MLIR Bridge Pass, session flag not enabled"; + VLOG(0) << "Skipping MLIR TPU Bridge, session flag not enabled"; + mlir_bridge_gauge_v2->GetCell()->Set(false); return Status::OK(); } - VLOG(1) << "Running MLIR Bridge Pass"; + VLOG(0) << "Running MLIR TPU Bridge"; + mlir_bridge_gauge_v2->GetCell()->Set(true); TF_RETURN_IF_ERROR( mlir::TFTPU::TPUBridge(module, /*enable_logging=*/VLOG_IS_ON(1))); @@ -47,11 +57,13 @@ Status MlirBridgeV1CompatPass::Run(const GraphOptimizationPassOptions& options, if (options.is_function_graph) return Status::OK(); if (!options.session_options->config.experimental().enable_mlir_bridge()) { - VLOG(1) << "Skipping MLIR Bridge V1 Compat Pass, session flag not enabled"; + VLOG(0) << "Skipping MLIR TPU Bridge V1 Compat, session flag not enabled"; + mlir_bridge_gauge_v1->GetCell()->Set(false); return Status::OK(); } - VLOG(1) << "Running MLIR Bridge V1 Compat Pass"; + VLOG(0) << "Running MLIR TPU Bridge V1 Compat"; + mlir_bridge_gauge_v1->GetCell()->Set(true); TF_RETURN_IF_ERROR( mlir::TFTPU::TPUBridgeV1Compat(module, /*enable_logging=*/VLOG_IS_ON(1))); diff --git a/tensorflow/compiler/tf2xla/mlir_tf2xla.cc b/tensorflow/compiler/tf2xla/mlir_tf2xla.cc index daf261fa5d8..43793be56a7 100644 --- a/tensorflow/compiler/tf2xla/mlir_tf2xla.cc +++ b/tensorflow/compiler/tf2xla/mlir_tf2xla.cc @@ -20,6 +20,7 @@ limitations under the License. #include #include +#include "mlir/Dialect/Shape/IR/Shape.h" // from @llvm-project #include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project #include "mlir/IR/Dialect.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" @@ -95,6 +96,7 @@ static void RegisterDialects() { mlir::registerDialect(); mlir::registerDialect(); mlir::registerDialect(); + mlir::registerDialect(); return true; }(); (void)init_once; diff --git a/tensorflow/compiler/tf2xla/ops/xla_ops.cc b/tensorflow/compiler/tf2xla/ops/xla_ops.cc index 6b71cca9c2a..862da1f3f95 100644 --- a/tensorflow/compiler/tf2xla/ops/xla_ops.cc +++ b/tensorflow/compiler/tf2xla/ops/xla_ops.cc @@ -648,6 +648,62 @@ This op has better TPU performance since it doesn't have explicitly reshape and transpose operations as tf.einsum does. 
)doc"); +REGISTER_OP("XlaSpmdFullToShardShape") + .Input("input: T") + .Output("output: T") + .Attr("T: type") + .Attr("manual_sharding: string") + .SetShapeFn([](shape_inference::InferenceContext* c) { + auto input_handle = c->input(0); + if (!c->RankKnown(input_handle)) { + return shape_inference::UnknownShape(c); + } + string sharding_attr; + TF_RETURN_IF_ERROR(c->GetAttr("manual_sharding", &sharding_attr)); + std::vector dims; + for (int64 i = 0; i < c->Rank(input_handle); ++i) { + auto dim = c->Value(c->Dim(input_handle, i)); + xla::OpSharding sharding; + sharding.ParseFromString(sharding_attr); + int64 partitions_i = sharding.tile_assignment_dimensions(i); + if (dim != shape_inference::InferenceContext::kUnknownDim && + sharding.type() == xla::OpSharding::OTHER && partitions_i != 1) { + dim = (dim + partitions_i - 1) / partitions_i; + } + dims.push_back(c->MakeDim(dim)); + } + c->set_output(0, c->MakeShape(dims)); + return Status::OK(); + }) + .Doc(R"doc( +An op used by XLA SPMD partitioner to switch from automatic partitioning to +manual partitioning. It annotates the input (full-shape, to be automatically +partitioned) with the same sharding used by manual partitioning, and outputs a +shard-shaped tensor to be consumed by later manually-partitioned ops. If the +shape is not evenly partitionable, the padding region will be masked with 0s. +)doc"); + +REGISTER_OP("XlaSpmdShardToFullShape") + .Input("input: T") + .Output("output: T") + .Attr("T: type") + .Attr("manual_sharding: string") + .Attr("full_shape: shape") + .SetShapeFn([](shape_inference::InferenceContext* c) { + TensorShape shape_attr; + TF_RETURN_IF_ERROR(c->GetAttr("full_shape", &shape_attr)); + shape_inference::ShapeHandle s; + TF_RETURN_IF_ERROR(c->MakeShapeFromTensorShape(shape_attr, &s)); + c->set_output(0, s); + return Status::OK(); + }) + .Doc(R"doc( +An op used by XLA SPMD partitioner to switch from manual partitioning to +automatic partitioning. It converts the shard-shaped, manually partitioned input +into full-shaped tensor to be partitioned automatically with the same sharding +used by manual partitioning. 
+)doc"); + REGISTER_OP("XlaSharding") .Input("input: T") .Output("output: T") @@ -674,7 +730,7 @@ REGISTER_OP("XlaGather") .Attr("T: numbertype") .Attr("Tindices: {int32, int64}") .Output("output: T") - .SetShapeFn(UnchangedRank) + .SetShapeFn(shape_inference::UnknownShape) .Doc(R"doc( Wraps the XLA Gather operator documented at https://www.tensorflow.org/xla/operation_semantics#gather diff --git a/tensorflow/compiler/tf2xla/python/xla.py b/tensorflow/compiler/tf2xla/python/xla.py index 0df61da57a3..c59c47e92fb 100644 --- a/tensorflow/compiler/tf2xla/python/xla.py +++ b/tensorflow/compiler/tf2xla/python/xla.py @@ -418,6 +418,26 @@ def _sharding_grad(op, grad): return [grad] +spmd_full_to_shard_shape = gen_xla_ops.xla_spmd_full_to_shard_shape +spmd_shard_to_full_shape = gen_xla_ops.xla_spmd_shard_to_full_shape + + +@ops.RegisterGradient("XlaSpmdFullToShardShape") +def _spmd_full_to_shard_shape_grad(op, grad): + s2f = gen_xla_ops.xla_spmd_shard_to_full_shape( + grad, + manual_sharding=op.get_attr("manual_sharding"), + full_shape=op.inputs[0].shape.as_list()) + return [s2f] + + +@ops.RegisterGradient("XlaSpmdShardToFullShape") +def _spmd_shard_to_full_shape_grad(op, grad): + f2s = gen_xla_ops.xla_spmd_full_to_shard_shape( + grad, manual_sharding=op.get_attr("manual_sharding")) + return [f2s] + + sort = gen_xla_ops.xla_sort key_value_sort = gen_xla_ops.xla_key_value_sort while_loop = gen_xla_ops.xla_while diff --git a/tensorflow/compiler/tf2xla/shape_util.cc b/tensorflow/compiler/tf2xla/shape_util.cc index 8997b2f5c68..2fce6e7f0c7 100644 --- a/tensorflow/compiler/tf2xla/shape_util.cc +++ b/tensorflow/compiler/tf2xla/shape_util.cc @@ -98,6 +98,43 @@ Status XLAShapeToTensorShape(const xla::Shape& shape, return Status::OK(); } +// Convert a TensorShape into the equivalent XLA Shape proto. +Status TensorShapeToXLAShape(DataType dtype, + const PartialTensorShape& tensor_shape, + xla::Shape* shape) { + xla::PrimitiveType type; + TF_RETURN_IF_ERROR(DataTypeToPrimitiveType(dtype, &type)); + *shape = TensorShapeToXLAShape(type, tensor_shape); + return Status::OK(); +} + +xla::Shape TensorShapeToXLAShape(xla::PrimitiveType type, + const PartialTensorShape& tensor_shape) { + if (tensor_shape.unknown_rank()) { + // For unknown shape, create a rank 1 size 0 tensor. + return xla::ShapeUtil::MakeShapeWithLayout(type, {0}, {0}); + } + int rank = tensor_shape.dims(); + std::vector dimensions(rank); + std::vector dynamic_dimensions(rank, false); + std::vector layout(rank); + for (int d = 0; d < rank; ++d) { + dimensions[d] = tensor_shape.dim_size(d); + if (dimensions[d] < 0) { + dynamic_dimensions[d] = true; + } + } + // XLA uses minor-to-major; Tensorflow uses major-to-minor. + std::iota(layout.rbegin(), layout.rend(), 0); + xla::Shape result = + xla::ShapeUtil::MakeShapeWithLayout(type, dimensions, layout); + + for (int64 d = 0; d < rank; ++d) { + result.set_dynamic_dimension(d, dynamic_dimensions[d]); + } + return result; +} + // Convert a TensorShape into the equivalent XLA Shape proto. 
Status TensorShapeToXLAShape(DataType dtype, const TensorShape& tensor_shape, xla::Shape* shape) { diff --git a/tensorflow/compiler/tf2xla/shape_util.h b/tensorflow/compiler/tf2xla/shape_util.h index 331cfa38c1d..438df7ecb18 100644 --- a/tensorflow/compiler/tf2xla/shape_util.h +++ b/tensorflow/compiler/tf2xla/shape_util.h @@ -44,6 +44,17 @@ Status TensorShapeToXLAShape(DataType dtype, const TensorShape& tensor_shape, xla::Shape TensorShapeToXLAShape(xla::PrimitiveType type, const TensorShape& tensor_shape); +// Convert a PartialTensorShape into the equivalent XLA Shape proto. A shape +// with unknown rank is represented by an r1 with empty dimension. +Status TensorShapeToXLAShape(DataType dtype, + const PartialTensorShape& tensor_shape, + xla::Shape* shape); + +// Convert a PartialTensorShape into the equivalent XLA Shape proto. A shape +// with unknown rank is represented by an r1 with empty dimension. +xla::Shape TensorShapeToXLAShape(xla::PrimitiveType type, + const PartialTensorShape& tensor_shape); + // Given an XLA shape with layouts, builds a layout vector in the form able to // be fed to ops like InfeedEnqueue/InfeedEnqueueTuple/XRTAllocateV2/.... // THe returned vector is a linearized sequence of the minor-to-major values of diff --git a/tensorflow/compiler/tf2xla/tf2xla_test.cc b/tensorflow/compiler/tf2xla/tf2xla_test.cc index 24afe595b18..7ea69f734c9 100644 --- a/tensorflow/compiler/tf2xla/tf2xla_test.cc +++ b/tensorflow/compiler/tf2xla/tf2xla_test.cc @@ -99,5 +99,42 @@ TEST(ConvertGraphDefToXla, Sum) { ConvertGraphDefToXla(graph_def, config, client, &computation))); } +TEST(ConvertGraphDefToXla, SumWithUnusedArgument) { + GraphDef graph_def = SumGraph(); + tf2xla::Config config = SumConfig(); + NodeDef* unused = graph_def.add_node(); + unused->set_name("unused"); + unused->set_op("Placeholder"); + (*unused->mutable_attr())["dtype"] = TypeAttrValue(DT_INT32); + config.add_feed()->mutable_id()->set_node_name("unused"); + + xla::LocalClient* client = xla::ClientLibrary::LocalClientOrDie(); + xla::XlaComputation computation; + TF_EXPECT_OK(ConvertGraphDefToXla(graph_def, config, client, &computation)); + + // Set up arguments. + auto x_literal = xla::LiteralUtil::CreateR0<int32>(10); + auto y_literal = xla::LiteralUtil::CreateR0<int32>(32); + auto x_global_or = client->TransferToServer(x_literal); + auto y_global_or = client->TransferToServer(y_literal); + auto unused_global_or = client->TransferToServer(y_literal); + TF_EXPECT_OK(x_global_or.status()); + TF_EXPECT_OK(y_global_or.status()); + TF_EXPECT_OK(unused_global_or.status()); + std::unique_ptr<xla::GlobalData> x_global = + std::move(x_global_or.ValueOrDie()); + std::unique_ptr<xla::GlobalData> y_global = + std::move(y_global_or.ValueOrDie()); + std::unique_ptr<xla::GlobalData> unused_global = + std::move(unused_global_or.ValueOrDie()); + + // Execute and check result. + auto result_or = client->ExecuteAndTransfer( + computation, {x_global.get(), y_global.get(), unused_global.get()}); + TF_EXPECT_OK(result_or.status()); + xla::Literal result = std::move(result_or.ValueOrDie()); + EXPECT_EQ("(\ns32[] 42\n)", result.ToString()); +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h index 04d9086ce4c..550f562a0e1 100644 --- a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h +++ b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h @@ -172,13 +172,16 @@ class XlaCompiledCpuFunction { // called for each positional argument, in order to set the argument buffers.
// // Allocated memory must be aligned to the size specified by - // tensorflow::tfcompile::runtime::kAlign. If possible, use the functions in + // xla::cpu_function_runtime::kMinAlign. If possible, use the functions in // tensorflow/compiler/tf2xla/cpu_function_runtime.h to ensure correct // alignment. // // Aliasing of argument and result buffers is not allowed, and results in // undefined behavior. void set_arg_data(size_t index, const void* data) { + assert((arg_size(index) < xla::cpu_function_runtime::kMinAlign || + (uintptr_t)data % xla::cpu_function_runtime::kMinAlign == 0) && + "Underaligned pointer!"); // The const_cast is safe because the generated code does not write to arg // buffers. // diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc index 85f2d5c1fc6..3d6083621f4 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler.cc +++ b/tensorflow/compiler/tf2xla/xla_compiler.cc @@ -39,12 +39,12 @@ limitations under the License. #include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/common_runtime/executor.h" #include "tensorflow/core/common_runtime/function.h" +#include "tensorflow/core/common_runtime/graph_constructor.h" #include "tensorflow/core/common_runtime/graph_optimizer.h" #include "tensorflow/core/framework/attr_value_util.h" #include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/node_def_util.h" #include "tensorflow/core/framework/types.h" -#include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/graph/node_builder.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/gtl/cleanup.h" @@ -621,6 +621,7 @@ std::unique_ptr XlaCompiler::GetGraph(const FunctionBody* fbody) { graph_optimizer_options.cf_consider_fn = cf_consider_fn; graph_optimizer_options.inline_multi_device_functions = true; graph_optimizer_options.inline_impl_selection_group_functions = true; + graph_optimizer_options.inline_with_single_device_body_placer = true; optimizer.Optimize(flib_runtime_, flib_runtime_->env(), /*device=*/nullptr, &graph, graph_optimizer_options); diff --git a/tensorflow/compiler/tf2xla/xla_compiler_test.cc b/tensorflow/compiler/tf2xla/xla_compiler_test.cc index 76780167187..4f1b6c8e7a9 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler_test.cc +++ b/tensorflow/compiler/tf2xla/xla_compiler_test.cc @@ -39,6 +39,7 @@ limitations under the License. #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/core/common_runtime/function.h" +#include "tensorflow/core/common_runtime/graph_constructor.h" #include "tensorflow/core/framework/common_shape_fns.h" #include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/function.pb.h" @@ -50,7 +51,6 @@ limitations under the License. #include "tensorflow/core/framework/tensor_testutil.h" #include "tensorflow/core/graph/algorithm.h" #include "tensorflow/core/graph/graph.h" -#include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/public/version.h" @@ -463,6 +463,27 @@ TEST_F(XlaCompilerTest, TransposeVariables) { xla::ShapeUtil::MakeTupleShape({transposed, transposed})); } +// Unranked fake param returns a 0 shaped tensor. 
+TEST_F(XlaCompilerTest, UnrankedFakeParam) { + Scope scope = Scope::NewRootScope().ExitOnError(); + PartialTensorShape shape; + auto a = ops::FakeParam(scope, DT_INT32, shape); + auto ret = ops::_Retval(scope.WithOpName("D"), a, 0); + std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global())); + TF_ASSERT_OK(scope.ToGraph(graph.get())); + + // Compiles the graph. + XlaCompiler compiler(DefaultOptions()); + + XlaCompiler::CompilationResult result; + TF_ASSERT_OK(compiler.CompileGraph(XlaCompiler::CompileOptions(), "compile", + std::move(graph), {}, &result)); + // Check that the unranked parameter is lowered to a size-0 r1 tensor. + EXPECT_EQ(result.xla_output_shape, + xla::ShapeUtil::MakeTupleShape( + {xla::ShapeUtil::MakeShape(xla::S32, {0})})); +} + // Tests that the compiler doesn't reorder the parameters. TEST_F(XlaCompilerTest, MixedOrderArguments) { for (bool swap_order : {false, true}) { diff --git a/tensorflow/compiler/tf2xla/xla_expression.cc b/tensorflow/compiler/tf2xla/xla_expression.cc index 0aa139ce4f0..49f108ed6c8 100644 --- a/tensorflow/compiler/tf2xla/xla_expression.cc +++ b/tensorflow/compiler/tf2xla/xla_expression.cc @@ -121,6 +121,9 @@ xla::StatusOr<absl::optional<Tensor>> XlaExpression::ResolveConstant( handle().builder()->IsConstant(handle())); if (!is_constant) return {absl::nullopt}; + if (!client) + return errors::InvalidArgument("client is required to resolve constant"); + TF_ASSIGN_OR_RETURN(xla::XlaComputation constant_graph, handle().builder()->BuildConstantSubGraph( handle(), dynamic_dimension_is_minus_one)); diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.cc b/tensorflow/compiler/tf2xla/xla_op_kernel.cc index a394de1a9e8..2c6edf5389e 100644 --- a/tensorflow/compiler/tf2xla/xla_op_kernel.cc +++ b/tensorflow/compiler/tf2xla/xla_op_kernel.cc @@ -175,8 +175,9 @@ Status XlaOpKernelContext::ConstantInputReshaped( int index, absl::Span<const int64> new_dims, xla::Literal* constant_literal) { XlaExpression e = InputExpression(index); + auto* client = compiler() ? compiler()->client() : nullptr; xla::StatusOr<absl::optional<Tensor>> constant_or_status = - e.ResolveConstant(compiler()->client(), dynamic_dimension_is_minus_one_); + e.ResolveConstant(client, dynamic_dimension_is_minus_one_); if (!constant_or_status.ok()) { Status status = constant_or_status.status(); errors::AppendToMessage(&status, "while evaluating input ", index, " of ", diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.h b/tensorflow/compiler/tf2xla/xla_op_kernel.h index 8a384399e19..6987b6fbb98 100644 --- a/tensorflow/compiler/tf2xla/xla_op_kernel.h +++ b/tensorflow/compiler/tf2xla/xla_op_kernel.h @@ -217,6 +217,8 @@ class XlaOpKernelContext { return dynamic_dimension_is_minus_one_; } + bool is_dynamic_dimension(int64 dim_size) { return dim_size == -1; } + // Reads the current value of the resource variable referred to by input // `index`. If `shape` is not nullptr, sets `*shape` to the shape of the // variable.
Returns an error if the variable has not been initialized, or if diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD index a2993058321..45f49cee328 100644 --- a/tensorflow/compiler/xla/BUILD +++ b/tensorflow/compiler/xla/BUILD @@ -17,7 +17,6 @@ package_group( "//tensorflow/compiler/...", "//tensorflow/python/tpu/...", "//third_party/py/jax/...", - "//third_party/tf_runtime/tools/tf_kernel_gen/...", ], ) @@ -332,6 +331,7 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:regexp_internal", + "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/base:core_headers", "@com_google_absl//absl/container:inlined_vector", "@com_google_absl//absl/strings", @@ -449,6 +449,7 @@ cc_library( "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/types:optional", "@com_google_absl//absl/types:span", ], ) diff --git a/tensorflow/compiler/xla/client/executable_build_options.cc b/tensorflow/compiler/xla/client/executable_build_options.cc index cd52e2f5e45..404f9eb7519 100644 --- a/tensorflow/compiler/xla/client/executable_build_options.cc +++ b/tensorflow/compiler/xla/client/executable_build_options.cc @@ -70,6 +70,12 @@ ExecutableBuildOptions& ExecutableBuildOptions::set_num_partitions( return *this; } +ExecutableBuildOptions& ExecutableBuildOptions::set_use_spmd_partitioning( + bool use_spmd_partitioning) { + use_spmd_partitioning_ = use_spmd_partitioning; + return *this; +} + ExecutableBuildOptions& ExecutableBuildOptions::set_device_assignment( const DeviceAssignment& device_assignment) { device_assignment_ = device_assignment; diff --git a/tensorflow/compiler/xla/client/executable_build_options.h b/tensorflow/compiler/xla/client/executable_build_options.h index 360ad0260df..9a7fdd974b1 100644 --- a/tensorflow/compiler/xla/client/executable_build_options.h +++ b/tensorflow/compiler/xla/client/executable_build_options.h @@ -77,6 +77,11 @@ class ExecutableBuildOptions { int num_partitions() const { return num_partitions_; } ExecutableBuildOptions& set_num_partitions(int num_partitions); + // Indicates whether to use SPMD (true) or MPMD (false) partitioning when + // num_partitions > 1 and XLA is requested to partition the input program. + bool use_spmd_partitioning() const { return use_spmd_partitioning_; } + ExecutableBuildOptions& set_use_spmd_partitioning(bool use_spmd_partitioning); + // If set, this specifies a static device assignment for the computation. 
// Otherwise, the computation will be compiled generically and can be run with // any device assignment compatible with the computation's replica and @@ -104,6 +109,7 @@ class ExecutableBuildOptions { se::DeviceMemoryAllocator* device_allocator_ = nullptr; int num_replicas_ = 1; int num_partitions_ = 1; + bool use_spmd_partitioning_ = false; absl::optional device_assignment_; bool alias_passthrough_params_ = false; }; diff --git a/tensorflow/compiler/xla/client/lib/arithmetic.cc b/tensorflow/compiler/xla/client/lib/arithmetic.cc index a24f110fd7a..20d9930341f 100644 --- a/tensorflow/compiler/xla/client/lib/arithmetic.cc +++ b/tensorflow/compiler/xla/client/lib/arithmetic.cc @@ -114,7 +114,8 @@ namespace { XlaComputation CreateMinMaxComputation(XlaBuilder* outer_builder, PrimitiveType value_type, - PrimitiveType index_type, bool is_min) { + PrimitiveType index_type, bool is_min, + bool stable, bool tie_low) { auto sub_builder = outer_builder->CreateSubBuilder("minmax_func"); XlaBuilder* b = sub_builder.get(); XlaOp lhs_value = @@ -126,14 +127,21 @@ XlaComputation CreateMinMaxComputation(XlaBuilder* outer_builder, XlaOp rhs_index = Parameter(b, 3, ShapeUtil::MakeShape(index_type, {}), "rhs_index"); - auto cmp = is_min ? Le(lhs_value, rhs_value) : Ge(lhs_value, rhs_value); + XlaOp cmp = is_min ? Le(lhs_value, rhs_value) : Ge(lhs_value, rhs_value); XlaOp max = Select(cmp, lhs_value, rhs_value); XlaOp arg_max = Select(cmp, lhs_index, rhs_index); + if (stable) { + XlaOp eq = Eq(lhs_value, rhs_value); + XlaOp tie_id = + tie_low ? Min(lhs_index, rhs_index) : Max(lhs_index, rhs_index); + arg_max = Select(eq, tie_id, arg_max); + } Tuple(b, {max, arg_max}); return b->Build().ConsumeValueOrDie(); } -XlaOp ArgMinMax(XlaOp input, PrimitiveType output_type, int axis, bool is_min) { +XlaOp ArgMinMax(XlaOp input, PrimitiveType output_type, int axis, bool is_min, + bool stable, bool tie_low) { XlaBuilder* builder = input.builder(); return builder->ReportErrorOrReturn([&]() -> StatusOr { TF_ASSIGN_OR_RETURN(Shape input_shape, builder->GetShape(input)); @@ -150,8 +158,9 @@ XlaOp ArgMinMax(XlaOp input, PrimitiveType output_type, int axis, bool is_min) { iota_shape.set_element_type(index_type); XlaOp iota = Iota(builder, iota_shape, axis); - XlaComputation reducer = CreateMinMaxComputation( - builder, input_shape.element_type(), index_type, is_min); + XlaComputation reducer = + CreateMinMaxComputation(builder, input_shape.element_type(), index_type, + is_min, stable, tie_low); XlaOp max_argmax = Reduce(builder, {input, iota}, {value_init_value, index_init_value}, reducer, /*dimensions_to_reduce=*/{axis}); @@ -164,7 +173,7 @@ XlaOp ArgMinMax(XlaOp input, PrimitiveType output_type, int axis, bool is_min) { } XlaOp ArgMinMaxTwoPass(XlaOp input, PrimitiveType output_type, int axis, - bool is_min) { + bool is_min, bool tie_low) { XlaBuilder* builder = input.builder(); return builder->ReportErrorOrReturn([&]() -> StatusOr { TF_ASSIGN_OR_RETURN(Shape input_shape, builder->GetShape(input)); @@ -180,38 +189,51 @@ XlaOp ArgMinMaxTwoPass(XlaOp input, PrimitiveType output_type, int axis, XlaOp iota = Iota( builder, ShapeUtil::ChangeElementType(input_shape, output_type), axis); - XlaOp input_max = Reduce(input, init_value, reducer, - /*dimensions_to_reduce=*/{axis}); + XlaOp reduced_input = Reduce(input, init_value, reducer, + /*dimensions_to_reduce=*/{axis}); std::vector broadcast_dims(input_shape.rank() - 1); std::iota(broadcast_dims.begin(), broadcast_dims.begin() + axis, 0); std::iota(broadcast_dims.begin() + axis, 
broadcast_dims.end(), axis + 1); - - XlaOp max_idx = MaxValue(builder, output_type); - XlaOp select_mask = Select(Eq(input, input_max, broadcast_dims), - /*on_true=*/iota, - /*on_false=*/ - max_idx); - - return Reduce(select_mask, max_idx, - CreateScalarMinComputation(output_type, builder), - /*dimensions_to_reduce=*/{axis}); + if (tie_low) { + XlaOp max_idx = MaxValue(builder, output_type); + XlaOp select_mask = Select(Eq(input, reduced_input, broadcast_dims), + /*on_true=*/iota, + /*on_false=*/ + max_idx); + return Reduce(select_mask, max_idx, + CreateScalarMinComputation(output_type, builder), + /*dimensions_to_reduce=*/{axis}); + } else { + XlaOp min_idx = MinValue(builder, output_type); + XlaOp select_mask = Select(Eq(input, reduced_input, broadcast_dims), + /*on_true=*/iota, + /*on_false=*/ + min_idx); + return Reduce(select_mask, min_idx, + CreateScalarMaxComputation(output_type, builder), + /*dimensions_to_reduce=*/{axis}); + } }); } } // namespace -XlaOp ArgMax(XlaOp input, PrimitiveType output_type, int axis) { - return ArgMinMax(input, output_type, axis, /*is_min=*/false); +XlaOp ArgMax(XlaOp input, PrimitiveType output_type, int axis, bool stable, + bool tie_low) { + return ArgMinMax(input, output_type, axis, /*is_min=*/false, stable, tie_low); } -XlaOp ArgMin(XlaOp input, PrimitiveType output_type, int axis) { - return ArgMinMax(input, output_type, axis, /*is_min=*/true); +XlaOp ArgMin(XlaOp input, PrimitiveType output_type, int axis, bool stable, + bool tie_low) { + return ArgMinMax(input, output_type, axis, /*is_min=*/true, stable, tie_low); } -XlaOp ArgMaxTwoPass(XlaOp input, PrimitiveType output_type, int axis) { - return ArgMinMaxTwoPass(input, output_type, axis, /*is_min=*/false); +XlaOp ArgMaxTwoPass(XlaOp input, PrimitiveType output_type, int axis, + bool tie_low) { + return ArgMinMaxTwoPass(input, output_type, axis, /*is_min=*/false, tie_low); } -XlaOp ArgMinTwoPass(XlaOp input, PrimitiveType output_type, int axis) { - return ArgMinMaxTwoPass(input, output_type, axis, /*is_min=*/true); +XlaOp ArgMinTwoPass(XlaOp input, PrimitiveType output_type, int axis, + bool tie_low) { + return ArgMinMaxTwoPass(input, output_type, axis, /*is_min=*/true, tie_low); } } // namespace xla diff --git a/tensorflow/compiler/xla/client/lib/arithmetic.h b/tensorflow/compiler/xla/client/lib/arithmetic.h index 350dcc5531d..2712b2aa191 100644 --- a/tensorflow/compiler/xla/client/lib/arithmetic.h +++ b/tensorflow/compiler/xla/client/lib/arithmetic.h @@ -77,14 +77,24 @@ XlaComputation CreateScalarIdentityWithZeroComputation(PrimitiveType type, XlaOp Any(XlaOp predicates); // Returns the argmax of `input` along `axis`. `output_type` is the type to -// use for the output. -XlaOp ArgMax(XlaOp input, PrimitiveType output_type, int axis); -XlaOp ArgMaxTwoPass(XlaOp input, PrimitiveType output_type, int axis); +// use for the output. The `tie_low` argument controls which index is returned +// in case of ties: if `true` (the default) the lowest tied index is returned, +// otherwise the highest. `tie_low` only takes effect if `stable` is true or +// when using ArgMaxTwoPass. +XlaOp ArgMax(XlaOp input, PrimitiveType output_type, int axis, + bool stable = false, bool tie_low = true); +XlaOp ArgMaxTwoPass(XlaOp input, PrimitiveType output_type, int axis, + bool tie_low = true); // Returns the argmin of `input` along `axis`. `output_type` is the type to -// use for the output.
-XlaOp ArgMin(XlaOp input, PrimitiveType output_type, int axis); -XlaOp ArgMinTwoPass(XlaOp input, PrimitiveType output_type, int axis); +// use for the output. The `tie_low` argument controls which index is returned +// in case of ties: if `true` (the default) the lowest tied index is returned, +// otherwise the highest. `tie_low` only takes effect if `stable` is true or +// when using ArgMinTwoPass. +XlaOp ArgMin(XlaOp input, PrimitiveType output_type, int axis, + bool stable = false, bool tie_low = true); +XlaOp ArgMinTwoPass(XlaOp input, PrimitiveType output_type, int axis, + bool tie_low = true); } // namespace xla diff --git a/tensorflow/compiler/xla/client/lib/arithmetic_test.cc b/tensorflow/compiler/xla/client/lib/arithmetic_test.cc index d3ff14d8a9b..842b06348ed 100644 --- a/tensorflow/compiler/xla/client/lib/arithmetic_test.cc +++ b/tensorflow/compiler/xla/client/lib/arithmetic_test.cc @@ -33,14 +33,16 @@ class ArithmeticTest : public ClientLibraryTestBase { public: template void TestArgMin(std::initializer_list> input, - absl::Span expected_output, int axis) { - return TestArgMinMax(input, expected_output, axis, /*is_min=*/true); + absl::Span expected_output, int axis, + bool tie_low) { + TestArgMinMax(input, expected_output, axis, /*is_min=*/true, tie_low); } template void TestArgMax(std::initializer_list> input, - absl::Span expected_output, int axis) { - return TestArgMinMax(input, expected_output, axis, /*is_min=*/false); + absl::Span expected_output, int axis, + bool tie_low) { + TestArgMinMax(input, expected_output, axis, /*is_min=*/false, tie_low); } private: @@ -48,46 +50,63 @@ class ArithmeticTest : public ClientLibraryTestBase { template void TestArgMinMax( std::initializer_list> input, - absl::Span expected_output, int axis, bool is_min) { + absl::Span expected_output, int axis, bool is_min, + bool tie_low) { if (is_min) { - TestArgMinMaxImpl(input, expected_output, axis, &ArgMin); - TestArgMinMaxImpl(input, expected_output, axis, &ArgMinTwoPass); + TestArgMinMaxImpl( + input, expected_output, [=](XlaOp op, PrimitiveType type) { + return ArgMin(op, type, axis, /*stable=*/true, tie_low); + }); + TestArgMinMaxImpl(input, expected_output, + [=](XlaOp op, PrimitiveType type) { + return ArgMinTwoPass(op, type, axis, tie_low); + }); } else { - TestArgMinMaxImpl(input, expected_output, axis, &ArgMax); - TestArgMinMaxImpl(input, expected_output, axis, &ArgMaxTwoPass); + TestArgMinMaxImpl( + input, expected_output, [=](XlaOp op, PrimitiveType type) { + return ArgMax(op, type, axis, /*stable=*/true, tie_low); + }); + TestArgMinMaxImpl(input, expected_output, + [=](XlaOp op, PrimitiveType type) { + return ArgMaxTwoPass(op, type, axis, tie_low); + }); } } template void TestArgMinMaxImpl( std::initializer_list> input, - absl::Span expected_output, int axis, - std::function MinMaxImpl) { + absl::Span expected_output, + std::function MinMaxImpl) { XlaBuilder builder(TestName()); XlaOp x = ConstantR2(&builder, input); - MinMaxImpl(x, primitive_util::NativeToPrimitiveType(), axis); + MinMaxImpl(x, primitive_util::NativeToPrimitiveType()); ComputeAndCompareR1(&builder, expected_output, {}); } }; XLA_TEST_F(ArithmeticTest, ArgMinR2Axis0) { TestArgMin({{1, 7, 4}, {6, 3, 5}, {8, 3, 3}}, {0, 1, 2}, - /*axis=*/0); + /*axis=*/0, /*tie_low=*/true); + TestArgMin({{1, 7, 4}, {6, 3, 5}, {8, 3, 3}}, {0, 2, 2}, + /*axis=*/0, /*tie_low=*/false); } XLA_TEST_F(ArithmeticTest, ArgMinR2Axis1) { TestArgMin({{1, 7, 4}, {6, 3, 5}, {8, 3, 3}}, {0, 1, 1}, - /*axis=*/1,
/*tie_low=*/true); + TestArgMin({{1, 7, 4}, {6, 3, 5}, {8, 3, 3}}, {0, 1, 2}, + /*axis=*/1, /*tie_low=*/false); } XLA_TEST_F(ArithmeticTest, ArgMaxR2Axis0) { TestArgMax({{1, 7, 4}, {6, 3, 5}, {8, 3, 3}}, {2, 0, 1}, - /*axis=*/0); + /*axis=*/0, /*tie_low=*/true); } XLA_TEST_F(ArithmeticTest, ArgMaxR2Axis1) { TestArgMax({{1, 7, 4}, {6, 3, 5}, {8, 3, 3}}, {1, 0, 0}, - /*axis=*/1); + /*axis=*/1, /*tie_low=*/true); } } // namespace diff --git a/tensorflow/compiler/xla/client/lib/math.cc b/tensorflow/compiler/xla/client/lib/math.cc index 701479614aa..f2ee94a0159 100644 --- a/tensorflow/compiler/xla/client/lib/math.cc +++ b/tensorflow/compiler/xla/client/lib/math.cc @@ -922,7 +922,6 @@ XlaOp Igamma(XlaOp a, XlaOp x) { ScalarLike(a, 1) - IgammacContinuedFraction( ax, x, a, And(enabled, use_igammac), type), IgammaSeries(ax, x, a, And(enabled, Not(use_igammac)), type)); - output = Select(underflow, ZerosLike(output), output); output = Select(x_is_zero, ZerosLike(output), output); output = Select(Or(domain_error, is_nan), FullLike(a, nan), output); return output; @@ -968,7 +967,6 @@ XlaOp IgammaGradA(XlaOp a, XlaOp x) { ax, x, a, And(enabled, use_igammac), type), IgammaSeries( ax, x, a, And(enabled, Not(use_igammac)), type)); - output = Select(underflow, ZerosLike(output), output); output = Select(x_is_zero, ZerosLike(output), output); output = Select(Or(domain_error, is_nan), FullLike(a, nan), output); return output; @@ -1016,7 +1014,6 @@ XlaOp RandomGammaGrad(XlaOp a, XlaOp x) { ax, x, a, And(enabled, use_igammac), type), IgammaSeries( ax, x, a, And(enabled, Not(use_igammac)), type)); - output = Select(underflow, ZerosLike(output), output); output = Select(x_is_zero, ZerosLike(output), output); output = Select(Or(domain_error, is_nan), FullLike(a, nan), output); return output; @@ -1061,8 +1058,7 @@ XlaOp Igammac(XlaOp a, XlaOp x) { ax, x, a, And(enabled, use_igamma), type), IgammacContinuedFraction( ax, x, a, And(enabled, Not(use_igamma)), type)); - return Select(underflow, ZerosLike(a), - Select(out_of_range, FullLike(a, 1), result)); + return Select(out_of_range, FullLike(a, 1), result); }; return b.ReportErrorOrReturn([&]() -> StatusOr { TF_ASSIGN_OR_RETURN(auto a_shape, b.GetShape(a)); diff --git a/tensorflow/compiler/xla/client/lib/math_test.cc b/tensorflow/compiler/xla/client/lib/math_test.cc index 32796dd8d70..cb79b2ef7db 100644 --- a/tensorflow/compiler/xla/client/lib/math_test.cc +++ b/tensorflow/compiler/xla/client/lib/math_test.cc @@ -236,6 +236,19 @@ XLA_TEST_F(MathTest, SqrtF32) { ComputeAndCompareR0(&builder, 0.0f, {zero_data.get()}, error_spec_); } +XLA_TEST_F(MathTest, SqrtF64) { + XlaBuilder builder(TestName()); + Literal zero_literal = LiteralUtil::Zero(PrimitiveType::F64); + + std::unique_ptr zero_data = + client_->TransferToServer(zero_literal).ConsumeValueOrDie(); + + XlaOp zero = Parameter(&builder, 0, zero_literal.shape(), "zero"); + Sqrt(zero); + + ComputeAndCompareR0(&builder, 0.0f, {zero_data.get()}, error_spec_); +} + #ifndef XLA_BACKEND_DOES_NOT_SUPPORT_FLOAT64 XLA_TEST_F(MathTest, ErfInvF64) { XlaBuilder builder(TestName()); @@ -298,6 +311,15 @@ XLA_TEST_F(MathTest, SqrtSixValues) { ComputeAndCompareR1(&builder, expected, {}, error_spec_); } +XLA_TEST_F(MathTest, CbrtSixValues) { + XlaBuilder builder(TestName()); + auto x = ConstantR1(&builder, {8.0, 1.0, 4096.0, -64.0, 1.728, 1331}); + Cbrt(x); + + std::vector expected = {2, 1, 16, -4, 1.2, 11}; + ComputeAndCompareR1(&builder, expected, {}, ErrorSpec(0.001)); +} + XLA_TEST_F(MathTest, SinhSmallValues) { XlaBuilder 
builder(TestName()); auto x = ConstantR1(&builder, {1e-3, 1e-5, 1e-7, 1e-9, 1e-11}); diff --git a/tensorflow/compiler/xla/client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_builder.cc index 807cbe9bd5d..58365c0f498 100644 --- a/tensorflow/compiler/xla/client/xla_builder.cc +++ b/tensorflow/compiler/xla/client/xla_builder.cc @@ -822,23 +822,29 @@ XlaOp XlaBuilder::Slice(XlaOp operand, absl::Span start_indices, absl::Span limit_indices, absl::Span strides) { return ReportErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand)); TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferSliceShape( *operand_shape, start_indices, limit_indices, strides)); - *instr.mutable_shape() = shape.ToProto(); - for (int i = 0; i < start_indices.size(); i++) { - auto* slice_config = instr.add_slice_dimensions(); - slice_config->set_start(start_indices[i]); - slice_config->set_limit(limit_indices[i]); - slice_config->set_stride(strides[i]); - } - - return AddInstruction(std::move(instr), HloOpcode::kSlice, {operand}); + return SliceInternal(shape, operand, start_indices, limit_indices, strides); }); } +StatusOr XlaBuilder::SliceInternal(const Shape& shape, XlaOp operand, + absl::Span start_indices, + absl::Span limit_indices, + absl::Span strides) { + HloInstructionProto instr; + *instr.mutable_shape() = shape.ToProto(); + for (int i = 0; i < start_indices.size(); i++) { + auto* slice_config = instr.add_slice_dimensions(); + slice_config->set_start(start_indices[i]); + slice_config->set_limit(limit_indices[i]); + slice_config->set_stride(strides[i]); + } + return AddInstruction(std::move(instr), HloOpcode::kSlice, {operand}); +} + XlaOp XlaBuilder::SliceInDim(XlaOp operand, int64 start_index, int64 limit_index, int64 stride, int64 dimno) { return ReportErrorOrReturn([&]() -> StatusOr { @@ -854,34 +860,10 @@ XlaOp XlaBuilder::SliceInDim(XlaOp operand, int64 start_index, }); } -XlaOp XlaBuilder::DynamicSlice(XlaOp operand, XlaOp start_indices, - absl::Span slice_sizes) { - return ReportErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; - - TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand)); - TF_ASSIGN_OR_RETURN(const Shape* start_indices_shape, - GetShapePtr(start_indices)); - TF_ASSIGN_OR_RETURN( - Shape shape, ShapeInference::InferDynamicSliceShape( - *operand_shape, {*start_indices_shape}, slice_sizes)); - *instr.mutable_shape() = shape.ToProto(); - - for (int64 size : slice_sizes) { - instr.add_dynamic_slice_sizes(size); - } - - return AddInstruction(std::move(instr), HloOpcode::kDynamicSlice, - {operand, start_indices}); - }); -} - XlaOp XlaBuilder::DynamicSlice(XlaOp operand, absl::Span start_indices, absl::Span slice_sizes) { return ReportErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; - TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand)); std::vector start_indices_shape_ptrs; TF_ASSIGN_OR_RETURN(const auto& start_indices_shapes, @@ -892,43 +874,28 @@ XlaOp XlaBuilder::DynamicSlice(XlaOp operand, TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferDynamicSliceShape( *operand_shape, start_indices_shapes, slice_sizes)); - *instr.mutable_shape() = shape.ToProto(); - - for (int64 size : slice_sizes) { - instr.add_dynamic_slice_sizes(size); - } - - std::vector operands = {operand}; - operands.insert(operands.end(), start_indices.begin(), start_indices.end()); - return AddInstruction(std::move(instr), HloOpcode::kDynamicSlice, operands); + return 
DynamicSliceInternal(shape, operand, start_indices, slice_sizes); }); } -XlaOp XlaBuilder::DynamicUpdateSlice(XlaOp operand, XlaOp update, - XlaOp start_indices) { - return ReportErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; +StatusOr XlaBuilder::DynamicSliceInternal( + const Shape& shape, XlaOp operand, absl::Span start_indices, + absl::Span slice_sizes) { + HloInstructionProto instr; + *instr.mutable_shape() = shape.ToProto(); - TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand)); - TF_ASSIGN_OR_RETURN(const Shape* update_shape, GetShapePtr(update)); - TF_ASSIGN_OR_RETURN(const Shape* start_indices_shape, - GetShapePtr(start_indices)); - TF_ASSIGN_OR_RETURN( - Shape shape, - ShapeInference::InferDynamicUpdateSliceShape( - *operand_shape, *update_shape, {*start_indices_shape})); - *instr.mutable_shape() = shape.ToProto(); + for (int64 size : slice_sizes) { + instr.add_dynamic_slice_sizes(size); + } - return AddInstruction(std::move(instr), HloOpcode::kDynamicUpdateSlice, - {operand, update, start_indices}); - }); + std::vector operands = {operand}; + operands.insert(operands.end(), start_indices.begin(), start_indices.end()); + return AddInstruction(std::move(instr), HloOpcode::kDynamicSlice, operands); } XlaOp XlaBuilder::DynamicUpdateSlice(XlaOp operand, XlaOp update, absl::Span start_indices) { return ReportErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; - TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand)); TF_ASSIGN_OR_RETURN(const Shape* update_shape, GetShapePtr(update)); std::vector start_indices_shape_ptrs; @@ -940,53 +907,68 @@ XlaOp XlaBuilder::DynamicUpdateSlice(XlaOp operand, XlaOp update, TF_ASSIGN_OR_RETURN( Shape shape, ShapeInference::InferDynamicUpdateSliceShape( *operand_shape, *update_shape, start_indices_shapes)); - *instr.mutable_shape() = shape.ToProto(); - - std::vector operands = {operand, update}; - operands.insert(operands.end(), start_indices.begin(), start_indices.end()); - return AddInstruction(std::move(instr), HloOpcode::kDynamicUpdateSlice, - operands); + return DynamicUpdateSliceInternal(shape, operand, update, start_indices); }); } +StatusOr XlaBuilder::DynamicUpdateSliceInternal( + const Shape& shape, XlaOp operand, XlaOp update, + absl::Span start_indices) { + HloInstructionProto instr; + *instr.mutable_shape() = shape.ToProto(); + + std::vector operands = {operand, update}; + operands.insert(operands.end(), start_indices.begin(), start_indices.end()); + return AddInstruction(std::move(instr), HloOpcode::kDynamicUpdateSlice, + operands); +} + XlaOp XlaBuilder::ConcatInDim(absl::Span operands, int64 dimension) { return ReportErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; - std::vector operand_shape_ptrs; TF_ASSIGN_OR_RETURN(const auto& operand_shapes, GetOperandShapes(operands)); absl::c_transform(operand_shapes, std::back_inserter(operand_shape_ptrs), [](const Shape& shape) { return &shape; }); TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferConcatOpShape( operand_shape_ptrs, dimension)); - *instr.mutable_shape() = shape.ToProto(); - - instr.add_dimensions(dimension); - - return AddInstruction(std::move(instr), HloOpcode::kConcatenate, operands); + return ConcatInDimInternal(shape, operands, dimension); }); } +StatusOr XlaBuilder::ConcatInDimInternal( + const Shape& shape, absl::Span operands, int64 dimension) { + HloInstructionProto instr; + *instr.mutable_shape() = shape.ToProto(); + + instr.add_dimensions(dimension); + + return AddInstruction(std::move(instr), 
HloOpcode::kConcatenate, operands); +} + XlaOp XlaBuilder::Pad(XlaOp operand, XlaOp padding_value, const PaddingConfig& padding_config) { return ReportErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; - TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand)); TF_ASSIGN_OR_RETURN(const Shape* padding_value_shape, GetShapePtr(padding_value)); TF_ASSIGN_OR_RETURN( Shape shape, ShapeInference::InferPadShape( *operand_shape, *padding_value_shape, padding_config)); - *instr.mutable_shape() = shape.ToProto(); - *instr.mutable_padding_config() = padding_config; - - return AddInstruction(std::move(instr), HloOpcode::kPad, - {operand, padding_value}); + return PadInternal(shape, operand, padding_value, padding_config); }); } +StatusOr XlaBuilder::PadInternal(const Shape& shape, XlaOp operand, + XlaOp padding_value, + const PaddingConfig& padding_config) { + HloInstructionProto instr; + *instr.mutable_shape() = shape.ToProto(); + *instr.mutable_padding_config() = padding_config; + return AddInstruction(std::move(instr), HloOpcode::kPad, + {operand, padding_value}); +} + XlaOp XlaBuilder::Reshape(XlaOp operand, absl::Span dimensions, absl::Span new_sizes, int64 inferred_dimension) { @@ -1080,7 +1062,6 @@ XlaOp XlaBuilder::Select(XlaOp pred, XlaOp on_true, XlaOp on_false) { XlaOp XlaBuilder::Tuple(absl::Span elements) { return ReportErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; std::vector operand_shape_ptrs; TF_ASSIGN_OR_RETURN(const auto& operand_shapes, GetOperandShapes(elements)); absl::c_transform(operand_shapes, std::back_inserter(operand_shape_ptrs), @@ -1088,14 +1069,19 @@ XlaOp XlaBuilder::Tuple(absl::Span elements) { TF_ASSIGN_OR_RETURN(const Shape shape, ShapeInference::InferVariadicOpShape( HloOpcode::kTuple, operand_shape_ptrs)); - *instr.mutable_shape() = shape.ToProto(); - return AddInstruction(std::move(instr), HloOpcode::kTuple, elements); + return TupleInternal(shape, elements); }); } +StatusOr XlaBuilder::TupleInternal(const Shape& shape, + absl::Span elements) { + HloInstructionProto instr; + *instr.mutable_shape() = shape.ToProto(); + return AddInstruction(std::move(instr), HloOpcode::kTuple, elements); +} + XlaOp XlaBuilder::GetTupleElement(XlaOp tuple_data, int64 index) { return ReportErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; TF_ASSIGN_OR_RETURN(const Shape* tuple_shape, GetShapePtr(tuple_data)); if (!tuple_shape->IsTuple()) { return InvalidArgument( @@ -1107,16 +1093,22 @@ XlaOp XlaBuilder::GetTupleElement(XlaOp tuple_data, int64 index) { "GetTupleElement() index (%d) out of range for tuple shape %s", index, ShapeUtil::HumanString(*tuple_shape)); } - *instr.mutable_shape() = - ShapeUtil::GetTupleElementShape(*tuple_shape, index).ToProto(); - - instr.set_tuple_index(index); - - return AddInstruction(std::move(instr), HloOpcode::kGetTupleElement, - {tuple_data}); + return GetTupleElementInternal( + ShapeUtil::GetTupleElementShape(*tuple_shape, index), tuple_data, + index); }); } +StatusOr XlaBuilder::GetTupleElementInternal(const Shape& shape, + XlaOp tuple_data, + int64 index) { + HloInstructionProto instr; + *instr.mutable_shape() = shape.ToProto(); + instr.set_tuple_index(index); + return AddInstruction(std::move(instr), HloOpcode::kGetTupleElement, + {tuple_data}); +} + XlaOp XlaBuilder::Dot(XlaOp lhs, XlaOp rhs, const PrecisionConfig* precision_config) { return ReportErrorOrReturn([&]() -> StatusOr { @@ -1134,21 +1126,29 @@ XlaOp XlaBuilder::DotGeneral(XlaOp lhs, XlaOp rhs, const DotDimensionNumbers& 
dimension_numbers, const PrecisionConfig* precision_config) { return ReportErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; TF_ASSIGN_OR_RETURN(const Shape* lhs_shape, GetShapePtr(lhs)); TF_ASSIGN_OR_RETURN(const Shape* rhs_shape, GetShapePtr(rhs)); TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferDotOpShape(*lhs_shape, *rhs_shape, dimension_numbers)); - *instr.mutable_shape() = shape.ToProto(); - *instr.mutable_dot_dimension_numbers() = dimension_numbers; - if (precision_config != nullptr) { - *instr.mutable_precision_config() = *precision_config; - } - return AddInstruction(std::move(instr), HloOpcode::kDot, {lhs, rhs}); + return DotGeneralInternal(shape, lhs, rhs, dimension_numbers, + precision_config); }); } +StatusOr XlaBuilder::DotGeneralInternal( + const Shape& shape, XlaOp lhs, XlaOp rhs, + const DotDimensionNumbers& dimension_numbers, + const PrecisionConfig* precision_config) { + HloInstructionProto instr; + *instr.mutable_shape() = shape.ToProto(); + *instr.mutable_dot_dimension_numbers() = dimension_numbers; + if (precision_config != nullptr) { + *instr.mutable_precision_config() = *precision_config; + } + return AddInstruction(std::move(instr), HloOpcode::kDot, {lhs, rhs}); +} + Status XlaBuilder::VerifyConvolution( const Shape& lhs_shape, const Shape& rhs_shape, const ConvolutionDimensionNumbers& dimension_numbers) const { @@ -1269,7 +1269,6 @@ XlaOp XlaBuilder::ConvGeneralDilated( int64 feature_group_count, int64 batch_group_count, const PrecisionConfig* precision_config) { return ReportErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; TF_ASSIGN_OR_RETURN(const Shape* lhs_shape, GetShapePtr(lhs)); TF_ASSIGN_OR_RETURN(const Shape* rhs_shape, GetShapePtr(rhs)); TF_RETURN_IF_ERROR( @@ -1282,30 +1281,45 @@ XlaOp XlaBuilder::ConvGeneralDilated( window_dimensions[i] = rhs_shape->dimensions(dimension_numbers.kernel_spatial_dimensions(i)); } - TF_ASSIGN_OR_RETURN(*instr.mutable_window(), + + TF_ASSIGN_OR_RETURN(Window window, ShapeInference::InferWindowFromDimensions( window_dimensions, window_strides, padding, lhs_dilation, rhs_dilation)); - - TF_ASSIGN_OR_RETURN( - Shape shape, ShapeInference::InferConvolveShape( - *lhs_shape, *rhs_shape, feature_group_count, - batch_group_count, instr.window(), dimension_numbers)); - *instr.mutable_shape() = shape.ToProto(); - - *instr.mutable_convolution_dimension_numbers() = dimension_numbers; - instr.set_feature_group_count(feature_group_count); - instr.set_batch_group_count(batch_group_count); - - if (precision_config != nullptr) { - *instr.mutable_precision_config() = *precision_config; - } - - return AddInstruction(std::move(instr), HloOpcode::kConvolution, - {lhs, rhs}); + TF_ASSIGN_OR_RETURN(Shape shape, + ShapeInference::InferConvolveShape( + *lhs_shape, *rhs_shape, feature_group_count, + batch_group_count, window, dimension_numbers)); + return ConvGeneralDilatedInternal(shape, lhs, rhs, window, window_strides, + padding, lhs_dilation, rhs_dilation, + dimension_numbers, feature_group_count, + batch_group_count, precision_config); }); } +StatusOr XlaBuilder::ConvGeneralDilatedInternal( + const Shape& shape, XlaOp lhs, XlaOp rhs, const Window& window, + absl::Span window_strides, + absl::Span> padding, + absl::Span lhs_dilation, absl::Span rhs_dilation, + const ConvolutionDimensionNumbers& dimension_numbers, + int64 feature_group_count, int64 batch_group_count, + const PrecisionConfig* precision_config) { + HloInstructionProto instr; + *instr.mutable_shape() = shape.ToProto(); + + *instr.mutable_window() = 
window; + *instr.mutable_convolution_dimension_numbers() = dimension_numbers; + instr.set_feature_group_count(feature_group_count); + instr.set_batch_group_count(batch_group_count); + + if (precision_config != nullptr) { + *instr.mutable_precision_config() = *precision_config; + } + + return AddInstruction(std::move(instr), HloOpcode::kConvolution, {lhs, rhs}); +} + XlaOp XlaBuilder::Fft(XlaOp operand, const FftType fft_type, const absl::Span fft_length) { return ReportErrorOrReturn([&]() -> StatusOr { @@ -1399,14 +1413,11 @@ XlaOp XlaBuilder::Infeed(const Shape& shape, const string& config) { XlaOp XlaBuilder::InfeedWithToken(XlaOp token, const Shape& shape, const string& config) { return ReportErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; if (!LayoutUtil::HasLayout(shape)) { return InvalidArgument("Given shape to Infeed must have a layout"); } const Shape infeed_instruction_shape = ShapeUtil::MakeTupleShape({shape, ShapeUtil::MakeTokenShape()}); - *instr.mutable_shape() = infeed_instruction_shape.ToProto(); - instr.set_infeed_config(config); if (shape.IsArray() && sharding() && sharding()->type() == OpSharding::OTHER) { @@ -1419,11 +1430,18 @@ XlaOp XlaBuilder::InfeedWithToken(XlaOp token, const Shape& shape, return InvalidArgument( "Replicated sharding is not yet supported for infeeds"); } - - return AddInstruction(std::move(instr), HloOpcode::kInfeed, {token}); + return InfeedWithTokenInternal(infeed_instruction_shape, token, config); }); } +StatusOr XlaBuilder::InfeedWithTokenInternal( + const Shape& infeed_instruction_shape, XlaOp token, const string& config) { + HloInstructionProto instr; + *instr.mutable_shape() = infeed_instruction_shape.ToProto(); + instr.set_infeed_config(config); + return AddInstruction(std::move(instr), HloOpcode::kInfeed, {token}); +} + void XlaBuilder::Outfeed(XlaOp operand, const Shape& shape_with_layout, const string& outfeed_config) { ReportErrorOrReturn([&]() -> StatusOr { @@ -1480,10 +1498,6 @@ XlaOp XlaBuilder::OutfeedWithToken(XlaOp operand, XlaOp token, const Shape& shape_with_layout, const string& outfeed_config) { return ReportErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; - - *instr.mutable_shape() = ShapeUtil::MakeTokenShape().ToProto(); - // Check and set outfeed shape. 
if (!LayoutUtil::HasLayout(shape_with_layout)) { return InvalidArgument("Given shape to Outfeed must have a layout"); @@ -1495,15 +1509,22 @@ XlaOp XlaBuilder::OutfeedWithToken(XlaOp operand, XlaOp token, ShapeUtil::HumanStringWithLayout(shape_with_layout), ShapeUtil::HumanStringWithLayout(*operand_shape)); } - *instr.mutable_outfeed_shape() = shape_with_layout.ToProto(); - - instr.set_outfeed_config(outfeed_config); - - return AddInstruction(std::move(instr), HloOpcode::kOutfeed, - {operand, token}); + return OutfeedWithTokenInternal(operand, token, shape_with_layout, + outfeed_config); }); } +StatusOr XlaBuilder::OutfeedWithTokenInternal( + XlaOp operand, XlaOp token, const Shape& shape_with_layout, + const string& outfeed_config) { + HloInstructionProto instr; + *instr.mutable_shape() = ShapeUtil::MakeTokenShape().ToProto(); + *instr.mutable_outfeed_shape() = shape_with_layout.ToProto(); + instr.set_outfeed_config(outfeed_config); + return AddInstruction(std::move(instr), HloOpcode::kOutfeed, + {operand, token}); +} + XlaOp XlaBuilder::CreateToken() { return ReportErrorOrReturn([&]() -> StatusOr { HloInstructionProto instr; @@ -1624,18 +1645,23 @@ XlaOp XlaBuilder::CustomCall( XlaOp XlaBuilder::Transpose(XlaOp operand, absl::Span permutation) { return ReportErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand)); TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferTransposeShape( *operand_shape, permutation)); - *instr.mutable_shape() = shape.ToProto(); - for (int64 dim : permutation) { - instr.add_dimensions(dim); - } - return AddInstruction(std::move(instr), HloOpcode::kTranspose, {operand}); + return TransposeInternal(shape, operand, permutation); }); } +StatusOr XlaBuilder::TransposeInternal( + const Shape& shape, XlaOp operand, absl::Span permutation) { + HloInstructionProto instr; + *instr.mutable_shape() = shape.ToProto(); + for (int64 dim : permutation) { + instr.add_dimensions(dim); + } + return AddInstruction(std::move(instr), HloOpcode::kTranspose, {operand}); +} + XlaOp XlaBuilder::Rev(XlaOp operand, absl::Span dimensions) { return ReportErrorOrReturn([&]() -> StatusOr { HloInstructionProto instr; @@ -1748,8 +1774,6 @@ XlaOp XlaBuilder::RngOp(RandomDistribution distribution, absl::Span parameters, const Shape& shape) { return ReportErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; - // Check the number of parameters per RNG distribution. 
switch (distribution) { case RandomDistribution::RNG_NORMAL: @@ -1765,14 +1789,20 @@ XlaOp XlaBuilder::RngOp(RandomDistribution distribution, } TF_RETURN_IF_ERROR(ShapeUtil::ValidateShapeWithOptionalLayout(shape)); - *instr.mutable_shape() = shape.ToProto(); - - instr.set_distribution(distribution); - - return AddInstruction(std::move(instr), HloOpcode::kRng, parameters); + return RngOpInternal(distribution, parameters, shape); }); } +StatusOr XlaBuilder::RngOpInternal(RandomDistribution distribution, + absl::Span parameters, + const Shape& shape) { + HloInstructionProto instr; + *instr.mutable_shape() = shape.ToProto(); + instr.set_distribution(distribution); + + return AddInstruction(std::move(instr), HloOpcode::kRng, parameters); +} + XlaOp XlaBuilder::RngNormal(XlaOp mu, XlaOp sigma, const Shape& shape) { return RngOp(RandomDistribution::RNG_NORMAL, {mu, sigma}, shape); } @@ -1837,27 +1867,33 @@ XlaOp XlaBuilder::Gather(XlaOp input, XlaOp start_indices, absl::Span slice_sizes, bool indices_are_sorted) { return ReportErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; - instr.set_indices_are_sorted(indices_are_sorted); - TF_ASSIGN_OR_RETURN(const Shape* input_shape, GetShapePtr(input)); TF_ASSIGN_OR_RETURN(const Shape* start_indices_shape, GetShapePtr(start_indices)); TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferGatherShape( *input_shape, *start_indices_shape, dimension_numbers, slice_sizes)); - *instr.mutable_shape() = shape.ToProto(); - - *instr.mutable_gather_dimension_numbers() = dimension_numbers; - for (int64 bound : slice_sizes) { - instr.add_gather_slice_sizes(bound); - } - - return AddInstruction(std::move(instr), HloOpcode::kGather, - {input, start_indices}); + return GatherInternal(shape, input, start_indices, dimension_numbers, + slice_sizes, indices_are_sorted); }); } +StatusOr XlaBuilder::GatherInternal( + const Shape& shape, XlaOp input, XlaOp start_indices, + const GatherDimensionNumbers& dimension_numbers, + absl::Span slice_sizes, bool indices_are_sorted) { + HloInstructionProto instr; + instr.set_indices_are_sorted(indices_are_sorted); + *instr.mutable_shape() = shape.ToProto(); + *instr.mutable_gather_dimension_numbers() = dimension_numbers; + for (int64 bound : slice_sizes) { + instr.add_gather_slice_sizes(bound); + } + + return AddInstruction(std::move(instr), HloOpcode::kGather, + {input, start_indices}); +} + XlaOp XlaBuilder::Scatter(XlaOp input, XlaOp scatter_indices, XlaOp updates, const XlaComputation& update_computation, const ScatterDimensionNumbers& dimension_numbers, @@ -2149,6 +2185,39 @@ XlaOp XlaBuilder::BatchNormGrad(XlaOp operand, XlaOp scale, XlaOp batch_mean, }); } +XlaOp XlaBuilder::AllGather(XlaOp operand, int64 all_gather_dimension, + int64 shard_count, + absl::Span replica_groups, + const absl::optional& channel_id, + const absl::optional& layout) { + return ReportErrorOrReturn([&]() -> StatusOr { + HloInstructionProto instr; + TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand)); + + TF_ASSIGN_OR_RETURN(Shape inferred_shape, + ShapeInference::InferAllGatherShape( + *operand_shape, all_gather_dimension, shard_count)); + if (layout) { + *inferred_shape.mutable_layout() = *layout; + instr.set_constrain_layout(true); + } + *instr.mutable_shape() = inferred_shape.ToProto(); + + instr.add_dimensions(all_gather_dimension); + for (const ReplicaGroup& group : replica_groups) { + *instr.add_replica_groups() = group; + } + if (channel_id.has_value()) { + instr.set_channel_id(channel_id->handle()); + } + + 
TF_ASSIGN_OR_RETURN( + auto all_gather, + AddInstruction(std::move(instr), HloOpcode::kAllGather, {operand})); + return all_gather; + }); +} + XlaOp XlaBuilder::CrossReplicaSum( XlaOp operand, absl::Span replica_groups) { return ReportErrorOrReturn([&]() -> StatusOr { @@ -2257,7 +2326,8 @@ XlaOp XlaBuilder::AllReduce(XlaOp operand, const XlaComputation& computation, XlaOp XlaBuilder::AllToAll(XlaOp operand, int64 split_dimension, int64 concat_dimension, int64 split_count, - const std::vector& replica_groups) { + const std::vector& replica_groups, + const absl::optional& layout) { return ReportErrorOrReturn([&]() -> StatusOr { TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand)); @@ -2292,7 +2362,21 @@ XlaOp XlaBuilder::AllToAll(XlaOp operand, int64 split_dimension, [](const Shape& shape) { return &shape; }); TF_ASSIGN_OR_RETURN( Shape shape, ShapeInference::InferAllToAllTupleShape(slice_shape_ptrs)); + + if (layout) { + TF_RET_CHECK(shape.IsTuple() && !ShapeUtil::IsNestedTuple(shape)); + for (int64 i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) { + if (layout->minor_to_major().size() != shape.tuple_shapes(i).rank()) { + return InvalidArgument( + "Provided layout must be compatible with the operand shape: %s " + "vs %s", + layout->ToString(), operand_shape->ToString()); + } + *(shape.mutable_tuple_shapes(i)->mutable_layout()) = *layout; + } + } *instr.mutable_shape() = shape.ToProto(); + for (const ReplicaGroup& group : replica_groups) { *instr.add_replica_groups() = group; } @@ -2596,6 +2680,11 @@ XlaOp XlaBuilder::GetDimensionSize(XlaOp operand, int64 dimension) { TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand)); TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferGetDimensionSizeShape( *operand_shape, dimension)); + // Calling GetDimensionSize on a static dimension returns a constant + // instruction. + if (!operand_shape->is_dynamic_dimension(dimension)) { + return ConstantR0(this, operand_shape->dimensions(dimension)); + } *instr.mutable_shape() = shape.ToProto(); instr.add_dimensions(dimension); return AddInstruction(std::move(instr), HloOpcode::kGetDimensionSize, @@ -2607,8 +2696,20 @@ XlaOp XlaBuilder::SetDimensionSize(XlaOp operand, XlaOp val, int64 dimension) { return ReportErrorOrReturn([&]() -> StatusOr { HloInstructionProto instr; TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand)); + TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferSetDimensionSizeShape( *operand_shape, dimension)); + // Setting an op's dynamic dimension to the static size is a noop. 
+ TF_ASSIGN_OR_RETURN(const HloInstructionProto* val_proto, + LookUpInstruction(val)); + if (StringToHloOpcode(val_proto->opcode()).ValueOrDie() == + HloOpcode::kConstant) { + TF_ASSIGN_OR_RETURN(auto literal, + Literal::CreateFromProto(val_proto->literal(), true)); + if (literal.Get({}) == shape.dimensions(dimension)) { + return operand; + } + } *instr.mutable_shape() = shape.ToProto(); instr.add_dimensions(dimension); return AddInstruction(std::move(instr), HloOpcode::kSetDimensionSize, @@ -3019,20 +3120,11 @@ XlaOp SliceInDim(const XlaOp operand, int64 start_index, int64 limit_index, stride, dimno); } -XlaOp DynamicSlice(const XlaOp operand, const XlaOp start_indices, - absl::Span slice_sizes) { - return operand.builder()->DynamicSlice(operand, start_indices, slice_sizes); -} XlaOp DynamicSlice(const XlaOp operand, absl::Span start_indices, absl::Span slice_sizes) { return operand.builder()->DynamicSlice(operand, start_indices, slice_sizes); } -XlaOp DynamicUpdateSlice(const XlaOp operand, const XlaOp update, - const XlaOp start_indices) { - return operand.builder()->DynamicUpdateSlice(operand, update, start_indices); -} - XlaOp DynamicUpdateSlice(const XlaOp operand, const XlaOp update, absl::Span start_indices) { return operand.builder()->DynamicUpdateSlice(operand, update, start_indices); @@ -3096,6 +3188,10 @@ XlaOp Compare(const XlaOp lhs, const XlaOp rhs, broadcast_dimensions, direction); } +XlaOp Compare(const XlaOp lhs, const XlaOp rhs, ComparisonDirection direction) { + return Compare(lhs, rhs, {}, direction); +} + XlaOp Dot(const XlaOp lhs, const XlaOp rhs, const PrecisionConfig* precision_config) { return lhs.builder()->Dot(lhs, rhs, precision_config); @@ -3384,6 +3480,16 @@ XlaOp ReduceWindowWithGeneralPadding( base_dilations, window_dilations, padding); } +XlaOp AllGather(const XlaOp operand, int64 all_gather_dimension, + int64 shard_count, + absl::Span replica_groups, + const absl::optional& channel_id, + const absl::optional& layout) { + return operand.builder()->AllGather(operand, all_gather_dimension, + shard_count, replica_groups, channel_id, + layout); +} + XlaOp CrossReplicaSum(const XlaOp operand, absl::Span replica_groups) { return operand.builder()->CrossReplicaSum(operand, replica_groups); @@ -3399,9 +3505,10 @@ XlaOp AllReduce(const XlaOp operand, const XlaComputation& computation, XlaOp AllToAll(const XlaOp operand, int64 split_dimension, int64 concat_dimension, int64 split_count, - const std::vector& replica_groups) { + const std::vector& replica_groups, + const absl::optional& layout) { return operand.builder()->AllToAll(operand, split_dimension, concat_dimension, - split_count, replica_groups); + split_count, replica_groups, layout); } XlaOp CollectivePermute( @@ -3488,6 +3595,9 @@ XlaOp Imag(const XlaOp operand) { XlaOp Sqrt(const XlaOp operand) { return operand.builder()->UnaryOp(HloOpcode::kSqrt, operand); } +XlaOp Cbrt(const XlaOp operand) { + return operand.builder()->UnaryOp(HloOpcode::kCbrt, operand); +} XlaOp Rsqrt(const XlaOp operand) { return operand.builder()->UnaryOp(HloOpcode::kRsqrt, operand); } diff --git a/tensorflow/compiler/xla/client/xla_builder.h b/tensorflow/compiler/xla/client/xla_builder.h index 06fc518851f..426b6d83207 100644 --- a/tensorflow/compiler/xla/client/xla_builder.h +++ b/tensorflow/compiler/xla/client/xla_builder.h @@ -364,6 +364,10 @@ class XlaBuilder { Status SetInstructionFrontendAttribute(XlaOp op, string attribute, string value); + // Returns shapes for the operands. 
+ StatusOr> GetOperandShapes( + absl::Span operands) const; + private: // Build helper which takes the id of the root operation.. StatusOr Build(int64 root_id, bool remove_dynamic_dimensions); @@ -391,6 +395,10 @@ class XlaBuilder { XlaOp Pad(XlaOp operand, XlaOp padding_value, const PaddingConfig& padding_config); + virtual StatusOr PadInternal(const Shape& shape, XlaOp operand, + XlaOp padding_value, + const PaddingConfig& padding_config); + XlaOp Reshape(XlaOp operand, absl::Span dimensions, absl::Span new_sizes, int64 inferred_dimension = -1); @@ -406,30 +414,42 @@ class XlaBuilder { XlaOp Slice(XlaOp operand, absl::Span start_indices, absl::Span limit_indices, absl::Span strides); + virtual StatusOr SliceInternal(const Shape& shape, XlaOp operand, + absl::Span start_indices, + absl::Span limit_indices, + absl::Span strides); + virtual XlaOp SliceInDim(XlaOp operand, int64 start_index, int64 limit_index, + int64 stride, int64 dimno); - XlaOp SliceInDim(XlaOp operand, int64 start_index, int64 limit_index, - int64 stride, int64 dimno); - - ABSL_DEPRECATED("Use span-of-indices form instead") - XlaOp DynamicSlice(XlaOp operand, XlaOp start_indices, - absl::Span slice_sizes); XlaOp DynamicSlice(XlaOp operand, absl::Span start_indices, absl::Span slice_sizes); + virtual StatusOr DynamicSliceInternal( + const Shape& shape, XlaOp operand, absl::Span start_indices, + absl::Span slice_sizes); - ABSL_DEPRECATED("Use span-of-indices form instead") - XlaOp DynamicUpdateSlice(XlaOp operand, XlaOp update, XlaOp start_indices); XlaOp DynamicUpdateSlice(XlaOp operand, XlaOp update, absl::Span start_indices); + virtual StatusOr DynamicUpdateSliceInternal( + const Shape& shape, XlaOp operand, XlaOp update, + absl::Span start_indices); XlaOp ConcatInDim(absl::Span operands, int64 dimension); + virtual StatusOr ConcatInDimInternal(const Shape& shape, + absl::Span operands, + int64 dimension); void Trace(const string& tag, XlaOp operand); XlaOp Select(XlaOp pred, XlaOp on_true, XlaOp on_false); XlaOp Tuple(absl::Span elements); + virtual StatusOr TupleInternal(const Shape& shape, + absl::Span elements); XlaOp GetTupleElement(XlaOp tuple_data, int64 index); + virtual StatusOr GetTupleElementInternal(const Shape& shape, + XlaOp tuple_data, + int64 index); XlaOp Dot(XlaOp lhs, XlaOp rhs, const PrecisionConfig* precision_config = nullptr); @@ -472,19 +492,32 @@ class XlaBuilder { int64 batch_group_count = 1, const PrecisionConfig* precision_config = nullptr); + virtual StatusOr ConvGeneralDilatedInternal( + const Shape& shape, XlaOp lhs, XlaOp rhs, const Window& window, + absl::Span window_strides, + absl::Span> padding, + absl::Span lhs_dilation, + absl::Span rhs_dilation, + const ConvolutionDimensionNumbers& dimension_numbers, + int64 feature_group_count, int64 batch_group_count, + const PrecisionConfig* precision_config); + XlaOp Fft(XlaOp operand, FftType fft_type, absl::Span fft_length); XlaOp Infeed(const Shape& shape, const string& config = ""); - XlaOp InfeedWithToken(XlaOp token, const Shape& shape, - const string& config = ""); + XlaOp InfeedWithToken(XlaOp token, const Shape& shape, const string& config); + virtual StatusOr InfeedWithTokenInternal( + const Shape& infeed_instruction_shape, XlaOp token, const string& config); void Outfeed(XlaOp operand, const Shape& shape_with_layout, const string& outfeed_config); XlaOp OutfeedWithToken(XlaOp operand, XlaOp token, const Shape& shape_with_layout, const string& outfeed_config); - + virtual StatusOr OutfeedWithTokenInternal( + XlaOp operand, XlaOp token, 
const Shape& shape_with_layout, + const string& outfeed_config); XlaOp Call(const XlaComputation& computation, absl::Span operands); @@ -527,6 +560,12 @@ class XlaBuilder { XlaOp CrossReplicaSum(XlaOp operand, absl::Span replica_groups = {}); + XlaOp AllGather( + XlaOp operand, int64 all_gather_dimension, int64 shard_count, + absl::Span replica_groups = {}, + const absl::optional& channel_id = absl::nullopt, + const absl::optional& layout = absl::nullopt); + XlaOp AllReduce( XlaOp operand, const XlaComputation& computation, absl::Span replica_groups = {}, @@ -535,7 +574,8 @@ class XlaBuilder { XlaOp AllToAll(XlaOp operand, int64 split_dimension, int64 concat_dimension, int64 split_count, - const std::vector& replica_groups); + const std::vector& replica_groups, + const absl::optional& layout = absl::nullopt); XlaOp CollectivePermute( XlaOp operand, @@ -565,6 +605,8 @@ class XlaBuilder { XlaOp BitcastConvertType(XlaOp operand, PrimitiveType new_element_type); XlaOp Transpose(XlaOp operand, absl::Span permutation); + virtual StatusOr TransposeInternal( + const Shape& shape, XlaOp operand, absl::Span permutation); XlaOp Rev(XlaOp operand, absl::Span dimensions); @@ -603,6 +645,11 @@ class XlaBuilder { absl::Span slice_sizes, bool indices_are_sorted = false); + virtual StatusOr GatherInternal( + const Shape& shape, XlaOp input, XlaOp start_indices, + const GatherDimensionNumbers& dimension_numbers, + absl::Span slice_sizes, bool indices_are_sorted); + XlaOp Scatter(XlaOp input, XlaOp scatter_indices, XlaOp updates, const XlaComputation& update_computation, const ScatterDimensionNumbers& dimension_numbers, @@ -617,7 +664,7 @@ class XlaBuilder { XlaOp RecvFromHost(XlaOp token, const Shape& shape, const ChannelHandle& handle); - XlaOp CreateToken(); + virtual XlaOp CreateToken(); XlaOp AfterAll(absl::Span tokens); @@ -677,6 +724,10 @@ class XlaBuilder { XlaOp RngOp(RandomDistribution distribution, absl::Span parameters, const Shape& shape); + virtual StatusOr RngOpInternal(RandomDistribution distribution, + absl::Span parameters, + const Shape& shape); + virtual StatusOr InDimBroadcast( const Shape& shape, XlaOp operand, absl::Span broadcast_dimensions); @@ -694,10 +745,6 @@ class XlaBuilder { // Returns the (inferred) result for the program shape using the given root. StatusOr GetProgramShape(int64 root_id) const; - // Returns shapes for the operands. - StatusOr> GetOperandShapes( - absl::Span operands) const; - // A visitor which checks whether an operation is a compile-time constant, // meaning that it doesn't depend on any parameters, or on any stateful // operation such as `RngNormal` or `Infeed`. 
The visitor walks the @@ -812,14 +859,10 @@ class XlaBuilder { friend XlaOp SliceInDim(XlaOp operand, int64 start_index, int64 limit_index, int64 stride, int64 dimno); - friend XlaOp DynamicSlice(XlaOp operand, XlaOp start_indices, - absl::Span slice_sizes); friend XlaOp DynamicSlice(XlaOp operand, absl::Span start_indices, absl::Span slice_sizes); - friend XlaOp DynamicUpdateSlice(XlaOp operand, XlaOp update, - XlaOp start_indices); friend XlaOp DynamicUpdateSlice(XlaOp operand, XlaOp update, absl::Span start_indices); @@ -846,11 +889,16 @@ class XlaBuilder { friend XlaOp Compare(XlaOp lhs, XlaOp rhs, absl::Span broadcast_dimensions, ComparisonDirection direction); + friend XlaOp Compare(XlaOp lhs, XlaOp rhs, ComparisonDirection direction); friend XlaOp Dot(XlaOp lhs, XlaOp rhs, const PrecisionConfig* precision_config); friend XlaOp DotGeneral(XlaOp lhs, XlaOp rhs, const DotDimensionNumbers& dimension_number, const PrecisionConfig* precision_config); + virtual StatusOr DotGeneralInternal( + const Shape& shape, XlaOp lhs, XlaOp rhs, + const DotDimensionNumbers& dimension_number, + const PrecisionConfig* precision_config); friend XlaOp Conv(XlaOp lhs, XlaOp rhs, absl::Span window_strides, Padding padding, int64 feature_group_count, int64 batch_group_count, @@ -958,13 +1006,19 @@ class XlaBuilder { absl::Span> padding); friend XlaOp CrossReplicaSum(XlaOp operand, absl::Span replica_groups); + friend XlaOp AllGather(XlaOp operand, int64 all_gather_dimension, + int64 shard_count, + absl::Span replica_groups, + const absl::optional& channel_id, + const absl::optional& layout); friend XlaOp AllReduce(XlaOp operand, const XlaComputation& computation, absl::Span replica_groups, const absl::optional& channel_id, const absl::optional& shape_with_layout); friend XlaOp AllToAll(XlaOp operand, int64 split_dimension, int64 concat_dimension, int64 split_count, - const std::vector& replica_groups); + const std::vector& replica_groups, + const absl::optional& layout); friend XlaOp CollectivePermute( XlaOp operand, const std::vector>& source_target_pairs); @@ -999,6 +1053,7 @@ class XlaBuilder { friend XlaOp Imag(XlaOp operand); friend XlaOp Sqrt(XlaOp operand); friend XlaOp Rsqrt(XlaOp operand); + friend XlaOp Cbrt(XlaOp operand); friend XlaOp Pow(XlaOp lhs, XlaOp rhs, absl::Span broadcast_dimensions); friend XlaOp IsFinite(XlaOp operand); @@ -1381,10 +1436,6 @@ XlaOp SliceInDim(XlaOp operand, int64 start_index, int64 limit_index, XlaOp DynamicSlice(XlaOp operand, absl::Span start_indices, absl::Span slice_sizes); -ABSL_DEPRECATED("Use span-of-indices form instead") -XlaOp DynamicSlice(XlaOp operand, XlaOp start_indices, - absl::Span slice_sizes); - // Enqueues a dynamic update slice operation onto the computation, which // updates a slice of 'operand' with 'update' at dynamic 'start_indices'. // The shape of 'update' determines the shape of the slice of 'operand' @@ -1405,9 +1456,6 @@ XlaOp DynamicSlice(XlaOp operand, XlaOp start_indices, XlaOp DynamicUpdateSlice(XlaOp operand, XlaOp update, absl::Span start_indices); -ABSL_DEPRECATED("Use span-of-indices form instead") -XlaOp DynamicUpdateSlice(XlaOp operand, XlaOp update, XlaOp start_indices); - // Enqueues a concatenate instruction onto the computation. 'operands' must // have >= 1 entry. XlaOp ConcatInDim(XlaBuilder* builder, absl::Span operands, @@ -1451,10 +1499,12 @@ XlaOp Lt(XlaOp lhs, XlaOp rhs, XlaOp Le(XlaOp lhs, XlaOp rhs, absl::Span broadcast_dimensions = {}); -// Enqueues a comparison instruction onto the computation. 
+// Enqueues a comparison instruction onto the computation (optionally without
+// broadcast_dimensions, for consistency with the other comparison helpers).
XlaOp Compare(XlaOp lhs, XlaOp rhs,
              absl::Span<const int64> broadcast_dimensions,
              ComparisonDirection direction);
+XlaOp Compare(XlaOp lhs, XlaOp rhs, ComparisonDirection direction);
// Enqueues a dot instruction onto the computation.
XlaOp Dot(XlaOp lhs, XlaOp rhs,
@@ -1735,6 +1785,11 @@ XlaOp ReduceWindowWithGeneralPadding(
XlaOp CrossReplicaSum(XlaOp operand,
                      absl::Span<const ReplicaGroup> replica_groups = {});
+XlaOp AllGather(XlaOp operand, int64 all_gather_dimension, int64 shard_count,
+                absl::Span<const ReplicaGroup> replica_groups = {},
+                const absl::optional<ChannelHandle>& channel_id = absl::nullopt,
+                const absl::optional<Layout>& layout = absl::nullopt);
+
// Enqueues an operation that do an AllReduce of the operand cross cores. Here
// AllReduce means doing a reduction on the input operand cross cores and then
// broadcasting the reduction result to those cores. The reduction function is
@@ -1760,9 +1815,13 @@ XlaOp AllReduce(XlaOp operand, const XlaComputation& computation,
                const absl::optional<Shape>& shape_with_layout = absl::nullopt);
// Enqueues an operation that do an Alltoall of the operand cross cores.
+// An optional `layout` can be specified to force the layout of the instruction.
+// This is used to guarantee the same layout for a group of AllToAll ops
+// compiled separately.
XlaOp AllToAll(XlaOp operand, int64 split_dimension, int64 concat_dimension,
               int64 split_count,
-               const std::vector<ReplicaGroup>& replica_groups = {});
+               const std::vector<ReplicaGroup>& replica_groups = {},
+               const absl::optional<Layout>& layout = absl::nullopt);
// Enqueues an collective operation that sends and receives data cross replicas.
//
@@ -1849,6 +1908,9 @@ XlaOp Imag(XlaOp operand);
// Enqueues a sqrt computation onto the computation.
XlaOp Sqrt(XlaOp operand);
+// Enqueues a cbrt computation onto the computation.
+XlaOp Cbrt(XlaOp operand);
+
// Enqueues a rsqrt computation onto the computation.
XlaOp Rsqrt(XlaOp operand);
diff --git a/tensorflow/compiler/xla/client/xla_builder_test.cc b/tensorflow/compiler/xla/client/xla_builder_test.cc
index 115a822b323..4fa47077fca 100644
--- a/tensorflow/compiler/xla/client/xla_builder_test.cc
+++ b/tensorflow/compiler/xla/client/xla_builder_test.cc
@@ -381,6 +381,29 @@ TEST_F(XlaBuilderTest, Transpose) {
  EXPECT_THAT(root, op::Transpose(op::Parameter()));
}
+TEST_F(XlaBuilderTest, AllGatherR1) {
+  XlaBuilder b(TestName());
+  auto x = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {4}), "x");
+  AllGather(x, /*all_gather_dimension=*/0, /*shard_count=*/4);
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+  auto root = module->entry_computation()->root_instruction();
+
+  EXPECT_EQ(root->opcode(), HloOpcode::kAllGather);
+  EXPECT_TRUE(ShapeUtil::Equal(root->shape(), ShapeUtil::MakeShape(F32, {16})));
+}
+
+TEST_F(XlaBuilderTest, AllGatherR2) {
+  XlaBuilder b(TestName());
+  auto x = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {4, 16}), "x");
+  AllGather(x, /*all_gather_dimension=*/1, /*shard_count=*/4);
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+  auto root = module->entry_computation()->root_instruction();
+
+  EXPECT_EQ(root->opcode(), HloOpcode::kAllGather);
+  EXPECT_TRUE(
+      ShapeUtil::Equal(root->shape(), ShapeUtil::MakeShape(F32, {4, 64})));
+}
+
TEST_F(XlaBuilderTest, AllToAll) {
  XlaBuilder b(TestName());
  auto x = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {4, 16}), "x");
@@ -407,13 +430,25 @@ TEST_F(XlaBuilderTest, CollectivePermute) {
TEST_F(XlaBuilderTest, GetDimensionSize) {
  XlaBuilder b(TestName());
-  auto x = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {5, 7}), "x");
+  auto x =
+      Parameter(&b, 0, ShapeUtil::MakeShape(F32, {5, 7}, {false, true}), "x");
  GetDimensionSize(x, 1);
  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
  auto root = module->entry_computation()->root_instruction();
  EXPECT_EQ(root->opcode(), HloOpcode::kGetDimensionSize);
}
+TEST_F(XlaBuilderTest, GetDimensionSizeConstant) {
+  XlaBuilder b(TestName());
+  auto x =
+      Parameter(&b, 0, ShapeUtil::MakeShape(F32, {5, 7}, {false, true}), "x");
+  // Getting the dimension size of a static dimension gives us a constant.
+  GetDimensionSize(x, 0);
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_EQ(root->opcode(), HloOpcode::kConstant);
+}
+
TEST_F(XlaBuilderTest, ReportError) {
  XlaBuilder b(TestName());
  auto x = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {5, 7}), "x");
diff --git a/tensorflow/compiler/xla/cpu_function_runtime.h b/tensorflow/compiler/xla/cpu_function_runtime.h
index 0c3355cbbfb..ea981d526e4 100644
--- a/tensorflow/compiler/xla/cpu_function_runtime.h
+++ b/tensorflow/compiler/xla/cpu_function_runtime.h
@@ -138,6 +138,9 @@ class BufferInfo {
// Align to 64-bytes, to mimic tensorflow::Allocator::kAllocatorAlignment.
constexpr size_t kAlign = 64;
+// The minimum alignment of buffers passed to XLA:CPU.
+constexpr size_t kMinAlign = 16;
+
// When declaring variables that will be passed to an XLA instance as input via
// set_arg_data(), be it a regular input or a resource variable in the graph,
// the C++ variables must be aligned.
diff --git a/tensorflow/compiler/xla/debug_options_flags.cc b/tensorflow/compiler/xla/debug_options_flags.cc
index 8604531889e..4152982bf4c 100644
--- a/tensorflow/compiler/xla/debug_options_flags.cc
+++ b/tensorflow/compiler/xla/debug_options_flags.cc
@@ -55,14 +55,26 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() {
// b/77879207.
opts.set_xla_gpu_disable_multi_streaming(true); - // TODO(jlebar): Disable fastmath once doing so is not a performance - // regression. + // Disable forms of fast math that have caused users problems in the past. opts.set_xla_cpu_enable_fast_math(true); + opts.set_xla_cpu_fast_math_honor_nans(true); + opts.set_xla_cpu_fast_math_honor_infs(true); + opts.set_xla_cpu_fast_math_honor_functions(true); + opts.set_xla_cpu_fast_math_honor_division(true); + + // By default, copy TF's Eigen style min_max behavior with nans. + opts.set_xla_cpu_enable_fast_min_max(false); + opts.set_xla_gpu_enable_fast_min_max(true); opts.set_xla_allow_excess_precision(true); opts.set_xla_force_host_platform_device_count(1); opts.set_xla_gpu_deterministic_reductions(false); + opts.set_xla_cpu_enable_xprof_traceme(true); + // TODO(b/155295372): disable ptxas fallback by default. + opts.set_xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found(true); + opts.set_xla_gpu_unsafe_fallback_to_driver_on_ptxas_error(false); + return opts; } @@ -217,335 +229,353 @@ static void AllocateFlags() { return true; }; - flag_objects = new std::vector({ - tensorflow::Flag( - "xla_cpu_enable_fast_math", - bool_setter_for(&DebugOptions::set_xla_cpu_enable_fast_math), - flag_values->xla_cpu_enable_fast_math(), - "Enable unsafe fast-math optimizations in the CPU compiler; " - "this may produce faster code at the expense of some accuracy."), - tensorflow::Flag( - "xla_cpu_fast_math_honor_nans", - bool_setter_for(&DebugOptions::set_xla_cpu_fast_math_honor_nans), - flag_values->xla_cpu_fast_math_honor_nans(), - "When xla_cpu_enable_fast_math is true then this controls whether we " - "allow operations to produce NaNs. Ignored when " - "xla_cpu_enable_fast_math is false."), - tensorflow::Flag( - "xla_cpu_fast_math_honor_infs", - bool_setter_for(&DebugOptions::set_xla_cpu_fast_math_honor_infs), - flag_values->xla_cpu_fast_math_honor_infs(), - "When xla_cpu_enable_fast_math is true then this controls whether we " - "allow operations to produce infinites. Ignored when " - "xla_cpu_enable_fast_math is false."), - tensorflow::Flag( - "xla_cpu_fast_math_honor_division", - bool_setter_for(&DebugOptions::set_xla_cpu_fast_math_honor_division), - flag_values->xla_cpu_fast_math_honor_division(), - "When xla_cpu_enable_fast_math is true then this controls whether " - "we forbid to use multiplication by the reciprocal instead of " - "division. Ignored when xla_cpu_enable_fast_math is false."), - tensorflow::Flag( - "xla_cpu_fast_math_honor_functions", - bool_setter_for(&DebugOptions::set_xla_cpu_fast_math_honor_functions), - flag_values->xla_cpu_fast_math_honor_functions(), - "When xla_cpu_enable_fast_math is true then this controls whether " - "we forbid to approximate calculations for functions. 
Ignored when " - "xla_cpu_enable_fast_math is false."), - tensorflow::Flag( - "xla_gpu_enable_fast_min_max", - bool_setter_for(&DebugOptions::set_xla_gpu_enable_fast_min_max), - flag_values->xla_gpu_enable_fast_min_max(), - "Enable fast floating point min/max lowering that does not propagate " - "NaNs."), - tensorflow::Flag( - "xla_llvm_enable_alias_scope_metadata", - bool_setter_for( - &DebugOptions::set_xla_llvm_enable_alias_scope_metadata), - flag_values->xla_llvm_enable_alias_scope_metadata(), - "In LLVM-based backends, enable the emission of " - "!alias.scope metadata in the generated IR."), - tensorflow::Flag( - "xla_llvm_enable_noalias_metadata", - bool_setter_for(&DebugOptions::set_xla_llvm_enable_noalias_metadata), - flag_values->xla_llvm_enable_noalias_metadata(), - "In LLVM-based backends, enable the emission of " - "!noalias metadata in the generated IR."), - tensorflow::Flag( - "xla_llvm_enable_invariant_load_metadata", - bool_setter_for( - &DebugOptions::set_xla_llvm_enable_invariant_load_metadata), - flag_values->xla_llvm_enable_invariant_load_metadata(), - "In LLVM-based backends, enable the emission of " - "!invariant.load metadata in " - "the generated IR."), - tensorflow::Flag( - "xla_llvm_disable_expensive_passes", - bool_setter_for(&DebugOptions::set_xla_llvm_disable_expensive_passes), - flag_values->xla_llvm_disable_expensive_passes(), - "In LLVM-based backends, disable a custom set of " - "expensive optimization passes."), - tensorflow::Flag( - "xla_backend_optimization_level", - int32_setter_for(&DebugOptions::set_xla_backend_optimization_level), - flag_values->xla_backend_optimization_level(), - "Numerical optimization level for the XLA compiler backend."), - tensorflow::Flag( - "xla_disable_hlo_passes", setter_for_xla_disable_hlo_passes, "", - "Comma-separated list of hlo passes to be disabled. These names " - "must exactly match the passes' names; no whitespace around " - "commas."), - tensorflow::Flag( - "xla_enable_hlo_passes_only", setter_for_xla_enable_hlo_passes_only, - "", - "Comma-separated list of hlo passes to be enabled. These names " - "must exactly match the passes' names; no whitespace around " - "commas. The unspecified passes are all disabled."), - tensorflow::Flag( - "xla_disable_all_hlo_passes", - bool_setter_for(&DebugOptions::set_xla_disable_all_hlo_passes), false, - "Disables all HLO passes. Notes that some passes are necessary for " - "correctness and the invariants that must be satisfied by 'fully " - "optimized' HLO are different for different devices and may change " - "over time. 
The only 'guarantee', such as it is, is that if you " - "compile XLA and dump the optimized HLO for some graph, you should " - "be able to run it again on the same device with the same build of " - "XLA."), - tensorflow::Flag( - "xla_embed_ir_in_executable", - bool_setter_for(&DebugOptions::set_xla_embed_ir_in_executable), - flag_values->xla_embed_ir_in_executable(), - "Embed the compiler IR as a string in the executable."), - tensorflow::Flag( - "xla_eliminate_hlo_implicit_broadcast", - bool_setter_for( - &DebugOptions::set_xla_eliminate_hlo_implicit_broadcast), - flag_values->xla_eliminate_hlo_implicit_broadcast(), - "Eliminate implicit broadcasts when lowering user " - "computations to HLO instructions; use explicit " - "broadcast instead."), - tensorflow::Flag( - "xla_cpu_multi_thread_eigen", - bool_setter_for(&DebugOptions::set_xla_cpu_multi_thread_eigen), - flag_values->xla_cpu_multi_thread_eigen(), - "When generating calls to Eigen in the CPU backend, " - "use multi-threaded Eigen mode."), - tensorflow::Flag("xla_gpu_cuda_data_dir", - flag_values->mutable_xla_gpu_cuda_data_dir(), - "If non-empty, specifies a local directory containing " - "ptxas and nvvm libdevice files; otherwise we use " - "those from runfile directories."), - tensorflow::Flag("xla_gpu_ftz", - bool_setter_for(&DebugOptions::set_xla_gpu_ftz), - flag_values->xla_gpu_ftz(), - "If true, flush-to-zero semantics are enabled in the " - "code generated for GPUs."), - tensorflow::Flag( - "xla_gpu_disable_multi_streaming", - bool_setter_for(&DebugOptions::set_xla_gpu_disable_multi_streaming), - flag_values->xla_gpu_disable_multi_streaming(), - "If true, multi-streaming in the GPU backend is disabled."), - tensorflow::Flag( - "xla_gpu_max_kernel_unroll_factor", - int32_setter_for(&DebugOptions::set_xla_gpu_max_kernel_unroll_factor), - flag_values->xla_gpu_max_kernel_unroll_factor(), - "Specify the maximum kernel unroll factor for the GPU backend."), - tensorflow::Flag("xla_gpu_ptx_file", setter_for_xla_gpu_ptx_file, "", - "If non-empty, specifies a file containing ptx to use. " - "The filename prefix must have the same pattern as PTX " - "dumped by XLA. This allows to match one specific " - "module. General workflow. Get the generated module " - "ptx from XLA. Modify it. Then pass it back via this " - "option."), - tensorflow::Flag( - "xla_test_all_output_layouts", - bool_setter_for(&DebugOptions::set_xla_test_all_output_layouts), - flag_values->xla_test_all_output_layouts(), - "Let ClientLibraryTestBase::ComputeAndCompare* test " - "all permutations of output layouts. For example, with " - "a 3D shape, all permutations of the set {0, 1, 2} are " - "tried."), - tensorflow::Flag( - "xla_test_all_input_layouts", - bool_setter_for(&DebugOptions::set_xla_test_all_input_layouts), - flag_values->xla_test_all_input_layouts(), - "Let ClientLibraryTestBase::ComputeAndCompare* test " - "all permutations of *input* layouts. For example, for " - "2 input arguments with 2D shape and 4D shape, the " - "computation will run 2! * 4! 
times for every possible " - "layouts"), - tensorflow::Flag( - "xla_hlo_profile", - bool_setter_for(&DebugOptions::set_xla_hlo_profile), - flag_values->xla_hlo_profile(), - "Instrument the computation to collect per-HLO cycle counts"), - tensorflow::Flag("xla_backend_extra_options", - setter_for_xla_backend_extra_options, "", - "Extra options to pass to a backend; " - "comma-separated list of 'key=val' strings (=val " - "may be omitted); no whitespace around commas."), - tensorflow::Flag( - "xla_gpu_use_cudnn_batchnorm", - bool_setter_for(&DebugOptions::set_xla_gpu_use_cudnn_batchnorm), - flag_values->xla_gpu_use_cudnn_batchnorm(), - "Allows the GPU backend to implement batchnorm HLOs using cudnn, " - "rather than expanding them to a soup of HLOs."), + flag_objects = new std::vector(); + flag_objects->reserve(55); + // Don't use an initializer list for initializing the vector; this would + // create a temporary copy, and exceeds the stack space when compiling with + // certain configurations. + flag_objects->push_back(tensorflow::Flag( + "xla_cpu_enable_fast_math", + bool_setter_for(&DebugOptions::set_xla_cpu_enable_fast_math), + flag_values->xla_cpu_enable_fast_math(), + "Enable unsafe fast-math optimizations in the CPU compiler; this may " + "produce faster code at the expense of some accuracy.")); + flag_objects->push_back(tensorflow::Flag( + "xla_cpu_fast_math_honor_nans", + bool_setter_for(&DebugOptions::set_xla_cpu_fast_math_honor_nans), + flag_values->xla_cpu_fast_math_honor_nans(), + "When xla_cpu_enable_fast_math is true then this controls whether we " + "allow operations to produce NaNs. Ignored when " + "xla_cpu_enable_fast_math is false.")); + flag_objects->push_back(tensorflow::Flag( + "xla_cpu_fast_math_honor_infs", + bool_setter_for(&DebugOptions::set_xla_cpu_fast_math_honor_infs), + flag_values->xla_cpu_fast_math_honor_infs(), + "When xla_cpu_enable_fast_math is true then this controls whether we " + "allow operations to produce infinites. Ignored when " + "xla_cpu_enable_fast_math is false.")); + flag_objects->push_back(tensorflow::Flag( + "xla_cpu_fast_math_honor_division", + bool_setter_for(&DebugOptions::set_xla_cpu_fast_math_honor_division), + flag_values->xla_cpu_fast_math_honor_division(), + "When xla_cpu_enable_fast_math is true then this controls whether we " + "forbid to use multiplication by the reciprocal instead of division. " + "Ignored when xla_cpu_enable_fast_math is false.")); + flag_objects->push_back(tensorflow::Flag( + "xla_cpu_fast_math_honor_functions", + bool_setter_for(&DebugOptions::set_xla_cpu_fast_math_honor_functions), + flag_values->xla_cpu_fast_math_honor_functions(), + "When xla_cpu_enable_fast_math is true then this controls whether we " + "forbid to approximate calculations for functions. 
Ignored when " + "xla_cpu_enable_fast_math is false.")); + flag_objects->push_back(tensorflow::Flag( + "xla_cpu_enable_fast_min_max", + bool_setter_for(&DebugOptions::set_xla_cpu_enable_fast_min_max), + flag_values->xla_cpu_enable_fast_min_max(), + "Enable fast floating point min/max lowering that always propagates " + "NaNs.")); + flag_objects->push_back(tensorflow::Flag( + "xla_gpu_enable_fast_min_max", + bool_setter_for(&DebugOptions::set_xla_gpu_enable_fast_min_max), + flag_values->xla_gpu_enable_fast_min_max(), + "Enable fast floating point min/max lowering that does not propagate " + "NaNs.")); + flag_objects->push_back(tensorflow::Flag( + "xla_llvm_enable_alias_scope_metadata", + bool_setter_for(&DebugOptions::set_xla_llvm_enable_alias_scope_metadata), + flag_values->xla_llvm_enable_alias_scope_metadata(), + "In LLVM-based backends, enable the emission of !alias.scope metadata in " + "the generated IR.")); + flag_objects->push_back(tensorflow::Flag( + "xla_llvm_enable_noalias_metadata", + bool_setter_for(&DebugOptions::set_xla_llvm_enable_noalias_metadata), + flag_values->xla_llvm_enable_noalias_metadata(), + "In LLVM-based backends, enable the emission of !noalias metadata in the " + "generated IR.")); + flag_objects->push_back(tensorflow::Flag( + "xla_llvm_enable_invariant_load_metadata", + bool_setter_for( + &DebugOptions::set_xla_llvm_enable_invariant_load_metadata), + flag_values->xla_llvm_enable_invariant_load_metadata(), + "In LLVM-based backends, enable the emission of !invariant.load metadata " + "in the generated IR.")); + flag_objects->push_back(tensorflow::Flag( + "xla_llvm_disable_expensive_passes", + bool_setter_for(&DebugOptions::set_xla_llvm_disable_expensive_passes), + flag_values->xla_llvm_disable_expensive_passes(), + "In LLVM-based backends, disable a custom set of expensive optimization " + "passes.")); + flag_objects->push_back(tensorflow::Flag( + "xla_backend_optimization_level", + int32_setter_for(&DebugOptions::set_xla_backend_optimization_level), + flag_values->xla_backend_optimization_level(), + "Numerical optimization level for the XLA compiler backend.")); + flag_objects->push_back(tensorflow::Flag( + "xla_disable_hlo_passes", setter_for_xla_disable_hlo_passes, "", + "Comma-separated list of hlo passes to be disabled. These names must " + "exactly match the passes' names; no whitespace around commas.")); + flag_objects->push_back(tensorflow::Flag( + "xla_enable_hlo_passes_only", setter_for_xla_enable_hlo_passes_only, "", + "Comma-separated list of hlo passes to be enabled. These names must " + "exactly match the passes' names; no whitespace around commas. The " + "unspecified passes are all disabled.")); + flag_objects->push_back(tensorflow::Flag( + "xla_disable_all_hlo_passes", + bool_setter_for(&DebugOptions::set_xla_disable_all_hlo_passes), false, + "Disables all HLO passes. Notes that some passes are necessary for " + "correctness and the invariants that must be satisfied by 'fully " + "optimized' HLO are different for different devices and may change " + "over time. 
The only 'guarantee', such as it is, is that if you compile " + "XLA and dump the optimized HLO for some graph, you should be able to " + "run it again on the same device with the same build of XLA.")); + flag_objects->push_back(tensorflow::Flag( + "xla_embed_ir_in_executable", + bool_setter_for(&DebugOptions::set_xla_embed_ir_in_executable), + flag_values->xla_embed_ir_in_executable(), + "Embed the compiler IR as a string in the executable.")); + flag_objects->push_back(tensorflow::Flag( + "xla_eliminate_hlo_implicit_broadcast", + bool_setter_for(&DebugOptions::set_xla_eliminate_hlo_implicit_broadcast), + flag_values->xla_eliminate_hlo_implicit_broadcast(), + "Eliminate implicit broadcasts when lowering user computations to HLO " + "instructions; use explicit broadcast instead.")); + flag_objects->push_back(tensorflow::Flag( + "xla_cpu_multi_thread_eigen", + bool_setter_for(&DebugOptions::set_xla_cpu_multi_thread_eigen), + flag_values->xla_cpu_multi_thread_eigen(), + "When generating calls to Eigen in the CPU backend, use multi-threaded " + "Eigen mode.")); + flag_objects->push_back(tensorflow::Flag( + "xla_gpu_cuda_data_dir", flag_values->mutable_xla_gpu_cuda_data_dir(), + "If non-empty, specifies a local directory containing ptxas and nvvm " + "libdevice files; otherwise we use those from runfile directories.")); + flag_objects->push_back(tensorflow::Flag( + "xla_gpu_ftz", bool_setter_for(&DebugOptions::set_xla_gpu_ftz), + flag_values->xla_gpu_ftz(), + "If true, flush-to-zero semantics are enabled in the code generated for " + "GPUs.")); + flag_objects->push_back(tensorflow::Flag( + "xla_gpu_disable_multi_streaming", + bool_setter_for(&DebugOptions::set_xla_gpu_disable_multi_streaming), + flag_values->xla_gpu_disable_multi_streaming(), + "If true, multi-streaming in the GPU backend is disabled.")); + flag_objects->push_back(tensorflow::Flag( + "xla_gpu_max_kernel_unroll_factor", + int32_setter_for(&DebugOptions::set_xla_gpu_max_kernel_unroll_factor), + flag_values->xla_gpu_max_kernel_unroll_factor(), + "Specify the maximum kernel unroll factor for the GPU backend.")); + flag_objects->push_back(tensorflow::Flag( + "xla_gpu_ptx_file", setter_for_xla_gpu_ptx_file, "", + "If non-empty, specifies a file containing ptx to use. The filename " + "prefix must have the same pattern as PTX dumped by XLA. This allows to " + "match one specific module. General workflow. Get the generated module " + "ptx from XLA. Modify it. Then pass it back via this option.")); + flag_objects->push_back(tensorflow::Flag( + "xla_test_all_output_layouts", + bool_setter_for(&DebugOptions::set_xla_test_all_output_layouts), + flag_values->xla_test_all_output_layouts(), + "Let ClientLibraryTestBase::ComputeAndCompare* test all permutations of " + "output layouts. For example, with a 3D shape, all permutations of the " + "set {0, 1, 2} are tried.")); + flag_objects->push_back(tensorflow::Flag( + "xla_test_all_input_layouts", + bool_setter_for(&DebugOptions::set_xla_test_all_input_layouts), + flag_values->xla_test_all_input_layouts(), + "Let ClientLibraryTestBase::ComputeAndCompare* test all permutations of " + "*input* layouts. For example, for 2 input arguments with 2D shape and " + "4D shape, the computation will run 2! * 4! 
times for every possible " + "layouts")); + flag_objects->push_back(tensorflow::Flag( + "xla_hlo_profile", bool_setter_for(&DebugOptions::set_xla_hlo_profile), + flag_values->xla_hlo_profile(), + "Instrument the computation to collect per-HLO cycle counts")); + flag_objects->push_back(tensorflow::Flag( + "xla_backend_extra_options", setter_for_xla_backend_extra_options, "", + "Extra options to pass to a backend; comma-separated list of 'key=val' " + "strings (=val may be omitted); no whitespace around commas.")); + flag_objects->push_back(tensorflow::Flag( + "xla_gpu_use_cudnn_batchnorm", + bool_setter_for(&DebugOptions::set_xla_gpu_use_cudnn_batchnorm), + flag_values->xla_gpu_use_cudnn_batchnorm(), + "Allows the GPU backend to implement batchnorm HLOs using cudnn, rather " + "than expanding them to a soup of HLOs.")); + flag_objects->push_back( tensorflow::Flag("xla_cpu_use_mkl_dnn", bool_setter_for(&DebugOptions::set_xla_cpu_use_mkl_dnn), flag_values->xla_cpu_use_mkl_dnn(), - "Generate calls to MKL-DNN in the CPU backend."), - tensorflow::Flag( - "xla_gpu_crash_on_verification_failures", - bool_setter_for( - &DebugOptions::set_xla_gpu_crash_on_verification_failures), - flag_values->xla_gpu_crash_on_verification_failures(), - "Crashes the program on extra verification failures, e.g. cuDNN " - "cross checking failures"), - tensorflow::Flag( - "xla_gpu_autotune_level", - int32_setter_for(&DebugOptions::set_xla_gpu_autotune_level), - flag_values->xla_gpu_autotune_level(), - "Set GEMM and Convolution auto-tuning level." - "0 = off; 1 = on; 2 = on+init; 3 = on+init+reinit; 4 = " - "on+init+reinit+check."), - tensorflow::Flag( - "xla_force_host_platform_device_count", - int32_setter_for( - &DebugOptions::set_xla_force_host_platform_device_count), - flag_values->xla_force_host_platform_device_count(), - "Force the host platform to pretend that there are these many " - "host \"devices\". All of these host devices are backed by the same" - "threadpool. Setting this to anything other than 1 can increase " - "overhead from context switching but we let the user override this " - "behavior to help run tests on the host that run models in parallel " - "across multiple devices."), - tensorflow::Flag( - "xla_gpu_disable_gpuasm_optimizations", - bool_setter_for( - &DebugOptions::set_xla_gpu_disable_gpuasm_optimizations), - flag_values->xla_gpu_disable_gpuasm_optimizations(), - "In XLA:GPU run ptxas in -O0 (default is -O3)."), - tensorflow::Flag( - "xla_fuel", setter_for_xla_fuel, /*default_value_for_display=*/"", - "Sets compiler fuel, useful for bisecting bugs in passes. Format " - "--xla_fuel=PASS1=NUM1,PASS2=NUM2,..."), - - tensorflow::Flag( - "xla_dump_to", string_setter_for(&DebugOptions::set_xla_dump_to), - flag_values->xla_dump_to(), - "Directory into which debugging data is written. If not specified " - "but another dumping flag is passed, data will be written to stdout. " - " To explicitly write to stdout, set this to \"-\". The values " - "\"sponge\" and \"test_undeclared_outputs_dir\" have a special " - "meaning: They cause us to dump into the directory specified by the " - "environment variable TEST_UNDECLARED_OUTPUTS_DIR."), - tensorflow::Flag( - "xla_dump_hlo_as_text", - bool_setter_for(&DebugOptions::set_xla_dump_hlo_as_text), - flag_values->xla_dump_hlo_as_text(), - "Dumps HLO modules as text before and after optimizations. 
Results " - "are written to the --xla_dump_to dir, or, if no dir is specified, " - "to stdout."), - tensorflow::Flag( - "xla_dump_hlo_as_proto", - bool_setter_for(&DebugOptions::set_xla_dump_hlo_as_proto), - flag_values->xla_dump_hlo_as_proto(), - "Dumps HLO modules as HloProtos to the directory specified by " - "--xla_dump_to."), - tensorflow::Flag( - "xla_dump_hlo_as_dot", - bool_setter_for(&DebugOptions::set_xla_dump_hlo_as_dot), - flag_values->xla_dump_hlo_as_dot(), - "Dumps HLO modules rendered as dot files to the directory " - "specified by --xla_dump_to."), + "Generate calls to MKL-DNN in the CPU backend.")); + flag_objects->push_back(tensorflow::Flag( + "xla_gpu_crash_on_verification_failures", + bool_setter_for( + &DebugOptions::set_xla_gpu_crash_on_verification_failures), + flag_values->xla_gpu_crash_on_verification_failures(), + "Crashes the program on extra verification failures, e.g. cuDNN cross " + "checking failures")); + flag_objects->push_back(tensorflow::Flag( + "xla_gpu_autotune_level", + int32_setter_for(&DebugOptions::set_xla_gpu_autotune_level), + flag_values->xla_gpu_autotune_level(), + "Set GEMM and Convolution auto-tuning level. 0 = off; 1 = on; 2 = " + "on+init; 3 = on+init+reinit; 4 = on+init+reinit+check.")); + flag_objects->push_back(tensorflow::Flag( + "xla_force_host_platform_device_count", + int32_setter_for(&DebugOptions::set_xla_force_host_platform_device_count), + flag_values->xla_force_host_platform_device_count(), + "Force the host platform to pretend that there are these many host " + "\"devices\". All of these host devices are backed by the same " + "threadpool. Setting this to anything other than 1 can increase overhead " + "from context switching but we let the user override this behavior to " + "help run tests on the host that run models in parallel across multiple " + "devices.")); + flag_objects->push_back(tensorflow::Flag( + "xla_gpu_disable_gpuasm_optimizations", + bool_setter_for(&DebugOptions::set_xla_gpu_disable_gpuasm_optimizations), + flag_values->xla_gpu_disable_gpuasm_optimizations(), + "In XLA:GPU run ptxas in -O0 (default is -O3).")); + flag_objects->push_back(tensorflow::Flag( + "xla_fuel", setter_for_xla_fuel, /*default_value_for_display=*/"", + "Sets compiler fuel, useful for bisecting bugs in passes. Format " + "--xla_fuel=PASS1=NUM1,PASS2=NUM2,...")); + flag_objects->push_back(tensorflow::Flag( + "xla_dump_to", string_setter_for(&DebugOptions::set_xla_dump_to), + flag_values->xla_dump_to(), + "Directory into which debugging data is written. If not specified but " + "another dumping flag is passed, data will be written to stdout. To " + "explicitly write to stdout, set this to \"-\". The values \"sponge\" " + "and \"test_undeclared_outputs_dir\" have a special meaning: They cause " + "us to dump into the directory specified by the environment variable " + "TEST_UNDECLARED_OUTPUTS_DIR.")); + flag_objects->push_back(tensorflow::Flag( + "xla_dump_hlo_as_text", + bool_setter_for(&DebugOptions::set_xla_dump_hlo_as_text), + flag_values->xla_dump_hlo_as_text(), + "Dumps HLO modules as text before and after optimizations. 
Results are " + "written to the --xla_dump_to dir, or, if no dir is specified, to " + "stdout.")); + flag_objects->push_back(tensorflow::Flag( + "xla_dump_hlo_as_proto", + bool_setter_for(&DebugOptions::set_xla_dump_hlo_as_proto), + flag_values->xla_dump_hlo_as_proto(), + "Dumps HLO modules as HloProtos to the directory specified by " + "--xla_dump_to.")); + flag_objects->push_back( + tensorflow::Flag("xla_dump_hlo_as_dot", + bool_setter_for(&DebugOptions::set_xla_dump_hlo_as_dot), + flag_values->xla_dump_hlo_as_dot(), + "Dumps HLO modules rendered as dot files to the " + "directory specified by --xla_dump_to.")); + flag_objects->push_back( tensorflow::Flag("xla_dump_hlo_as_html", bool_setter_for(&DebugOptions::set_xla_dump_hlo_as_html), flag_values->xla_dump_hlo_as_html(), "Dumps HLO modules rendered as HTML files to the " - "directory specified by --xla_dump_to."), - tensorflow::Flag( - "xla_dump_hlo_as_url", - bool_setter_for(&DebugOptions::set_xla_dump_hlo_as_url), - flag_values->xla_dump_hlo_as_url(), - "Tries to dump HLO modules rendered as URLs to stdout (and also to " - "the directory specified by --xla_dump_to). This is not implemented " - "by default; you need to add a plugin which calls " - "RegisterGraphToURLRenderer()."), - tensorflow::Flag( - "xla_dump_hlo_snapshots", - bool_setter_for(&DebugOptions::set_xla_dump_hlo_snapshots), - flag_values->xla_dump_hlo_snapshots(), - "Every time an HLO module is run, dumps an HloSnapshot to the " - "directory specified by --xla_dump_to."), - tensorflow::Flag( - "xla_dump_hlo_module_re", - string_setter_for(&DebugOptions::set_xla_dump_hlo_module_re), - flag_values->xla_dump_hlo_module_re(), - "Limits dumping only to modules which match this regular expression. " - " Default is to dump all modules."), - tensorflow::Flag( - "xla_dump_hlo_pass_re", - string_setter_for(&DebugOptions::set_xla_dump_hlo_pass_re), - flag_values->xla_dump_hlo_pass_re(), - "If specified, dumps HLO before and after optimization passes which " - "match this regular expression, in addition to dumping at the very " - "beginning and end of compilation."), - tensorflow::Flag( - "xla_dump_include_timestamp", - bool_setter_for(&DebugOptions::set_xla_dump_include_timestamp), - flag_values->xla_dump_include_timestamp(), - "If specified, includes a timestamp in the dumped filenames."), - tensorflow::Flag( - "xla_dump_max_hlo_modules", - int32_setter_for(&DebugOptions::set_xla_dump_max_hlo_modules), - flag_values->xla_dump_max_hlo_modules(), - "Max number of hlo module dumps in a directory. 
Set to < 0 for " - "unbounded."), - tensorflow::Flag( - "xla_hlo_graph_addresses", - bool_setter_for(&DebugOptions::set_xla_hlo_graph_addresses), - flag_values->xla_hlo_graph_addresses(), - "When rendering graphs (--xla_dump_hlo_as_{dot,html,url}), displays " - "the address in memory of each HloInstruction object."), - tensorflow::Flag( - "xla_hlo_graph_sharding_color", - bool_setter_for(&DebugOptions::set_xla_hlo_graph_sharding_color), - flag_values->xla_hlo_graph_sharding_color(), - "Assign colors based on sharding assignments when generating the " - "HLO graphs."), - tensorflow::Flag( - "xla_allow_excess_precision", - bool_setter_for(&DebugOptions::set_xla_allow_excess_precision), - flag_values->xla_allow_excess_precision(), - "Allow xla to increase the output precision of an instruction."), - tensorflow::Flag( - "xla_gpu_force_conv_nchw", - bool_setter_for(&DebugOptions::set_xla_gpu_force_conv_nchw), - flag_values->xla_gpu_force_conv_nchw(), - "For cuDNN convolutions, always NCHW layouts."), - tensorflow::Flag("xla_gpu_algorithm_blacklist_path", - string_setter_for( - &DebugOptions::set_xla_gpu_algorithm_blacklist_path), - flag_values->xla_gpu_algorithm_blacklist_path(), - "An AlgorithmBlacklist text proto file as a blacklist " - "of convolutions to avoid to use."), - - tensorflow::Flag( - "xla_gpu_deterministic_reductions", - bool_setter_for(&DebugOptions::set_xla_gpu_deterministic_reductions), - flag_values->xla_gpu_deterministic_reductions(), - "Always run deterministic reductions on GPU"), - tensorflow::Flag( - "xla_tpu_detect_nan", - bool_setter_for(&DebugOptions::set_xla_tpu_detect_nan), - flag_values->xla_tpu_detect_nan(), - "Trigger error on execution on TPU if a NAN value is detected"), - tensorflow::Flag( - "xla_tpu_detect_inf", - bool_setter_for(&DebugOptions::set_xla_tpu_detect_inf), - flag_values->xla_tpu_detect_inf(), - "Trigger error on execution on TPU if a INF value is detected"), - }); + "directory specified by --xla_dump_to.")); + flag_objects->push_back(tensorflow::Flag( + "xla_dump_hlo_as_url", + bool_setter_for(&DebugOptions::set_xla_dump_hlo_as_url), + flag_values->xla_dump_hlo_as_url(), + "Tries to dump HLO modules rendered as URLs to stdout (and also to the " + "directory specified by --xla_dump_to). This is not implemented by " + "default; you need to add a plugin which calls " + "RegisterGraphToURLRenderer().")); + flag_objects->push_back(tensorflow::Flag( + "xla_dump_hlo_snapshots", + bool_setter_for(&DebugOptions::set_xla_dump_hlo_snapshots), + flag_values->xla_dump_hlo_snapshots(), + "Every time an HLO module is run, dumps an HloSnapshot to the directory " + "specified by --xla_dump_to.")); + flag_objects->push_back(tensorflow::Flag( + "xla_dump_hlo_module_re", + string_setter_for(&DebugOptions::set_xla_dump_hlo_module_re), + flag_values->xla_dump_hlo_module_re(), + "Limits dumping only to modules which match this regular expression. 
" + "Default is to dump all modules.")); + flag_objects->push_back(tensorflow::Flag( + "xla_dump_hlo_pass_re", + string_setter_for(&DebugOptions::set_xla_dump_hlo_pass_re), + flag_values->xla_dump_hlo_pass_re(), + "If specified, dumps HLO before and after optimization passes which " + "match this regular expression, in addition to dumping at the very " + "beginning and end of compilation.")); + flag_objects->push_back(tensorflow::Flag( + "xla_dump_include_timestamp", + bool_setter_for(&DebugOptions::set_xla_dump_include_timestamp), + flag_values->xla_dump_include_timestamp(), + "If specified, includes a timestamp in the dumped filenames.")); + flag_objects->push_back(tensorflow::Flag( + "xla_dump_max_hlo_modules", + int32_setter_for(&DebugOptions::set_xla_dump_max_hlo_modules), + flag_values->xla_dump_max_hlo_modules(), + "Max number of hlo module dumps in a directory. Set to < 0 for " + "unbounded.")); + flag_objects->push_back(tensorflow::Flag( + "xla_hlo_graph_addresses", + bool_setter_for(&DebugOptions::set_xla_hlo_graph_addresses), + flag_values->xla_hlo_graph_addresses(), + "When rendering graphs (--xla_dump_hlo_as_{dot,html,url}), displays " + "the address in memory of each HloInstruction object.")); + flag_objects->push_back(tensorflow::Flag( + "xla_hlo_graph_sharding_color", + bool_setter_for(&DebugOptions::set_xla_hlo_graph_sharding_color), + flag_values->xla_hlo_graph_sharding_color(), + "Assign colors based on sharding assignments when generating the HLO " + "graphs.")); + flag_objects->push_back(tensorflow::Flag( + "xla_allow_excess_precision", + bool_setter_for(&DebugOptions::set_xla_allow_excess_precision), + flag_values->xla_allow_excess_precision(), + "Allow xla to increase the output precision of an instruction.")); + flag_objects->push_back(tensorflow::Flag( + "xla_gpu_force_conv_nchw", + bool_setter_for(&DebugOptions::set_xla_gpu_force_conv_nchw), + flag_values->xla_gpu_force_conv_nchw(), + "For cuDNN convolutions, always NCHW layouts.")); + flag_objects->push_back(tensorflow::Flag( + "xla_gpu_algorithm_blacklist_path", + string_setter_for(&DebugOptions::set_xla_gpu_algorithm_blacklist_path), + flag_values->xla_gpu_algorithm_blacklist_path(), + "An AlgorithmBlacklist text proto file as a blacklist of convolutions to " + "avoid to use.")); + flag_objects->push_back(tensorflow::Flag( + "xla_gpu_deterministic_reductions", + bool_setter_for(&DebugOptions::set_xla_gpu_deterministic_reductions), + flag_values->xla_gpu_deterministic_reductions(), + "Always run deterministic reductions on GPU")); + flag_objects->push_back(tensorflow::Flag( + "xla_tpu_detect_nan", + bool_setter_for(&DebugOptions::set_xla_tpu_detect_nan), + flag_values->xla_tpu_detect_nan(), + "Trigger error on execution on TPU if a NAN value is detected")); + flag_objects->push_back(tensorflow::Flag( + "xla_tpu_detect_inf", + bool_setter_for(&DebugOptions::set_xla_tpu_detect_inf), + flag_values->xla_tpu_detect_inf(), + "Trigger error on execution on TPU if a INF value is detected")); + flag_objects->push_back(tensorflow::Flag( + "xla_cpu_enable_xprof_traceme", + bool_setter_for(&DebugOptions::set_xla_cpu_enable_xprof_traceme), + flag_values->xla_cpu_enable_xprof_traceme(), + "If true, XLA CPU generates code to call " + "TraceMe::Activity{Start|End} around HLO operations.")); + flag_objects->push_back(tensorflow::Flag( + "xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found", + bool_setter_for( + &DebugOptions:: + set_xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found), + 
flag_values->xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found(),
+      "If true, XLA GPU falls back to the driver if ptxas is not found. Note "
+      "that falling back to the driver can have drawbacks, such as higher "
+      "memory usage and compilation bugs, so we recommend setting this flag "
+      "to false."));
+  flag_objects->push_back(tensorflow::Flag(
+      "xla_gpu_unsafe_fallback_to_driver_on_ptxas_error",
+      bool_setter_for(
+          &DebugOptions::set_xla_gpu_unsafe_fallback_to_driver_on_ptxas_error),
+      flag_values->xla_gpu_unsafe_fallback_to_driver_on_ptxas_error(),
+      "If true, XLA GPU falls back to the driver if there is an error when "
+      "running ptxas. Note that falling back to the driver can have "
+      "drawbacks, such as higher memory usage and compilation bugs, so we "
+      "recommend setting this flag to false."));
  ParseFlagsFromEnvAndDieIfUnknown("XLA_FLAGS", *flag_objects);
}
diff --git a/tensorflow/compiler/xla/executable_run_options.h b/tensorflow/compiler/xla/executable_run_options.h
index 6981b35975f..8ae8c418d5d 100644
--- a/tensorflow/compiler/xla/executable_run_options.h
+++ b/tensorflow/compiler/xla/executable_run_options.h
@@ -50,6 +50,7 @@ class RunId {
 public:
  // Creates a new, unique RunId.
  RunId();
+  explicit RunId(int64 value) : data_(value) {}
  RunId(const RunId&) = default;
  RunId& operator=(const RunId&) = default;
@@ -127,6 +128,13 @@ class ExecutableRunOptions {
  ExecutableRunOptions& set_rng_seed(int rng_seed);
  int rng_seed() const;
+  ExecutableRunOptions& set_launch_id(int32 launch_id) {
+    launch_id_ = launch_id;
+    return *this;
+  }
+
+  int32 launch_id() const { return launch_id_; }
+
  ExecutableRunOptions& set_run_id(RunId id);
  RunId run_id() const;
@@ -153,6 +161,7 @@ class ExecutableRunOptions {
  const Eigen::ThreadPoolDevice* intra_op_thread_pool_ = nullptr;
  ExecutionProfile* execution_profile_ = nullptr;
  int rng_seed_ = 0;
+  int32 launch_id_ = 0;
  stream_executor::Stream* host_to_device_stream_ = nullptr;
  ThenExecuteFunction* then_execute_function_ = nullptr;
  RunId run_id_;
diff --git a/tensorflow/compiler/xla/experimental/xla_sharding/xla_sharding.py b/tensorflow/compiler/xla/experimental/xla_sharding/xla_sharding.py
index b89bfd68073..212ad87d94c 100644
--- a/tensorflow/compiler/xla/experimental/xla_sharding/xla_sharding.py
+++ b/tensorflow/compiler/xla/experimental/xla_sharding/xla_sharding.py
@@ -243,3 +243,54 @@ def split(tensor,
      tensor, split_dimension, num_devices, input_shape).apply_to_tensor(
          tensor, assign_tuple_sharding=assign_tuple_sharding)
  return tensor
+
+
+def get_op_sharding(op):
+  """Returns sharding attribute of an op.
+
+  Args:
+    op: a TensorFlow op.
+
+  Returns:
+    The attribute representing XLA sharding on this op.
+  """
+  return op.get_attr('_XlaSharding')
+
+
+def auto_to_manual_spmd_partition(tensor, manual_sharding):
+  """Switches from automatic SPMD partitioning to manual partitioning.
+
+  Converts a full-shaped tensor (to be automatically partitioned by SPMD
+  partitioner) to a shard-shaped tensor to be consumed by manually partitioned
+  ops.
+
+  Args:
+    tensor: A tf.Tensor in full shape.
+    manual_sharding: a serialized string of OpSharding to be used in manual
+      partitioning.
+
+  Returns:
+    A shard-shaped tensor to be consumed by manually partitioned ops.
+  """
+  return tf2xla.spmd_full_to_shard_shape(
+      tensor, manual_sharding=manual_sharding)
+
+
+def manual_to_auto_spmd_partition(tensor, manual_sharding, full_shape):
+  """Switches from manual partitioning to automatic SPMD partitioning.
+ + Converts a shard-shaped tensor (manually partitioned in SPMD-style) to a + full-shaped tensor to be partitioned automatically by the SPMD partitioner. + + Args: + tensor: A tf.Tensor in shard shape. + manual_sharding: a serialized string of OpSharding to be used in manual + partitioning. + full_shape: the shape of tensor before partitioning. + + Returns: + A full-shaped tensor to be partitioned automatically by the SPMD + partitioner. + """ + return tf2xla.spmd_shard_to_full_shape( + tensor, manual_sharding=manual_sharding, full_shape=full_shape) diff --git a/tensorflow/compiler/xla/g3doc/operation_semantics.md b/tensorflow/compiler/xla/g3doc/operation_semantics.md index 495701eaac2..002d07184a7 100644 --- a/tensorflow/compiler/xla/g3doc/operation_semantics.md +++ b/tensorflow/compiler/xla/g3doc/operation_semantics.md @@ -2299,20 +2299,26 @@ The output is guaranteed to be a deterministic function of the initial state but it is *not* guaranteed to be deterministic between backends and different compiler versions. -`RngBitGenerator(algorithm, key, shape)` | Arguments | Type | Semantics | -|---------------- | ----------------- | ------------------------------------- | -| `algorithm` | `RandomAlgorithm` | PRNG algorithm to be used. | | -`initial_state` | `XlaOp` | Initial state for the PRNG algorithm. | | `shape` | -`Shape` | Output shape for generated data. | +`RngBitGenerator(algorithm, key, shape)` -Available values for `algorithm`: * `rng_default`: Backend specific algorithm -with backend specific shape requirements. * `rng_three_fry`: ThreeFry -counter-based PRNG algorithm. The `initial_state` shape is `u64[2]` with -arbitrary values. -[Salmon et al. SC 2011. Parallel random numbers: as easy as 1, 2, 3.](http://www.thesalmons.org/john/random123/papers/random123sc11.pdf) -* `rng_philox`: Philox algorithm to generate random numbers in parallel. The -`initial_state` shape is `u64[3]` with arbitrary values. -[Salmon et al. SC 2011. Parallel random numbers: as easy as 1, 2, 3.](http://www.thesalmons.org/john/random123/papers/random123sc11.pdf) +Arguments | Type | Semantics +--------------- | ----------------- | ------------------------------------- +`algorithm` | `RandomAlgorithm` | PRNG algorithm to be used. +`initial_state` | `XlaOp` | Initial state for the PRNG algorithm. +`shape` | `Shape` | Output shape for generated data. + +Available values for `algorithm`: + +- `rng_default`: Backend specific algorithm with backend specific shape + requirements. + +- `rng_three_fry`: ThreeFry counter-based PRNG algorithm. The `initial_state` + shape is `u64[2]` with arbitrary values. + [Salmon et al. SC 2011. Parallel random numbers: as easy as 1, 2, 3.](http://www.thesalmons.org/john/random123/papers/random123sc11.pdf) + +- `rng_philox`: Philox algorithm to generate random numbers in parallel. The + `initial_state` shape is `u64[3]` with arbitrary values. + [Salmon et al. SC 2011. Parallel random numbers: as easy as 1, 2, 3.](http://www.thesalmons.org/john/random123/papers/random123sc11.pdf) ## Scatter diff --git a/tensorflow/compiler/xla/literal.cc b/tensorflow/compiler/xla/literal.cc index 44e6a3c7bdb..cbbad741ce3 100644 --- a/tensorflow/compiler/xla/literal.cc +++ b/tensorflow/compiler/xla/literal.cc @@ -27,6 +27,7 @@ limitations under the License. 
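A note before the literal.cc and literal.h hunks that follow: the extracted patch text has lost everything inside angle brackets, so template arguments (for example on absl::optional and GetFirstElement) are missing. Inferring them from the primitive-type cases, the new accessor presumably reads roughly as follows; this is a reconstruction sketch, not the verbatim source.

// Reconstruction sketch of LiteralBase::GetFirstInteger(); the template
// arguments are inferred from the switch cases and are not literally present
// in the extracted patch text.
absl::optional<int64> LiteralBase::GetFirstInteger() const {
  switch (shape().element_type()) {
    case U8:
      return GetFirstElement<uint8>();
    case U16:
      return GetFirstElement<uint16>();
    case U32:
      return GetFirstElement<uint32>();
    case U64: {
      int64 v = GetFirstElement<uint64>();
      if (v < 0) {  // The unsigned value does not fit in a signed int64.
        return absl::nullopt;
      }
      return v;
    }
    case S8:
      return GetFirstElement<int8>();
    case S16:
      return GetFirstElement<int16>();
    case S32:
      return GetFirstElement<int32>();
    case S64:
      return GetFirstElement<int64>();
    default:
      return absl::nullopt;
  }
}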
#include "absl/strings/str_cat.h" #include "absl/strings/str_format.h" #include "absl/strings/str_join.h" +#include "absl/types/optional.h" #include "absl/types/span.h" #include "tensorflow/compiler/xla/index_util.h" #include "tensorflow/compiler/xla/primitive_util.h" @@ -198,6 +199,34 @@ Literal LiteralBase::CreateFromShape(const Shape& shape) { return literal; } +absl::optional LiteralBase::GetFirstInteger() const { + switch (shape().element_type()) { + case U8: + return GetFirstElement(); + case U16: + return GetFirstElement(); + case U32: + return GetFirstElement(); + case U64: { + int64 v = GetFirstElement(); + if (v < 0) { + return absl::nullopt; + } + return v; + } + case S8: + return GetFirstElement(); + case S16: + return GetFirstElement(); + case S32: + return GetFirstElement(); + case S64: + return GetFirstElement(); + default: + return absl::nullopt; + } +} + template Status MutableLiteralBase::CopySliceFromInternal( const LiteralBase& src_literal, absl::Span src_base, diff --git a/tensorflow/compiler/xla/literal.h b/tensorflow/compiler/xla/literal.h index 7aee34437e6..1553d042e80 100644 --- a/tensorflow/compiler/xla/literal.h +++ b/tensorflow/compiler/xla/literal.h @@ -27,6 +27,7 @@ limitations under the License. #include "absl/memory/memory.h" #include "absl/strings/string_view.h" +#include "absl/types/optional.h" #include "absl/types/span.h" #include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/array3d.h" @@ -116,6 +117,9 @@ class LiteralBase { template NativeT GetFirstElement() const; + // As above but returns any integer type casted to an int64. + absl::optional GetFirstInteger() const; + // As Get(), but determines the correct type and converts the value // into text. string GetAsString(absl::Span multi_index, diff --git a/tensorflow/compiler/xla/pjrt/BUILD b/tensorflow/compiler/xla/pjrt/BUILD new file mode 100644 index 00000000000..dbd33705d0e --- /dev/null +++ b/tensorflow/compiler/xla/pjrt/BUILD @@ -0,0 +1,213 @@ +load("//tensorflow:tensorflow.bzl", "tf_cc_test") +load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda") + +package( + default_visibility = ["//tensorflow:internal"], + licenses = ["notice"], # Apache 2.0 +) + +cc_library( + name = "worker_thread", + srcs = ["worker_thread.cc"], + hdrs = ["worker_thread.h"], + deps = [ + "//tensorflow/core:lib", + "@com_google_absl//absl/synchronization", + ], +) + +cc_library( + name = "event_pool", + srcs = ["event_pool.cc"], + hdrs = ["event_pool.h"], + deps = [ + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:types", + "//tensorflow/core:lib", + "//tensorflow/core:stream_executor", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/synchronization", + ], +) + +cc_library( + name = "semaphore", + srcs = ["semaphore.cc"], + hdrs = ["semaphore.h"], + deps = [ + "//tensorflow/compiler/xla:types", + "//tensorflow/core:lib", + "@com_google_absl//absl/base:core_headers", + "@com_google_absl//absl/synchronization", + ], +) + +tf_cc_test( + name = "semaphore_test", + srcs = ["semaphore_test.cc"], + deps = [ + ":semaphore", + "//tensorflow/compiler/xla:test", + "//tensorflow/core:lib", + "//tensorflow/core:test_main", + "@com_google_absl//absl/synchronization", + ], +) + +cc_library( + name = "tracked_device_buffer", + srcs = ["tracked_device_buffer.cc"], + hdrs = ["tracked_device_buffer.h"], + deps = [ + ":event_pool", + ":local_device_state", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:types", + 
"//tensorflow/compiler/xla/service:shaped_buffer", + "//tensorflow/compiler/xla/service:transfer_manager", + "//tensorflow/core:lib", + "//tensorflow/stream_executor:device_memory", + "//tensorflow/stream_executor:device_memory_allocator", + "//tensorflow/stream_executor:event", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/synchronization", + ], +) + +tf_cc_test( + name = "tracked_device_buffer_test", + srcs = ["tracked_device_buffer_test.cc"], + deps = [ + ":tracked_device_buffer", + "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla/client:client_library", + "//tensorflow/compiler/xla/service:cpu_plugin", + "//tensorflow/core:test_main", + "//tensorflow/stream_executor:device_memory", + "//tensorflow/stream_executor:device_memory_allocator", + ], +) + +cc_library( + name = "local_device_state", + srcs = ["local_device_state.cc"], + hdrs = ["local_device_state.h"], + deps = [ + ":event_pool", + ":semaphore", + ":worker_thread", + "//tensorflow/compiler/xla:status", + "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/core:lib", + "//tensorflow/core:stream_executor", + "//tensorflow/stream_executor:event", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/synchronization", + ], +) + +cc_library( + name = "pjrt_client", + srcs = ["pjrt_client.cc"], + hdrs = ["pjrt_client.h"], + visibility = ["//tensorflow/compiler/xla:friends"], + deps = [ + ":event_pool", + ":local_device_state", + ":tracked_device_buffer", + "//tensorflow/compiler/xla:cpu_function_runtime", + "//tensorflow/compiler/xla:executable_run_options", + "//tensorflow/compiler/xla:literal", + "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:status", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla:xla_data_proto_cc", + "//tensorflow/compiler/xla/client:executable_build_options", + "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client:xla_computation", + "//tensorflow/compiler/xla/pjrt/distributed:protocol_proto_cc", + "//tensorflow/compiler/xla/service:computation_placer", + "//tensorflow/compiler/xla/service:executable", + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:maybe_owning_device_memory", + "//tensorflow/compiler/xla/service:shaped_buffer", + "//tensorflow/compiler/xla/service/gpu:gpu_executable_run_options", + "//tensorflow/core:allocator", + "//tensorflow/core:lib", + "//tensorflow/core/profiler/lib:traceme", + "//tensorflow/stream_executor:event", + "//tensorflow/stream_executor:stream", + "//tensorflow/stream_executor/host:host_platform_id", + "//tensorflow/stream_executor/lib", + "@com_google_absl//absl/base", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/container:inlined_vector", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/synchronization", + "@com_google_absl//absl/time", + "@com_google_absl//absl/types:span", + ], +) + +cc_library( + name = "cpu_device", + srcs = ["cpu_device.cc"], + hdrs = ["cpu_device.h"], + deps = [ + ":pjrt_client", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla/client:client_library", + 
"//tensorflow/compiler/xla/service:platform_util", + "@com_google_absl//absl/strings", + ], +) + +cc_library( + name = "nvidia_gpu_device", + srcs = ["nvidia_gpu_device.cc"], + hdrs = ["nvidia_gpu_device.h"], + copts = if_cuda(["-DNCCL_ENABLED=1"]), + deps = [ + ":pjrt_client", + "//tensorflow/compiler/xla/service/gpu:gpu_executable_run_options", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla/client:client_library", + "//tensorflow/compiler/xla/pjrt/distributed:client", + "//tensorflow/compiler/xla/service:platform_util", + "//tensorflow/compiler/xla:util", + "//tensorflow/core/common_runtime:bfc_allocator", + "//tensorflow/core/common_runtime/gpu:gpu_mem_allocator", + "//tensorflow/stream_executor:tf_allocator_adapter", + ] + if_cuda(["@local_config_nccl//:nccl"]), +) + +tf_cc_test( + name = "gpu_multistream_test", + srcs = ["gpu_multistream_test.cc"], + tags = [ + # TODO(phawkins): figure out TF test infra such that this only runs under GPU. + "no_oss", + "requires-gpu-nvidia", + ], + deps = [ + ":nvidia_gpu_device", + ":pjrt_client", + "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla/client:executable_build_options", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/service:gpu_plugin", + "//tensorflow/compiler/xla/tests:literal_test_util", + "//tensorflow/core:lib", + "//tensorflow/core:test_main", + "//tensorflow/core/platform:random", + ], +) diff --git a/tensorflow/compiler/xla/python/cpu_device.cc b/tensorflow/compiler/xla/pjrt/cpu_device.cc similarity index 76% rename from tensorflow/compiler/xla/python/cpu_device.cc rename to tensorflow/compiler/xla/pjrt/cpu_device.cc index 404d9ca133d..75c3bfc1277 100644 --- a/tensorflow/compiler/xla/python/cpu_device.cc +++ b/tensorflow/compiler/xla/pjrt/cpu_device.cc @@ -13,8 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/compiler/xla/python/cpu_device.h" +#include "tensorflow/compiler/xla/pjrt/cpu_device.h" +#include "absl/strings/str_cat.h" #include "tensorflow/compiler/xla/client/client_library.h" #include "tensorflow/compiler/xla/service/platform_util.h" @@ -24,9 +25,10 @@ static const char kCpuPlatformName[] = "cpu"; CpuDevice::CpuDevice(int id, std::unique_ptr local_device_state) - : Device(id, std::move(local_device_state), kCpuPlatformName) {} + : Device(id, std::move(local_device_state), kCpuPlatformName, + /*device_kind=*/kCpuPlatformName) {} -StatusOr> GetCpuClient(bool asynchronous) { +StatusOr> GetCpuClient(bool asynchronous) { TF_ASSIGN_OR_RETURN(se::Platform * platform, PlatformUtil::GetPlatform("Host")); if (platform->VisibleDeviceCount() <= 0) { @@ -39,8 +41,14 @@ StatusOr> GetCpuClient(bool asynchronous) { std::vector> devices; for (int i = 0; i < client->device_count(); ++i) { - se::StreamExecutor* executor = - client->backend().stream_executor(i).ValueOrDie(); + se::StreamExecutorConfig config; + config.ordinal = i; + // 8MiB stacks seem to be necessary for running LAPACK/OpenBLAS + // computations. 
+ config.device_options.non_portable_tags["host_thread_stack_size_in_bytes"] = + absl::StrCat(8192 * 1024); + TF_ASSIGN_OR_RETURN(se::StreamExecutor * executor, + platform->GetExecutor(config)); auto device_state = absl::make_unique( executor, client, LocalDeviceState::kSynchronous, asynchronous, /*allow_event_reuse=*/false); @@ -48,7 +56,7 @@ StatusOr> GetCpuClient(bool asynchronous) { devices.push_back(std::move(device)); } - return std::make_shared( + return std::make_shared( kCpuPlatformName, client, std::move(devices), /*host_id=*/0, /*allocator=*/nullptr, /*host_memory_allocator=*/nullptr, /*gpu_run_options=*/nullptr); diff --git a/tensorflow/compiler/xla/python/cpu_device.h b/tensorflow/compiler/xla/pjrt/cpu_device.h similarity index 75% rename from tensorflow/compiler/xla/python/cpu_device.h rename to tensorflow/compiler/xla/pjrt/cpu_device.h index 1039cb5d1c6..c70d90ae228 100644 --- a/tensorflow/compiler/xla/python/cpu_device.h +++ b/tensorflow/compiler/xla/pjrt/cpu_device.h @@ -13,12 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TENSORFLOW_COMPILER_XLA_PYTHON_CPU_DEVICE_H_ -#define TENSORFLOW_COMPILER_XLA_PYTHON_CPU_DEVICE_H_ +#ifndef TENSORFLOW_COMPILER_XLA_PJRT_CPU_DEVICE_H_ +#define TENSORFLOW_COMPILER_XLA_PJRT_CPU_DEVICE_H_ #include -#include "tensorflow/compiler/xla/python/local_client.h" +#include "tensorflow/compiler/xla/pjrt/pjrt_client.h" #include "tensorflow/compiler/xla/statusor.h" namespace xla { @@ -28,8 +28,8 @@ class CpuDevice : public Device { CpuDevice(int id, std::unique_ptr local_device_state); }; -StatusOr> GetCpuClient(bool asynchronous); +StatusOr> GetCpuClient(bool asynchronous); } // namespace xla -#endif // TENSORFLOW_COMPILER_XLA_PYTHON_CPU_DEVICE_H_ +#endif // TENSORFLOW_COMPILER_XLA_PJRT_CPU_DEVICE_H_ diff --git a/tensorflow/compiler/xla/python/distributed/BUILD b/tensorflow/compiler/xla/pjrt/distributed/BUILD similarity index 100% rename from tensorflow/compiler/xla/python/distributed/BUILD rename to tensorflow/compiler/xla/pjrt/distributed/BUILD diff --git a/tensorflow/compiler/xla/python/distributed/client.cc b/tensorflow/compiler/xla/pjrt/distributed/client.cc similarity index 94% rename from tensorflow/compiler/xla/python/distributed/client.cc rename to tensorflow/compiler/xla/pjrt/distributed/client.cc index c50c3f50a9d..830e512b156 100644 --- a/tensorflow/compiler/xla/python/distributed/client.cc +++ b/tensorflow/compiler/xla/pjrt/distributed/client.cc @@ -13,12 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. 
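For orientation, a minimal usage sketch of the relocated CPU client factory from cpu_device.h above; only GetCpuClient itself is declared by this patch, and the wrapper function here is illustrative.

#include <memory>

#include "tensorflow/compiler/xla/pjrt/cpu_device.h"
#include "tensorflow/compiler/xla/pjrt/pjrt_client.h"

xla::StatusOr<std::shared_ptr<xla::PjRtClient>> MakeCpuClient() {
  // asynchronous=true selects the asynchronous dispatch path; the CPU backend
  // itself is constructed with the synchronous allocation model shown above.
  return xla::GetCpuClient(/*asynchronous=*/true);
}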
==============================================================================*/ -#include "tensorflow/compiler/xla/python/distributed/client.h" +#include "tensorflow/compiler/xla/pjrt/distributed/client.h" #include // NOLINT -#include "tensorflow/compiler/xla/python/distributed/protocol.h" -#include "tensorflow/compiler/xla/python/distributed/util.h" +#include "tensorflow/compiler/xla/pjrt/distributed/protocol.h" +#include "tensorflow/compiler/xla/pjrt/distributed/util.h" namespace xla { diff --git a/tensorflow/compiler/xla/python/distributed/client.h b/tensorflow/compiler/xla/pjrt/distributed/client.h similarity index 85% rename from tensorflow/compiler/xla/python/distributed/client.h rename to tensorflow/compiler/xla/pjrt/distributed/client.h index 1ab5292bea8..865a752849e 100644 --- a/tensorflow/compiler/xla/python/distributed/client.h +++ b/tensorflow/compiler/xla/pjrt/distributed/client.h @@ -13,15 +13,15 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TENSORFLOW_COMPILER_XLA_PYTHON_DISTRIBUTED_CLIENT_H_ -#define TENSORFLOW_COMPILER_XLA_PYTHON_DISTRIBUTED_CLIENT_H_ +#ifndef TENSORFLOW_COMPILER_XLA_PJRT_DISTRIBUTED_CLIENT_H_ +#define TENSORFLOW_COMPILER_XLA_PJRT_DISTRIBUTED_CLIENT_H_ #include #include "grpcpp/channel.h" #include "absl/synchronization/mutex.h" #include "absl/time/time.h" -#include "tensorflow/compiler/xla/python/distributed/protocol.grpc.pb.h" +#include "tensorflow/compiler/xla/pjrt/distributed/protocol.grpc.pb.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/core/platform/env.h" @@ -47,4 +47,4 @@ class DistributedRuntimeClient { } // namespace xla -#endif // TENSORFLOW_COMPILER_XLA_PYTHON_DISTRIBUTED_CLIENT_H_ +#endif // TENSORFLOW_COMPILER_XLA_PJRT_DISTRIBUTED_CLIENT_H_ diff --git a/tensorflow/compiler/xla/python/distributed/client_server_test.cc b/tensorflow/compiler/xla/pjrt/distributed/client_server_test.cc similarity index 95% rename from tensorflow/compiler/xla/python/distributed/client_server_test.cc rename to tensorflow/compiler/xla/pjrt/distributed/client_server_test.cc index e78949933a2..cfe60a06207 100644 --- a/tensorflow/compiler/xla/python/distributed/client_server_test.cc +++ b/tensorflow/compiler/xla/pjrt/distributed/client_server_test.cc @@ -15,10 +15,10 @@ limitations under the License. 
#include "grpcpp/security/server_credentials.h" #include "absl/time/time.h" +#include "tensorflow/compiler/xla/pjrt/distributed/client.h" +#include "tensorflow/compiler/xla/pjrt/distributed/protocol.pb.h" +#include "tensorflow/compiler/xla/pjrt/distributed/service.h" #include "tensorflow/compiler/xla/protobuf_util.h" -#include "tensorflow/compiler/xla/python/distributed/client.h" -#include "tensorflow/compiler/xla/python/distributed/protocol.pb.h" -#include "tensorflow/compiler/xla/python/distributed/service.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/platform/test.h" diff --git a/tensorflow/compiler/xla/python/distributed/distributed.cc b/tensorflow/compiler/xla/pjrt/distributed/distributed.cc similarity index 95% rename from tensorflow/compiler/xla/python/distributed/distributed.cc rename to tensorflow/compiler/xla/pjrt/distributed/distributed.cc index 6afc7b1c4e9..7753e2dcfc7 100644 --- a/tensorflow/compiler/xla/python/distributed/distributed.cc +++ b/tensorflow/compiler/xla/pjrt/distributed/distributed.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/compiler/xla/python/distributed/distributed.h" +#include "tensorflow/compiler/xla/pjrt/distributed/distributed.h" #include "grpcpp/grpcpp.h" diff --git a/tensorflow/compiler/xla/python/distributed/distributed.h b/tensorflow/compiler/xla/pjrt/distributed/distributed.h similarity index 83% rename from tensorflow/compiler/xla/python/distributed/distributed.h rename to tensorflow/compiler/xla/pjrt/distributed/distributed.h index 0475c3e9feb..b3909387259 100644 --- a/tensorflow/compiler/xla/python/distributed/distributed.h +++ b/tensorflow/compiler/xla/pjrt/distributed/distributed.h @@ -13,14 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TENSORFLOW_COMPILER_XLA_PYTHON_DISTRIBUTED_DISTRIBUTED_H_ -#define TENSORFLOW_COMPILER_XLA_PYTHON_DISTRIBUTED_DISTRIBUTED_H_ +#ifndef TENSORFLOW_COMPILER_XLA_PJRT_DISTRIBUTED_DISTRIBUTED_H_ +#define TENSORFLOW_COMPILER_XLA_PJRT_DISTRIBUTED_DISTRIBUTED_H_ #include #include -#include "tensorflow/compiler/xla/python/distributed/client.h" -#include "tensorflow/compiler/xla/python/distributed/service.h" +#include "tensorflow/compiler/xla/pjrt/distributed/client.h" +#include "tensorflow/compiler/xla/pjrt/distributed/service.h" #include "tensorflow/compiler/xla/statusor.h" namespace xla { @@ -43,4 +43,4 @@ std::shared_ptr GetDistributedRuntimeClient( } // namespace xla -#endif // TENSORFLOW_COMPILER_XLA_PYTHON_DISTRIBUTED_DISTRIBUTED_H_ +#endif // TENSORFLOW_COMPILER_XLA_PJRT_DISTRIBUTED_DISTRIBUTED_H_ diff --git a/tensorflow/compiler/xla/python/distributed/key_value_store.cc b/tensorflow/compiler/xla/pjrt/distributed/key_value_store.cc similarity index 95% rename from tensorflow/compiler/xla/python/distributed/key_value_store.cc rename to tensorflow/compiler/xla/pjrt/distributed/key_value_store.cc index 5966d4ce12b..e989b1384d2 100644 --- a/tensorflow/compiler/xla/python/distributed/key_value_store.cc +++ b/tensorflow/compiler/xla/pjrt/distributed/key_value_store.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include "tensorflow/compiler/xla/python/distributed/key_value_store.h" +#include "tensorflow/compiler/xla/pjrt/distributed/key_value_store.h" namespace xla { diff --git a/tensorflow/compiler/xla/python/distributed/key_value_store.h b/tensorflow/compiler/xla/pjrt/distributed/key_value_store.h similarity index 89% rename from tensorflow/compiler/xla/python/distributed/key_value_store.h rename to tensorflow/compiler/xla/pjrt/distributed/key_value_store.h index 8560305e6f6..d496de1feb5 100644 --- a/tensorflow/compiler/xla/python/distributed/key_value_store.h +++ b/tensorflow/compiler/xla/pjrt/distributed/key_value_store.h @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TENSORFLOW_COMPILER_XLA_PYTHON_DISTRIBUTED_KEY_VALUE_STORE_H_ -#define TENSORFLOW_COMPILER_XLA_PYTHON_DISTRIBUTED_KEY_VALUE_STORE_H_ +#ifndef TENSORFLOW_COMPILER_XLA_PJRT_DISTRIBUTED_KEY_VALUE_STORE_H_ +#define TENSORFLOW_COMPILER_XLA_PJRT_DISTRIBUTED_KEY_VALUE_STORE_H_ #include "grpcpp/grpcpp.h" #include "absl/base/thread_annotations.h" @@ -50,4 +50,4 @@ class KeyValueStore { } // namespace xla -#endif // TENSORFLOW_COMPILER_XLA_PYTHON_DISTRIBUTED_KEY_VALUE_STORE_H_ +#endif // TENSORFLOW_COMPILER_XLA_PJRT_DISTRIBUTED_KEY_VALUE_STORE_H_ diff --git a/tensorflow/compiler/xla/python/distributed/protocol.h b/tensorflow/compiler/xla/pjrt/distributed/protocol.h similarity index 80% rename from tensorflow/compiler/xla/python/distributed/protocol.h rename to tensorflow/compiler/xla/pjrt/distributed/protocol.h index 208c6dab8c5..4daa939ac8d 100644 --- a/tensorflow/compiler/xla/python/distributed/protocol.h +++ b/tensorflow/compiler/xla/pjrt/distributed/protocol.h @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TENSORFLOW_COMPILER_XLA_PYTHON_DISTRIBUTED_PROTOCOL_H_ -#define TENSORFLOW_COMPILER_XLA_PYTHON_DISTRIBUTED_PROTOCOL_H_ +#ifndef TENSORFLOW_COMPILER_XLA_PJRT_DISTRIBUTED_PROTOCOL_H_ +#define TENSORFLOW_COMPILER_XLA_PJRT_DISTRIBUTED_PROTOCOL_H_ namespace xla { @@ -22,4 +22,4 @@ static constexpr int kDistributedRuntimeProtocolVersion = 1; } // namespace xla -#endif // TENSORFLOW_COMPILER_XLA_PYTHON_DISTRIBUTED_PROTOCOL_H_ +#endif // TENSORFLOW_COMPILER_XLA_PJRT_DISTRIBUTED_PROTOCOL_H_ diff --git a/tensorflow/compiler/xla/python/distributed/protocol.proto b/tensorflow/compiler/xla/pjrt/distributed/protocol.proto similarity index 100% rename from tensorflow/compiler/xla/python/distributed/protocol.proto rename to tensorflow/compiler/xla/pjrt/distributed/protocol.proto diff --git a/tensorflow/compiler/xla/python/distributed/service.cc b/tensorflow/compiler/xla/pjrt/distributed/service.cc similarity index 96% rename from tensorflow/compiler/xla/python/distributed/service.cc rename to tensorflow/compiler/xla/pjrt/distributed/service.cc index cc2b3a5aca2..3325fcd8319 100644 --- a/tensorflow/compiler/xla/python/distributed/service.cc +++ b/tensorflow/compiler/xla/pjrt/distributed/service.cc @@ -13,10 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include "tensorflow/compiler/xla/python/distributed/service.h" +#include "tensorflow/compiler/xla/pjrt/distributed/service.h" -#include "tensorflow/compiler/xla/python/distributed/protocol.h" -#include "tensorflow/compiler/xla/python/distributed/util.h" +#include "tensorflow/compiler/xla/pjrt/distributed/protocol.h" +#include "tensorflow/compiler/xla/pjrt/distributed/util.h" #include "tensorflow/compiler/xla/status.h" #include "tensorflow/compiler/xla/util.h" diff --git a/tensorflow/compiler/xla/python/distributed/service.h b/tensorflow/compiler/xla/pjrt/distributed/service.h similarity index 91% rename from tensorflow/compiler/xla/python/distributed/service.h rename to tensorflow/compiler/xla/pjrt/distributed/service.h index baf470e4f13..725a76791ce 100644 --- a/tensorflow/compiler/xla/python/distributed/service.h +++ b/tensorflow/compiler/xla/pjrt/distributed/service.h @@ -13,12 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TENSORFLOW_COMPILER_XLA_PYTHON_DISTRIBUTED_SERVICE_H_ -#define TENSORFLOW_COMPILER_XLA_PYTHON_DISTRIBUTED_SERVICE_H_ +#ifndef TENSORFLOW_COMPILER_XLA_PJRT_DISTRIBUTED_SERVICE_H_ +#define TENSORFLOW_COMPILER_XLA_PJRT_DISTRIBUTED_SERVICE_H_ #include "absl/time/time.h" -#include "tensorflow/compiler/xla/python/distributed/key_value_store.h" -#include "tensorflow/compiler/xla/python/distributed/protocol.grpc.pb.h" +#include "tensorflow/compiler/xla/pjrt/distributed/key_value_store.h" +#include "tensorflow/compiler/xla/pjrt/distributed/protocol.grpc.pb.h" #include "tensorflow/compiler/xla/statusor.h" namespace xla { @@ -98,4 +98,4 @@ void BuildGlobalTopology(absl::Span local_topologies, } // namespace xla -#endif // TENSORFLOW_COMPILER_XLA_PYTHON_DISTRIBUTED_SERVICE_H_ +#endif // TENSORFLOW_COMPILER_XLA_PJRT_DISTRIBUTED_SERVICE_H_ diff --git a/tensorflow/compiler/xla/python/distributed/service_test.cc b/tensorflow/compiler/xla/pjrt/distributed/service_test.cc similarity index 91% rename from tensorflow/compiler/xla/python/distributed/service_test.cc rename to tensorflow/compiler/xla/pjrt/distributed/service_test.cc index 08326df2f38..b56dbb17d1a 100644 --- a/tensorflow/compiler/xla/python/distributed/service_test.cc +++ b/tensorflow/compiler/xla/pjrt/distributed/service_test.cc @@ -13,9 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/compiler/xla/python/distributed/service.h" +#include "tensorflow/compiler/xla/pjrt/distributed/service.h" -#include "tensorflow/compiler/xla/python/distributed/protocol.pb.h" +#include "tensorflow/compiler/xla/pjrt/distributed/protocol.pb.h" #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/platform/test.h" diff --git a/tensorflow/compiler/xla/python/distributed/util.h b/tensorflow/compiler/xla/pjrt/distributed/util.h similarity index 87% rename from tensorflow/compiler/xla/python/distributed/util.h rename to tensorflow/compiler/xla/pjrt/distributed/util.h index 07ae8d1f0ce..abb2b6089e7 100644 --- a/tensorflow/compiler/xla/python/distributed/util.h +++ b/tensorflow/compiler/xla/pjrt/distributed/util.h @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#ifndef TENSORFLOW_COMPILER_XLA_PYTHON_DISTRIBUTED_UTIL_H_ -#define TENSORFLOW_COMPILER_XLA_PYTHON_DISTRIBUTED_UTIL_H_ +#ifndef TENSORFLOW_COMPILER_XLA_PJRT_DISTRIBUTED_UTIL_H_ +#define TENSORFLOW_COMPILER_XLA_PJRT_DISTRIBUTED_UTIL_H_ #include "grpcpp/support/status.h" #include "tensorflow/compiler/xla/status.h" @@ -41,4 +41,4 @@ inline ::grpc::Status ToGrpcStatus(const Status& s) { } // namespace xla -#endif // TENSORFLOW_COMPILER_XLA_PYTHON_DISTRIBUTED_UTIL_H_ +#endif // TENSORFLOW_COMPILER_XLA_PJRT_DISTRIBUTED_UTIL_H_ diff --git a/tensorflow/compiler/xla/python/event_pool.cc b/tensorflow/compiler/xla/pjrt/event_pool.cc similarity index 96% rename from tensorflow/compiler/xla/python/event_pool.cc rename to tensorflow/compiler/xla/pjrt/event_pool.cc index c7b52f523d9..86aa38cdd0f 100644 --- a/tensorflow/compiler/xla/python/event_pool.cc +++ b/tensorflow/compiler/xla/pjrt/event_pool.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/compiler/xla/python/event_pool.h" +#include "tensorflow/compiler/xla/pjrt/event_pool.h" #include "absl/memory/memory.h" #include "absl/synchronization/mutex.h" diff --git a/tensorflow/compiler/xla/python/event_pool.h b/tensorflow/compiler/xla/pjrt/event_pool.h similarity index 95% rename from tensorflow/compiler/xla/python/event_pool.h rename to tensorflow/compiler/xla/pjrt/event_pool.h index bda3fb6baff..47768c28fd9 100644 --- a/tensorflow/compiler/xla/python/event_pool.h +++ b/tensorflow/compiler/xla/pjrt/event_pool.h @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TENSORFLOW_COMPILER_XLA_PYTHON_EVENT_POOL_H_ -#define TENSORFLOW_COMPILER_XLA_PYTHON_EVENT_POOL_H_ +#ifndef TENSORFLOW_COMPILER_XLA_PJRT_EVENT_POOL_H_ +#define TENSORFLOW_COMPILER_XLA_PJRT_EVENT_POOL_H_ #include #include @@ -87,4 +87,4 @@ class EventPool { } // namespace xla -#endif // TENSORFLOW_COMPILER_XLA_PYTHON_EVENT_POOL_H_ +#endif // TENSORFLOW_COMPILER_XLA_PJRT_EVENT_POOL_H_ diff --git a/tensorflow/compiler/xla/python/gpu_multistream_test.cc b/tensorflow/compiler/xla/pjrt/gpu_multistream_test.cc similarity index 81% rename from tensorflow/compiler/xla/python/gpu_multistream_test.cc rename to tensorflow/compiler/xla/pjrt/gpu_multistream_test.cc index a633e4dd020..2db7de3720d 100644 --- a/tensorflow/compiler/xla/python/gpu_multistream_test.cc +++ b/tensorflow/compiler/xla/pjrt/gpu_multistream_test.cc @@ -15,8 +15,8 @@ limitations under the License. #include "tensorflow/compiler/xla/client/executable_build_options.h" #include "tensorflow/compiler/xla/client/xla_builder.h" -#include "tensorflow/compiler/xla/python/local_client.h" -#include "tensorflow/compiler/xla/python/nvidia_gpu_device.h" +#include "tensorflow/compiler/xla/pjrt/nvidia_gpu_device.h" +#include "tensorflow/compiler/xla/pjrt/pjrt_client.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/core/platform/random.h" @@ -28,7 +28,7 @@ namespace { // computation wait for the inputs to be produced before executing. 
TEST(GpuMultiStream, Basics) { TF_ASSERT_OK_AND_ASSIGN( - std::shared_ptr client, + std::shared_ptr client, GetNvidiaGpuClient(/*asynchronous=*/true, GpuAllocatorConfig(), /*distributed_client=*/nullptr, /*node_id=*/0)); @@ -54,10 +54,9 @@ TEST(GpuMultiStream, Basics) { device_assignment(0, 0) = device->id(); compile_options.executable_build_options.set_device_assignment( device_assignment); - TF_ASSERT_OK_AND_ASSIGN( - std::unique_ptr executable, - PyLocalExecutable::Compile(computation, client.get(), - std::move(compile_options))); + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr executable, + PjRtExecutable::Compile(computation, client.get(), + std::move(compile_options))); int64 dummy_size = 1 << 20; std::vector dummy_inputs(dummy_size); @@ -72,19 +71,19 @@ TEST(GpuMultiStream, Basics) { // must wait. TF_ASSERT_OK_AND_ASSIGN( auto dummy_buffer, - PyLocalBuffer::FromHostBuffer( + PjRtBuffer::FromHostBuffer( dummy_inputs.data(), dummy_shape, /*force_copy=*/false, /*buffer_reference=*/nullptr, client.get(), device)); TF_ASSERT_OK_AND_ASSIGN( auto in_buffer0, - PyLocalBuffer::FromHostBuffer( - inputs.data(), shape, /*force_copy=*/false, - /*buffer_reference=*/nullptr, client.get(), device)); + PjRtBuffer::FromHostBuffer(inputs.data(), shape, /*force_copy=*/false, + /*buffer_reference=*/nullptr, client.get(), + device)); TF_ASSERT_OK_AND_ASSIGN( auto in_buffer1, - PyLocalBuffer::FromHostBuffer( - inputs.data(), shape, /*force_copy=*/false, - /*buffer_reference=*/nullptr, client.get(), device)); + PjRtBuffer::FromHostBuffer(inputs.data(), shape, /*force_copy=*/false, + /*buffer_reference=*/nullptr, client.get(), + device)); // The execution may be enqueued before the transfers complete, requiring // adequate device-side synchronization. ExecuteOptions options; diff --git a/tensorflow/compiler/xla/python/local_device_state.cc b/tensorflow/compiler/xla/pjrt/local_device_state.cc similarity index 98% rename from tensorflow/compiler/xla/python/local_device_state.cc rename to tensorflow/compiler/xla/pjrt/local_device_state.cc index 6a96908cb12..d173c891c95 100644 --- a/tensorflow/compiler/xla/python/local_device_state.cc +++ b/tensorflow/compiler/xla/pjrt/local_device_state.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/compiler/xla/python/local_device_state.h" +#include "tensorflow/compiler/xla/pjrt/local_device_state.h" #include #include diff --git a/tensorflow/compiler/xla/python/local_device_state.h b/tensorflow/compiler/xla/pjrt/local_device_state.h similarity index 96% rename from tensorflow/compiler/xla/python/local_device_state.h rename to tensorflow/compiler/xla/pjrt/local_device_state.h index 5cd2c0014a0..eb25c37878f 100644 --- a/tensorflow/compiler/xla/python/local_device_state.h +++ b/tensorflow/compiler/xla/pjrt/local_device_state.h @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TENSORFLOW_COMPILER_XLA_PYTHON_LOCAL_DEVICE_STATE_H_ -#define TENSORFLOW_COMPILER_XLA_PYTHON_LOCAL_DEVICE_STATE_H_ +#ifndef TENSORFLOW_COMPILER_XLA_PJRT_LOCAL_DEVICE_STATE_H_ +#define TENSORFLOW_COMPILER_XLA_PJRT_LOCAL_DEVICE_STATE_H_ #include #include @@ -22,9 +22,9 @@ limitations under the License. 
#include "absl/synchronization/mutex.h" #include "tensorflow/compiler/xla/client/local_client.h" -#include "tensorflow/compiler/xla/python/event_pool.h" -#include "tensorflow/compiler/xla/python/semaphore.h" -#include "tensorflow/compiler/xla/python/worker_thread.h" +#include "tensorflow/compiler/xla/pjrt/event_pool.h" +#include "tensorflow/compiler/xla/pjrt/semaphore.h" +#include "tensorflow/compiler/xla/pjrt/worker_thread.h" #include "tensorflow/compiler/xla/status.h" #include "tensorflow/core/platform/stream_executor.h" @@ -207,4 +207,4 @@ class LocalDeviceState { } // namespace xla -#endif // TENSORFLOW_COMPILER_XLA_PYTHON_LOCAL_DEVICE_STATE_H_ +#endif // TENSORFLOW_COMPILER_XLA_PJRT_LOCAL_DEVICE_STATE_H_ diff --git a/tensorflow/compiler/xla/python/nvidia_gpu_device.cc b/tensorflow/compiler/xla/pjrt/nvidia_gpu_device.cc similarity index 93% rename from tensorflow/compiler/xla/python/nvidia_gpu_device.cc rename to tensorflow/compiler/xla/pjrt/nvidia_gpu_device.cc index 572b18a0abd..4863e5e8165 100644 --- a/tensorflow/compiler/xla/python/nvidia_gpu_device.cc +++ b/tensorflow/compiler/xla/pjrt/nvidia_gpu_device.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/compiler/xla/python/nvidia_gpu_device.h" +#include "tensorflow/compiler/xla/pjrt/nvidia_gpu_device.h" #ifdef NCCL_ENABLED #include "third_party/nccl/nccl.h" @@ -31,10 +31,10 @@ namespace { static const char kGpuPlatformName[] = "gpu"; -// A custom PyLocalClient that overrides the device assignment method. -class GpuClient : public xla::PyLocalClient { +// A custom PjRtClient that overrides the device assignment method. +class GpuClient : public xla::PjRtClient { public: - using xla::PyLocalClient::PyLocalClient; + using xla::PjRtClient::PjRtClient; xla::StatusOr GetDefaultDeviceAssignment( int num_replicas, int num_partitions) const override; @@ -52,8 +52,7 @@ xla::StatusOr GpuClient::GetDefaultDeviceAssignment( return assignment; } // Fallback to default global device assignment if we can't run locally. - return PyLocalClient::GetDefaultDeviceAssignment(num_replicas, - num_partitions); + return PjRtClient::GetDefaultDeviceAssignment(num_replicas, num_partitions); } // Builds an xla::LocalClient for the GPU platform. 
@@ -213,8 +212,11 @@ std::vector> BuildLocalDevices( std::vector> devices; for (auto& local_device : local_device_states) { int device_ordinal = local_device->device_ordinal(); + const se::DeviceDescription& description = + local_device->executor()->GetDeviceDescription(); auto device = absl::make_unique( - device_ordinal, std::move(local_device), /*node_id=*/0); + device_ordinal, std::move(local_device), description.name(), + /*node_id=*/0); devices.push_back(std::move(device)); } return devices; @@ -259,9 +261,9 @@ Status BuildDistributedDevices( gpu_device_ids[device_proto.local_device_ordinal()] = GlobalDeviceId(device_proto.global_device_id()); } - auto device = - absl::make_unique(device_proto.global_device_id(), - std::move(local_device), node.node_id()); + auto device = absl::make_unique( + device_proto.global_device_id(), std::move(local_device), + device_proto.name(), node.node_id()); devices->push_back(std::move(device)); } } @@ -283,10 +285,11 @@ Status BuildDistributedDevices( GpuDevice::GpuDevice(int id, std::unique_ptr local_device_state, - int node_id) - : Device(id, std::move(local_device_state), kGpuPlatformName, node_id) {} + std::string device_kind, int node_id) + : Device(id, std::move(local_device_state), kGpuPlatformName, + std::move(device_kind), node_id) {} -StatusOr> GetNvidiaGpuClient( +StatusOr> GetNvidiaGpuClient( bool asynchronous, const GpuAllocatorConfig& allocator_config, std::shared_ptr distributed_client, int node_id) { TF_ASSIGN_OR_RETURN(LocalClient * xla_client, GetGpuXlaClient()); @@ -309,7 +312,7 @@ StatusOr> GetNvidiaGpuClient( devices = BuildLocalDevices(std::move(local_device_states)); } - std::shared_ptr pyclient = std::make_shared( + std::shared_ptr pyclient = std::make_shared( "gpu", xla_client, std::move(devices), /*node_id=*/node_id, std::move(allocator), std::move(host_memory_allocator), diff --git a/tensorflow/compiler/xla/python/nvidia_gpu_device.h b/tensorflow/compiler/xla/pjrt/nvidia_gpu_device.h similarity index 83% rename from tensorflow/compiler/xla/python/nvidia_gpu_device.h rename to tensorflow/compiler/xla/pjrt/nvidia_gpu_device.h index 333a82a2d78..bf59ddef3a9 100644 --- a/tensorflow/compiler/xla/python/nvidia_gpu_device.h +++ b/tensorflow/compiler/xla/pjrt/nvidia_gpu_device.h @@ -13,13 +13,13 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TENSORFLOW_COMPILER_XLA_PYTHON_NVIDIA_GPU_DEVICE_H_ -#define TENSORFLOW_COMPILER_XLA_PYTHON_NVIDIA_GPU_DEVICE_H_ +#ifndef TENSORFLOW_COMPILER_XLA_PJRT_NVIDIA_GPU_DEVICE_H_ +#define TENSORFLOW_COMPILER_XLA_PJRT_NVIDIA_GPU_DEVICE_H_ #include -#include "tensorflow/compiler/xla/python/distributed/client.h" -#include "tensorflow/compiler/xla/python/local_client.h" +#include "tensorflow/compiler/xla/pjrt/distributed/client.h" +#include "tensorflow/compiler/xla/pjrt/pjrt_client.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/core/common_runtime/bfc_allocator.h" @@ -28,7 +28,7 @@ namespace xla { class GpuDevice : public Device { public: GpuDevice(int id, std::unique_ptr local_device_state, - int node_id); + std::string device_kind, int node_id); }; struct GpuAllocatorConfig { @@ -53,10 +53,10 @@ struct GpuAllocatorConfig { // distributed_client may be nullptr in non-distributed settings. // distributed_client should not be Open()ed before calling this function. 
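Likewise, a usage sketch for the GPU client factory declared immediately below; the argument values mirror those used by gpu_multistream_test.cc earlier in this patch, and the wrapper function is illustrative.

#include <memory>

#include "tensorflow/compiler/xla/pjrt/nvidia_gpu_device.h"

xla::StatusOr<std::shared_ptr<xla::PjRtClient>> MakeGpuClient() {
  // Default allocator configuration; GpuAllocatorConfig's fields are not shown
  // in this patch, so none are overridden here.
  xla::GpuAllocatorConfig allocator_config;
  // Single-process use: no distributed client, node 0.
  return xla::GetNvidiaGpuClient(/*asynchronous=*/true, allocator_config,
                                 /*distributed_client=*/nullptr,
                                 /*node_id=*/0);
}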
-StatusOr> GetNvidiaGpuClient( +StatusOr> GetNvidiaGpuClient( bool asynchronous, const GpuAllocatorConfig& allocator_config, std::shared_ptr distributed_client, int node_id); } // namespace xla -#endif // TENSORFLOW_COMPILER_XLA_PYTHON_NVIDIA_GPU_DEVICE_H_ +#endif // TENSORFLOW_COMPILER_XLA_PJRT_NVIDIA_GPU_DEVICE_H_ diff --git a/tensorflow/compiler/xla/python/local_client.cc b/tensorflow/compiler/xla/pjrt/pjrt_client.cc similarity index 78% rename from tensorflow/compiler/xla/python/local_client.cc rename to tensorflow/compiler/xla/pjrt/pjrt_client.cc index 68165c220f8..80fd0e0b658 100644 --- a/tensorflow/compiler/xla/python/local_client.cc +++ b/tensorflow/compiler/xla/pjrt/pjrt_client.cc @@ -52,7 +52,7 @@ limitations under the License. // host-to-device transfers, device-to-host transfers, and compute. This allows // us to overlap transfers on and off the device with computation. // -// Synchronization between streams occurs via BufferDefinitionEvents that +// Synchronization between streams occurs via BufferSequencingEvents that // describe when the contents of a logical buffer are known to be valid on // a particular stream, and when a buffer's uses have all completed. // @@ -62,7 +62,7 @@ limitations under the License. // See the comment on LocalDeviceState::AllocationModel for a discussion of the // different allocation semantics on CPU, GPU, and TPU. -#include "tensorflow/compiler/xla/python/local_client.h" +#include "tensorflow/compiler/xla/pjrt/pjrt_client.h" #include #include @@ -79,13 +79,14 @@ limitations under the License. #include "absl/types/span.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/client/xla_computation.h" +#include "tensorflow/compiler/xla/cpu_function_runtime.h" #include "tensorflow/compiler/xla/executable_run_options.h" #include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/literal_util.h" -#include "tensorflow/compiler/xla/python/distributed/protocol.pb.h" -#include "tensorflow/compiler/xla/python/event_pool.h" -#include "tensorflow/compiler/xla/python/local_device_state.h" -#include "tensorflow/compiler/xla/python/shared_device_buffer.h" +#include "tensorflow/compiler/xla/pjrt/distributed/protocol.pb.h" +#include "tensorflow/compiler/xla/pjrt/event_pool.h" +#include "tensorflow/compiler/xla/pjrt/local_device_state.h" +#include "tensorflow/compiler/xla/pjrt/tracked_device_buffer.h" #include "tensorflow/compiler/xla/service/executable.h" #include "tensorflow/compiler/xla/service/hlo_input_output_alias_config.h" #include "tensorflow/compiler/xla/service/maybe_owning_device_memory.h" @@ -100,6 +101,7 @@ limitations under the License. 
#include "tensorflow/stream_executor/device_memory.h" #include "tensorflow/stream_executor/device_memory_allocator.h" #include "tensorflow/stream_executor/event.h" +#include "tensorflow/stream_executor/host/host_platform_id.h" #include "tensorflow/stream_executor/lib/statusor.h" #include "tensorflow/stream_executor/stream.h" @@ -152,7 +154,7 @@ StatusOr DevicesToDeviceAssignment( return xla_assignment; } -PyLocalClient::PyLocalClient( +PjRtClient::PjRtClient( std::string platform_name, LocalClient* client, std::vector> devices, int host_id, std::unique_ptr allocator, @@ -191,15 +193,14 @@ PyLocalClient::PyLocalClient( } } -StatusOr PyLocalClient::GetDefaultDeviceAssignment( +StatusOr PjRtClient::GetDefaultDeviceAssignment( int num_replicas, int num_partitions) const { return client_->backend().computation_placer()->AssignDevices(num_replicas, num_partitions); } -StatusOr> -PyLocalClient::GetParametersThatMustBeDonated(const LocalExecutable& executable, - bool tuple_inputs) const { +StatusOr> PjRtClient::GetParametersThatMustBeDonated( + const LocalExecutable& executable, bool tuple_inputs) const { // TODO(b/149489114) support buffer donation on CPU/GPU when XLA supports it. const HloInputOutputAliasConfig& config = executable.executable()->module().input_output_alias_config(); @@ -274,10 +275,10 @@ void StallStreamOnError(LocalDeviceState* local_device, se::Stream* stream) { // a reference to the buffer until the copy completes or serialize the compute // stream behind the copy. It is often better to retain a reference since while // that keeps memory alive longer, it avoids stalling the compute stream. -void RecordUsage(PyLocalBuffer::ScopedHold device_buffer, +void RecordUsage(PjRtBuffer::ScopedHold device_buffer, LocalDeviceState* buffer_local_device, LocalDeviceState* stream_local_device, - std::shared_ptr event, + std::shared_ptr event, se::Stream* usage_stream, bool prefer_to_retain_reference) { bool retain_buffer_until_completion = // If the buffer wasn't allocated on the same device as the stream, always @@ -303,11 +304,11 @@ void RecordUsage(PyLocalBuffer::ScopedHold device_buffer, // buffer is a tuple then the tuple tables are allocated, and all necessary // synchronization for them is dealt with, before the buffer is returned. // -// It is safe to delete the returned PyLocalBuffer without further +// It is safe to delete the returned PjRtBuffer without further // synchronization if an error occurs before the buffer is used. -StatusOr> AllocateDestinationBuffer( +StatusOr> AllocateDestinationBuffer( const Shape& on_host_shape, Device* device, LocalDeviceState* local_device, - se::Stream* copy_stream, PyLocalClient* client) { + se::Stream* copy_stream, PjRtClient* client) { if (on_host_shape.IsTuple() && on_host_shape.tuple_shapes_size() == 0) { return InvalidArgument("Can't make a buffer from an empty tuple"); } @@ -328,11 +329,11 @@ StatusOr> AllocateDestinationBuffer( } Shape on_device_shape = dst_buffer.on_device_shape(); - absl::InlinedVector, 2> + absl::InlinedVector, 2> definition_events; // We always have at least one definition event, for the copy completing to // the device buffers. 
- definition_events.emplace_back(std::make_shared()); + definition_events.emplace_back(std::make_shared()); se::Stream* tuple_table_stream = local_device->host_to_device_stream(); if (on_device_shape.IsTuple()) { // We also need to copy the tuple tables, so we'll have a second defintion @@ -353,7 +354,7 @@ StatusOr> AllocateDestinationBuffer( // from error cases because we have started a transfer and must not allow // dst_buffer to be freed too soon in the non-async allocation models. - definition_events.emplace_back(std::make_shared()); + definition_events.emplace_back(std::make_shared()); StatusOr event_or = local_device->event_pool().ThenAllocateAndRecordEvent( tuple_table_stream); @@ -361,16 +362,16 @@ StatusOr> AllocateDestinationBuffer( StallStreamOnError(local_device, tuple_table_stream); return event_or.status(); } - definition_events[1]->SetDefinitionEvent(event_or.ConsumeValueOrDie(), + definition_events[1]->SetSequencingEvent(event_or.ConsumeValueOrDie(), tuple_table_stream); } - std::shared_ptr dst_device_buffer = - SharedDeviceBuffer::FromScopedShapedBuffer(&dst_buffer, - definition_events); + std::shared_ptr dst_device_buffer = + TrackedDeviceBuffer::FromScopedShapedBuffer(&dst_buffer, + definition_events); - auto py_buffer = absl::make_unique( - on_host_shape, on_device_shape, std::move(dst_device_buffer), client, - device); + auto py_buffer = absl::make_unique(on_host_shape, on_device_shape, + std::move(dst_device_buffer), + client, device); if (on_device_shape.IsTuple()) { // Add a usage hold for the tuple table write and immediately convert it to @@ -393,8 +394,8 @@ StatusOr> AllocateDestinationBuffer( // definition_event was added when the buffer was allocated, but has not yet // had an event recorded. Status AddDestinationBufferSynchronization( - LocalDeviceState* local_device, PyLocalBuffer::ScopedHold device_buffer, - std::shared_ptr definition_event, + LocalDeviceState* local_device, PjRtBuffer::ScopedHold device_buffer, + std::shared_ptr definition_event, se::Stream* copy_stream) { StatusOr event_or = local_device->event_pool().ThenAllocateAndRecordEvent(copy_stream); @@ -402,7 +403,7 @@ Status AddDestinationBufferSynchronization( StallStreamOnError(local_device, copy_stream); return event_or.status(); } - definition_event->SetDefinitionEvent(event_or.ConsumeValueOrDie(), + definition_event->SetSequencingEvent(event_or.ConsumeValueOrDie(), copy_stream); // prefer_to_retain_reference=false means don't retain a memory reference // until the transfer is complete when using the ComputeSynchronized @@ -418,13 +419,13 @@ Status AddDestinationBufferSynchronization( } // namespace -PyLocalBuffer::ScopedHold::~ScopedHold() { +PjRtBuffer::ScopedHold::~ScopedHold() { if (ok()) { parent_->DropHold(type_, buffer().get()); } } -PyLocalBuffer::ScopedHold::ScopedHold(ScopedHold&& other) +PjRtBuffer::ScopedHold::ScopedHold(ScopedHold&& other) : parent_(other.parent_), type_(other.type_), buffer_or_(std::move(other.buffer_or_)) { @@ -432,23 +433,23 @@ PyLocalBuffer::ScopedHold::ScopedHold(ScopedHold&& other) other.SetError(InvalidArgument("Buffer has been moved.")); } -void PyLocalBuffer::ScopedHold::Acquire( - StatusOr>&& buffer_or) { +void PjRtBuffer::ScopedHold::Acquire( + StatusOr>&& buffer_or) { CHECK(!ok()); buffer_or_ = std::move(buffer_or); // Check the invariant holds. 
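The FromHostBuffer hunk coming up replaces the hard-coded 64-byte alignment check on the zero-copy host path with cpu_function_runtime::kMinAlign. The sketch below allocates host memory that satisfies that alignment; the namespace qualification of kMinAlign and the use of std::aligned_alloc (C++17) are assumptions, and nothing in the patch mandates this particular allocator.

#include <cstdlib>

#include "tensorflow/compiler/xla/cpu_function_runtime.h"

// Returns storage aligned for the zero-copy path of PjRtBuffer::FromHostBuffer.
// std::aligned_alloc requires the size to be a multiple of the alignment.
void* AllocateForZeroCopy(size_t bytes) {
  const size_t align = xla::cpu_function_runtime::kMinAlign;
  const size_t rounded = (bytes + align - 1) / align * align;
  return std::aligned_alloc(align, rounded);
}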
CHECK(!ok() || buffer_or_.ValueOrDie() != nullptr); } -PyLocalBuffer::ScopedHold::ForClosure PyLocalBuffer::ScopedHold::ToClosure() { +PjRtBuffer::ScopedHold::ForClosure PjRtBuffer::ScopedHold::ToClosure() { CHECK(ok()); ForClosure for_closure(parent_, type_, std::move(buffer_or_)); SetError(InvalidArgument("Buffer has been released")); return for_closure; } -void PyLocalBuffer::ScopedHold::ConvertUsageHold( - se::Stream* usage_stream, std::shared_ptr event, +void PjRtBuffer::ScopedHold::ConvertUsageHold( + se::Stream* usage_stream, std::shared_ptr event, bool reference_held) { CHECK(ok()); CHECK(type_ == kUsage); @@ -457,14 +458,14 @@ void PyLocalBuffer::ScopedHold::ConvertUsageHold( SetError(InvalidArgument("Buffer has been converted")); } -void PyLocalBuffer::ScopedHold::ConfirmDonation() { +void PjRtBuffer::ScopedHold::ConfirmDonation() { CHECK(ok()); CHECK(type_ == kDonation); parent_->ConfirmDonation(buffer().get()); SetError(InvalidArgument("Buffer has been donated")); } -void PyLocalBuffer::ScopedHold::AddToInput( +void PjRtBuffer::ScopedHold::AddToInput( ShapeTree::iterator* iterator, const ShapeTree::iterator& end, ExecutionInput* execution_input, @@ -479,12 +480,12 @@ void PyLocalBuffer::ScopedHold::AddToInput( } /* static */ -StatusOr> PyLocalBuffer::FromHostBuffer( +StatusOr> PjRtBuffer::FromHostBuffer( const void* data, const Shape& shape, bool force_copy, - std::shared_ptr buffer_reference, PyLocalClient* client, + std::shared_ptr buffer_reference, PjRtClient* client, Device* device) { - tensorflow::profiler::TraceMe traceme("PyLocalBuffer::FromHostBuffer"); - VLOG(2) << "PyLocalBuffer::FromHostBuffer: shape: " << shape.ToString() + tensorflow::profiler::TraceMe traceme("PjRtBuffer::FromHostBuffer"); + VLOG(2) << "PjRtBuffer::FromHostBuffer: shape: " << shape.ToString() << " device: " << device->DebugString(); if (shape.IsTuple()) { return InvalidArgument("Use FromHostLiteral to transfer a tuple"); @@ -494,27 +495,25 @@ StatusOr> PyLocalBuffer::FromHostBuffer( // If we are on the host platform and the input buffer is sufficiently // aligned, we can simply point to the input array's data without any further - // copies. We require a 64-byte alignment because XLA may generate AVX512 - // code which requires it. If the client allocator doesn't align quite as - // aggressively, (e.g., NumPy doesn't) there's a high chance this test will - // fail. - static constexpr int kMinimumAlignment = 64; + // copies. At the time of writing we require a 16-byte alignment because XLA + // may generate code which requires it. if (!force_copy && - ((absl::bit_cast(data) & (kMinimumAlignment - 1)) == 0) && - local_device->executor()->platform_kind() == se::PlatformKind::kHost) { + ((absl::bit_cast(data) & + (cpu_function_runtime::kMinAlign - 1)) == 0) && + local_device->executor()->platform()->id() == se::host::kHostPlatformId) { std::function on_delete_callback = [buffer_reference{std::move(buffer_reference)}]() { // Frees buffer_reference. 
}; se::DeviceMemoryBase buffer(const_cast(data), ShapeUtil::ByteSizeOf(shape)); - absl::Span> definition_events; - auto device_buffer = std::make_shared( + absl::Span> definition_events; + auto device_buffer = std::make_shared( /*allocator=*/nullptr, local_device->device_ordinal(), std::initializer_list{buffer}, definition_events, std::move(on_delete_callback)); - return absl::make_unique( - shape, shape, std::move(device_buffer), client, device); + return absl::make_unique(shape, shape, std::move(device_buffer), + client, device); } TransferManager* transfer_manager = @@ -522,7 +521,7 @@ StatusOr> PyLocalBuffer::FromHostBuffer( TF_ASSIGN_OR_RETURN(Shape compact_shape, transfer_manager->ChooseCompactLayoutForShape(shape)); TF_ASSIGN_OR_RETURN( - std::unique_ptr py_buffer, + std::unique_ptr py_buffer, AllocateDestinationBuffer(compact_shape, device, local_device, local_device->host_to_device_stream(), client)); @@ -574,7 +573,7 @@ StatusOr> PyLocalBuffer::FromHostBuffer( local_device->host_to_device_stream(), literal, buffer)); } - std::shared_ptr event = + std::shared_ptr event = device_buffer->definition_events()[0]; TF_CHECK_OK(AddDestinationBufferSynchronization( local_device, std::move(device_buffer), event, @@ -589,10 +588,10 @@ StatusOr> PyLocalBuffer::FromHostBuffer( } /* static */ -StatusOr> PyLocalBuffer::FromHostLiteral( - const LiteralSlice& literal, PyLocalClient* client, Device* device) { - tensorflow::profiler::TraceMe traceme("PyLocalBuffer::FromHostLiteral"); - VLOG(2) << "PyLocalBuffer::FromHostLiteral: shape: " +StatusOr> PjRtBuffer::FromHostLiteral( + const LiteralSlice& literal, PjRtClient* client, Device* device) { + tensorflow::profiler::TraceMe traceme("PjRtBuffer::FromHostLiteral"); + VLOG(2) << "PjRtBuffer::FromHostLiteral: shape: " << literal.shape().ToString() << " device: " << device->DebugString(); TF_ASSIGN_OR_RETURN(LocalDeviceState * local_device, device->GetLocalDeviceState()); @@ -603,7 +602,7 @@ StatusOr> PyLocalBuffer::FromHostLiteral( Shape compact_shape, transfer_manager->ChooseCompactLayoutForShape(literal.shape())); TF_ASSIGN_OR_RETURN( - std::unique_ptr py_buffer, + std::unique_ptr py_buffer, AllocateDestinationBuffer(compact_shape, device, local_device, local_device->host_to_device_stream(), client)); @@ -632,7 +631,7 @@ StatusOr> PyLocalBuffer::FromHostLiteral( TF_CHECK_OK(transfer_manager->TransferLiteralToDeviceAsync( local_device->host_to_device_stream(), literal, buffer)); - std::shared_ptr event = + std::shared_ptr event = device_buffer->definition_events()[0]; TF_CHECK_OK(AddDestinationBufferSynchronization( local_device, std::move(device_buffer), event, @@ -642,9 +641,9 @@ StatusOr> PyLocalBuffer::FromHostLiteral( return py_buffer; } -/*static*/ void PyLocalBuffer::MakeCrossHostReceiveBuffers( - absl::Span shapes, PyLocalClient* client, Device* device, - PyLocalCrossHostRecvNotifier&& notifier) { +/*static*/ void PjRtBuffer::MakeCrossHostReceiveBuffers( + absl::Span shapes, PjRtClient* client, Device* device, + PjRtCrossHostRecvNotifier&& notifier) { if (shapes.empty()) { notifier(InvalidArgument( "shapes parameter empty in MakeCrossHostReceiveBuffers")); @@ -658,10 +657,10 @@ StatusOr> PyLocalBuffer::FromHostLiteral( } LocalDeviceState* local_device = local_device_or.ConsumeValueOrDie(); - std::vector> buffers; + std::vector> buffers; buffers.reserve(shapes.size()); for (const auto& shape : shapes) { - StatusOr> buffer_or = + StatusOr> buffer_or = AllocateDestinationBuffer(shape, device, local_device, /*copy_stream=*/nullptr, client); if 
(!buffer_or.ok()) { @@ -674,9 +673,9 @@ StatusOr> PyLocalBuffer::FromHostLiteral( client->EnqueueCrossHostReceive(std::move(buffers), std::move(notifier)); } -PyLocalBuffer::PyLocalBuffer(Shape on_host_shape, Shape on_device_shape, - std::shared_ptr device_buffer, - PyLocalClient* client, Device* device) +PjRtBuffer::PjRtBuffer(Shape on_host_shape, Shape on_device_shape, + std::shared_ptr device_buffer, + PjRtClient* client, Device* device) : client_(client), on_host_shape_(std::move(on_host_shape)), on_device_shape_(std::move(on_device_shape)), @@ -688,14 +687,14 @@ PyLocalBuffer::PyLocalBuffer(Shape on_host_shape, Shape on_device_shape, } } -PyLocalBuffer::~PyLocalBuffer() { +PjRtBuffer::~PjRtBuffer() { Delete(); for (int i = 0; i < ScopedHold::Type::kMaxValue; ++i) { CHECK_EQ(holds_[i], 0); } } -void PyLocalBuffer::WaitForOutstandingUsageHolds() { +void PjRtBuffer::WaitForOutstandingUsageHolds() { auto not_in_usage_hold = [&]() { mu_.AssertHeld(); return holds_[ScopedHold::kUsage] == 0; @@ -703,7 +702,7 @@ void PyLocalBuffer::WaitForOutstandingUsageHolds() { mu_.Await(absl::Condition(¬_in_usage_hold)); } -void PyLocalBuffer::WaitForOutstandingDonationHold() { +void PjRtBuffer::WaitForOutstandingDonationHold() { auto not_in_donation_hold = [&]() { mu_.AssertHeld(); return holds_[ScopedHold::kDonation] == 0; @@ -711,10 +710,10 @@ void PyLocalBuffer::WaitForOutstandingDonationHold() { mu_.Await(absl::Condition(¬_in_donation_hold)); } -StatusOr> PyLocalBuffer::Release( +StatusOr> PjRtBuffer::Release( bool wait_for_operations_to_complete) { - std::shared_ptr device_buffer; - SharedDeviceBuffer::StreamAndEventContainer events; + std::shared_ptr device_buffer; + TrackedDeviceBuffer::StreamAndEventContainer events; { absl::MutexLock lock(&mu_); // We first wait for a donation hold to complete if there is one in @@ -722,7 +721,7 @@ StatusOr> PyLocalBuffer::Release( // set device_buffer_ to nullptr before returning to this thread. WaitForOutstandingDonationHold(); if (device_buffer_ == nullptr) { - return std::shared_ptr(); + return std::shared_ptr(); } // Set host_value_ and device_buffer_ to null now so that no other thread // can add a hold while we are in WaitForOutstandingUsageHolds() @@ -774,10 +773,10 @@ StatusOr> PyLocalBuffer::Release( } } if (block_stream != nullptr) { + se::Stream* block_stream_ptr = block_stream.release(); local_device_state->ThenExecuteOnCallbackThread( - block_stream.get(), - [device_buffer, block_stream_ptr{block_stream.release()}, - local_device_state]() { + block_stream_ptr, + [device_buffer, block_stream_ptr, local_device_state]() { local_device_state->ReturnStreamToPool( std::unique_ptr(block_stream_ptr)); }); @@ -787,18 +786,18 @@ StatusOr> PyLocalBuffer::Release( return device_buffer; } -void PyLocalBuffer::Delete() { +void PjRtBuffer::Delete() { // When wait_for_reads_to_complete is false, Release should never fail. 
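The Release() hunk above hoists `block_stream.release()` into a named pointer before the call, instead of releasing inside the lambda's init-capture while also passing `block_stream.get()` as the first argument; because argument evaluation order is unspecified, the old form could hand the callee a null stream. A minimal sketch of the hazard and the fix, with illustrative stand-in names:

#include <functional>
#include <memory>

struct Stream {};

// Stand-in declaration for a "run this callback later" API.
void ThenExecute(Stream* stream, std::function<void()> callback);

void ReturnStreamLater(std::unique_ptr<Stream> stream) {
  // Risky form: the order in which `stream.get()` and the init-capture's
  // `stream.release()` are evaluated is unspecified, so the callee may see
  // nullptr:
  //   ThenExecute(stream.get(), [raw = stream.release()]() { delete raw; });

  // Fixed form, as in the hunk above: release once, then reuse the raw
  // pointer for both the argument and the capture.
  Stream* raw = stream.release();
  ThenExecute(raw, [raw]() { delete raw; });
}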
TF_CHECK_OK(Release(/*wait_for_operations_to_complete=*/false).status()); } -bool PyLocalBuffer::IsDeleted() { +bool PjRtBuffer::IsDeleted() { absl::MutexLock lock(&mu_); return device_buffer_ == nullptr; } -StatusOr> -PyLocalBuffer::GetBufferForHoldLocked(ScopedHold::Type type) { +StatusOr> +PjRtBuffer::GetBufferForHoldLocked(ScopedHold::Type type) { if (type == ScopedHold::kDonation) { if (device_buffer_ == nullptr) { return InvalidArgument("Donation requested for invalid buffer"); @@ -832,13 +831,14 @@ PyLocalBuffer::GetBufferForHoldLocked(ScopedHold::Type type) { return device_buffer_; } -void PyLocalBuffer::AcquireHoldLocked(ScopedHold* hold) { +void PjRtBuffer::AcquireHoldLocked(ScopedHold* hold) { hold->Acquire(GetBufferForHoldLocked(hold->type())); } -void PyLocalBuffer::ConvertUsageHold( - SharedDeviceBuffer* buffer, se::Stream* usage_stream, - std::shared_ptr event, bool reference_held) { +void PjRtBuffer::ConvertUsageHold(TrackedDeviceBuffer* buffer, + se::Stream* usage_stream, + std::shared_ptr event, + bool reference_held) { absl::MutexLock lock(&mu_); CHECK(device_buffer_.get() == buffer || device_buffer_ == nullptr); buffer->AddUsageEvent(usage_stream, std::move(event), reference_held); @@ -846,7 +846,7 @@ void PyLocalBuffer::ConvertUsageHold( --holds_[ScopedHold::kUsage]; } -void PyLocalBuffer::ConfirmDonation(SharedDeviceBuffer* device_buffer) { +void PjRtBuffer::ConfirmDonation(TrackedDeviceBuffer* device_buffer) { { absl::MutexLock lock(&mu_); CHECK_EQ(holds_[ScopedHold::kUsage], 0); @@ -868,8 +868,7 @@ void PyLocalBuffer::ConfirmDonation(SharedDeviceBuffer* device_buffer) { donation_semaphore_.Release(1); } -void PyLocalBuffer::DropHold(ScopedHold::Type type, - SharedDeviceBuffer* buffer) { +void PjRtBuffer::DropHold(ScopedHold::Type type, TrackedDeviceBuffer* buffer) { absl::MutexLock lock(&mu_); CHECK(device_buffer_.get() == buffer || device_buffer_ == nullptr); CHECK_GT(holds_[type], 0); @@ -882,7 +881,7 @@ void PyLocalBuffer::DropHold(ScopedHold::Type type, } } -Status PyLocalBuffer::CopyToHostAsync() { +Status PjRtBuffer::CopyToHostAsync() { if (IsEmptyTuple()) { return InvalidArgument("CopyToHostAsync called on empty tuple"); } @@ -915,7 +914,7 @@ Status PyLocalBuffer::CopyToHostAsync() { host_value->ready.Notify(); }); - auto usage_event = std::make_shared(); + auto usage_event = std::make_shared(); StatusOr event_or = local_device->event_pool().ThenAllocateAndRecordEvent(stream); if (!event_or.ok()) { @@ -924,7 +923,7 @@ Status PyLocalBuffer::CopyToHostAsync() { StallStreamOnError(local_device, stream); return event_or.status(); } - usage_event->SetDefinitionEvent(event_or.ConsumeValueOrDie(), stream); + usage_event->SetSequencingEvent(event_or.ConsumeValueOrDie(), stream); // When using the ComputeSynchronized allocation model, retain a reference to // the device_buffer until the copy completes, to ensure that the buffer isn't // deleted or donated while it is still in use. 
The choice of retaining a @@ -940,8 +939,8 @@ Status PyLocalBuffer::CopyToHostAsync() { return Status::OK(); } -StatusOr> PyLocalBuffer::ToLiteral() { - tensorflow::profiler::TraceMe traceme("PyLocalBuffer::ToLiteral"); +StatusOr> PjRtBuffer::ToLiteral() { + tensorflow::profiler::TraceMe traceme("PjRtBuffer::ToLiteral"); TF_RETURN_IF_ERROR(CopyToHostAsync()); std::shared_ptr host_value; { @@ -956,7 +955,7 @@ StatusOr> PyLocalBuffer::ToLiteral() { return host_value->value; } -StatusOr PyLocalBuffer::AsShapedBuffer() const { +StatusOr PjRtBuffer::AsShapedBuffer() const { absl::MutexLock lock(&mu_); if (device_buffer_ == nullptr) { return InvalidArgument( @@ -966,8 +965,7 @@ StatusOr PyLocalBuffer::AsShapedBuffer() const { client_->client()->platform()); } -PyLocalBuffer::ScopedHold PyLocalBuffer::GetBufferWithHold( - ScopedHold::Type type) { +PjRtBuffer::ScopedHold PjRtBuffer::GetBufferWithHold(ScopedHold::Type type) { if (type == ScopedHold::kDonation) { // Ensure that at most one donation hold can be in progress at a time. donation_semaphore_.Acquire(1); @@ -981,14 +979,14 @@ PyLocalBuffer::ScopedHold PyLocalBuffer::GetBufferWithHold( return hold; } -StatusOr, - std::shared_ptr>> -PyLocalBuffer::CopyToDeviceHelper( +StatusOr, + std::shared_ptr>> +PjRtBuffer::CopyToDeviceHelper( Device* dst_device, LocalDeviceState* dst_local_device, LocalDeviceState* transfer_local_device, se::Stream* transfer_stream, - std::shared_ptr src_device_buffer) { + std::shared_ptr src_device_buffer) { TF_ASSIGN_OR_RETURN( - std::unique_ptr py_buffer, + std::unique_ptr py_buffer, AllocateDestinationBuffer(on_host_shape_, dst_device, dst_local_device, transfer_stream, client_)); @@ -1002,8 +1000,8 @@ PyLocalBuffer::CopyToDeviceHelper( on_host_shape_, on_device_shape_, client_->client()->platform()); // Copy the leaf buffers. 
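CopyToHostAsync() only enqueues the device-to-host copy, while ToLiteral() calls it and then blocks on the result. A hedged usage sketch, assuming the signatures shown above (Status from CopyToHostAsync, StatusOr<std::shared_ptr<Literal>> from ToLiteral) and a hypothetical helper name:

#include <memory>
#include <vector>

#include "tensorflow/compiler/xla/literal.h"
#include "tensorflow/compiler/xla/pjrt/pjrt_client.h"
#include "tensorflow/compiler/xla/statusor.h"

// Start every transfer first, then block on each result, so the copies can
// overlap instead of running strictly one at a time.
xla::Status FetchAll(const std::vector<xla::PjRtBuffer*>& buffers,
                     std::vector<std::shared_ptr<xla::Literal>>* out) {
  for (xla::PjRtBuffer* b : buffers) {
    xla::Status s = b->CopyToHostAsync();  // enqueue only, does not block
    if (!s.ok()) return s;
  }
  for (xla::PjRtBuffer* b : buffers) {
    xla::StatusOr<std::shared_ptr<xla::Literal>> literal_or = b->ToLiteral();
    if (!literal_or.ok()) return literal_or.status();
    out->push_back(literal_or.ConsumeValueOrDie());
  }
  return xla::Status::OK();
}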
- StatusOr> copy_event_or = - [&]() -> StatusOr> { + StatusOr> copy_event_or = + [&]() -> StatusOr> { for (const auto& leaf : src_buffer.buffers().leaves()) { const ShapeIndex& index = leaf.first; const se::DeviceMemoryBase& input_buffer = leaf.second; @@ -1017,7 +1015,7 @@ PyLocalBuffer::CopyToDeviceHelper( output_buffer)); } } - std::shared_ptr event = + std::shared_ptr event = dst_device_buffer->definition_events()[0]; TF_RETURN_IF_ERROR(AddDestinationBufferSynchronization( transfer_local_device, std::move(dst_device_buffer), event, @@ -1037,14 +1035,14 @@ PyLocalBuffer::CopyToDeviceHelper( return copy_event_or.status(); } - return std::pair, - std::shared_ptr>( + return std::pair, + std::shared_ptr>( std::move(py_buffer), copy_event_or.ConsumeValueOrDie()); } -StatusOr> PyLocalBuffer::CopyToDevice( +StatusOr> PjRtBuffer::CopyToDevice( Device* dst_device) { - tensorflow::profiler::TraceMe traceme("PyLocalBuffer::CopyToDevice"); + tensorflow::profiler::TraceMe traceme("PjRtBuffer::CopyToDevice"); if (dst_device == device_) { return InvalidArgument( "CopyToDevice cannot accept the same source and destination devices"); @@ -1072,8 +1070,8 @@ StatusOr> PyLocalBuffer::CopyToDevice( AcquireHoldLocked(&src_device_buffer); } - StatusOr, - std::shared_ptr>> + StatusOr, + std::shared_ptr>> buffer_and_event_or = CopyToDeviceHelper( dst_device, dst_local_device, transfer_local_device, transfer_stream, src_device_buffer.buffer()); @@ -1082,8 +1080,8 @@ StatusOr> PyLocalBuffer::CopyToDevice( } auto& buffer_and_event = buffer_and_event_or.ValueOrDie(); - std::unique_ptr& buffer = buffer_and_event.first; - std::shared_ptr& event = buffer_and_event.second; + std::unique_ptr& buffer = buffer_and_event.first; + std::shared_ptr& event = buffer_and_event.second; // prefer_to_retain_reference=*/true means that, when using the // ComputeSynchronized allocation model, retain a reference to the @@ -1098,14 +1096,13 @@ StatusOr> PyLocalBuffer::CopyToDevice( return std::move(buffer); } -Status PyLocalBuffer::CopyToRemoteDevice( - absl::string_view serialized_descriptor) { +Status PjRtBuffer::CopyToRemoteDevice(absl::string_view serialized_descriptor) { return client_->CopyToRemoteDevice(this, serialized_descriptor); } -Status PyLocalBuffer::BlockHostUntilReady() { - tensorflow::profiler::TraceMe traceme("PyLocalBuffer::BlockHostUntilReady"); - std::shared_ptr device_buffer; +Status PjRtBuffer::BlockHostUntilReady() { + tensorflow::profiler::TraceMe traceme("PjRtBuffer::BlockHostUntilReady"); + std::shared_ptr device_buffer; { absl::MutexLock lock(&mu_); if (device_buffer_ == nullptr) { @@ -1141,20 +1138,20 @@ struct TupleHandle { ExecutionInput execution_input; // A definition event that has been recorded on the host_to_device stream // after the tuple table transfer. - std::shared_ptr event; + std::shared_ptr event; }; // Makes a tuple from the arguments to an execution. 
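CopyToDevice() above rejects a copy whose destination equals the source device and otherwise returns a new buffer on `dst_device`. A hedged usage sketch (the `Mirror` name is hypothetical); the final BlockHostUntilReady is only needed if the caller wants synchronous access to the copy:

#include <memory>
#include <utility>

#include "tensorflow/compiler/xla/pjrt/pjrt_client.h"
#include "tensorflow/compiler/xla/statusor.h"

xla::StatusOr<std::unique_ptr<xla::PjRtBuffer>> Mirror(xla::PjRtBuffer* src,
                                                       xla::Device* dst_device) {
  // dst_device must differ from src->device(), per the check above.
  xla::StatusOr<std::unique_ptr<xla::PjRtBuffer>> copy_or =
      src->CopyToDevice(dst_device);
  if (!copy_or.ok()) return copy_or.status();
  std::unique_ptr<xla::PjRtBuffer> copy = copy_or.ConsumeValueOrDie();
  // Optional: wait until the copy's contents are defined before using it
  // from the host.
  xla::Status ready = copy->BlockHostUntilReady();
  if (!ready.ok()) return ready;
  return std::move(copy);
}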
StatusOr MakeTupleHelper( - PyLocalClient* client, LocalDeviceState* local_device, - absl::Span py_buffers, - absl::Span device_buffers, + PjRtClient* client, LocalDeviceState* local_device, + absl::Span py_buffers, + absl::Span device_buffers, int device_ordinal) { std::vector host_shapes; std::vector device_shapes; host_shapes.reserve(py_buffers.size()); device_shapes.reserve(py_buffers.size()); - for (const PyLocalBuffer* buffer : py_buffers) { + for (const PjRtBuffer* buffer : py_buffers) { host_shapes.push_back(buffer->on_host_shape()); device_shapes.push_back(buffer->on_device_shape()); } @@ -1175,8 +1172,8 @@ StatusOr MakeTupleHelper( LocalDeviceState::kComputeSynchronized) { stream->ThenWaitFor(local_device->compute_stream()); } else { - // In principle we would do a DCHECK for CanShapedBufferBeAccessedNow here - // but that call requires a ShapedBuffer which we don't have. + DCHECK(transfer_manager->CanBufferBeAccessedNow( + local_device->compute_stream()->parent(), root_table_memory.cref())); } ExecutionInput execution_input(on_device_shape); @@ -1190,7 +1187,7 @@ StatusOr MakeTupleHelper( MaybeOwningDeviceMemory(std::move(root_table_memory))); ++input_iterator; // Then set each sub-tuple in turn from the parameters. - for (const PyLocalBuffer::ScopedHold& device_buffer : device_buffers) { + for (const PjRtBuffer::ScopedHold& device_buffer : device_buffers) { device_buffer.AddToInput(&input_iterator, iterator_end, &execution_input, allocator); } @@ -1205,22 +1202,22 @@ StatusOr MakeTupleHelper( return event_or.status(); } - auto transfer_event = std::make_shared(); - transfer_event->SetDefinitionEvent(event_or.ConsumeValueOrDie(), stream); + auto transfer_event = std::make_shared(); + transfer_event->SetSequencingEvent(event_or.ConsumeValueOrDie(), stream); return TupleHandle({std::move(on_host_shape), std::move(execution_input), std::move(transfer_event)}); } // Converts a ScopedShapedBuffer returned from an execution into a -// PyLocalBuffer. -std::unique_ptr OutputBufferHelper( +// PjRtBuffer. 
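MakeTupleHelper above collects the per-argument host and device shapes and wraps them into a single tuple shape for the tuple-table transfer. A minimal sketch of that shape-construction step, using a hypothetical example with two f32 arguments:

#include <vector>

#include "tensorflow/compiler/xla/shape_util.h"

// Two f32 arguments become the single tupled parameter shape
// (f32[2,3], f32[4]), mirroring the host/device tuple shapes built above.
xla::Shape ExampleTupledParameterShape() {
  std::vector<xla::Shape> element_shapes = {
      xla::ShapeUtil::MakeShape(xla::F32, {2, 3}),
      xla::ShapeUtil::MakeShape(xla::F32, {4})};
  return xla::ShapeUtil::MakeTupleShape(element_shapes);
}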
+std::unique_ptr OutputBufferHelper( ScopedShapedBuffer* result_buffer, - std::shared_ptr definition_event, - PyLocalClient* client, Device* device, LocalDeviceState* local_device) { - std::shared_ptr out_buffer = - SharedDeviceBuffer::FromScopedShapedBuffer(result_buffer, - {definition_event}); - auto py_buffer = absl::make_unique( + std::shared_ptr definition_event, PjRtClient* client, + Device* device, LocalDeviceState* local_device) { + std::shared_ptr out_buffer = + TrackedDeviceBuffer::FromScopedShapedBuffer(result_buffer, + {definition_event}); + auto py_buffer = absl::make_unique( result_buffer->on_host_shape(), result_buffer->on_device_shape(), std::move(out_buffer), client, device); RecordUsage(py_buffer->GetBufferWithUsageHold(), local_device, local_device, @@ -1229,7 +1226,7 @@ std::unique_ptr OutputBufferHelper( return py_buffer; } -static Device* LookupDevice(const PyLocalClient& client, int device_id) { +static Device* LookupDevice(const PjRtClient& client, int device_id) { auto it = client.id_to_device().find(device_id); CHECK(it != client.id_to_device().end()) << "Unknown device id: " << device_id; @@ -1238,23 +1235,25 @@ static Device* LookupDevice(const PyLocalClient& client, int device_id) { } // namespace -PyLocalExecutable::PyLocalExecutable( +PjRtExecutable::PjRtExecutable( std::vector> executables, - bool tuple_arguments, DeviceAssignment device_assignment, - PyLocalClient* client) + bool parameter_is_tupled_arguments, DeviceAssignment device_assignment, + std::vector> local_logical_device_ids, + std::vector local_devices, PjRtClient* client) : client_(client), device_assignment_(std::make_shared(device_assignment)), - tuple_arguments_(tuple_arguments) { + parameter_is_tupled_arguments_(parameter_is_tupled_arguments), + local_logical_device_ids_(std::move(local_logical_device_ids)), + local_devices_(std::move(local_devices)) { executables_.reserve(executables.size()); for (auto& executable : executables) { executables_.emplace_back(std::move(executable)); } // This must go after `executables_` is initialized. - VLOG(1) << "PyLocalExecutable " << name() << " device_assignment:\n" + VLOG(1) << "PjRtExecutable " << name() << " device_assignment:\n" << device_assignment_->ToString(); - const int num_replicas = device_assignment_->replica_count(); const int num_partitions = device_assignment_->computation_count(); // SPMD sharding produces a single executable for multiple partitions. 
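The constructor above now receives the precomputed (replica, partition) pairs and local devices instead of deriving them, but the device assignment is still indexed the same way: entry (replica, partition) names a global device id. A small sketch of building such an assignment (the device ids used are illustrative):

#include "tensorflow/compiler/xla/service/computation_placer.h"

// A 2-replica, 1-partition assignment in which replica r runs on device id r.
xla::DeviceAssignment MakeTrivialAssignment() {
  xla::DeviceAssignment assignment(/*replica_count=*/2,
                                   /*computation_count=*/1);
  assignment(0, 0) = 0;  // replica 0, partition 0 -> device 0
  assignment(1, 0) = 1;  // replica 1, partition 0 -> device 1
  return assignment;
}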
@@ -1264,25 +1263,12 @@ PyLocalExecutable::PyLocalExecutable( << " did not match number of partitions " << num_partitions; } - for (int replica = 0; replica < num_replicas; ++replica) { - for (int partition = 0; partition < num_partitions; ++partition) { - int device_id = (*device_assignment_)(replica, partition); - Device* device = LookupDevice(*client_, device_id); - if (device->host_id() != client_->host_id()) { - VLOG(3) << "Non-local device: " << device_id; - continue; - } - local_logical_device_ids_.emplace_back(replica, partition); - local_devices_.push_back(device); - } - } CHECK_GE(local_devices_.size(), 1) << device_assignment_->ToString(); CHECK_LE(local_devices_.size(), client_->local_device_count()) << "Inconsistent local device count."; } -Status PyLocalExecutable::SetUpDonation(PyLocalClient* client, - bool tuple_inputs) { +Status PjRtExecutable::SetUpDonation(PjRtClient* client, bool tuple_inputs) { parameters_that_must_be_donated_.reserve(executables_.size()); for (auto& executable : executables_) { TF_ASSIGN_OR_RETURN( @@ -1294,7 +1280,7 @@ Status PyLocalExecutable::SetUpDonation(PyLocalClient* client, return Status::OK(); } -const std::string& PyLocalExecutable::name() const { +const std::string& PjRtExecutable::name() const { Executable* executable = executables_[0]->executable(); if (executable->has_module()) { return executable->module().name(); @@ -1308,11 +1294,10 @@ const std::string& PyLocalExecutable::name() const { // Enqueues a computation onto the compute stream. Each buffer returned in // device_buffers has a usage hold added that must be dropped on error or // converted on success. -StatusOr PyLocalExecutable::EnqueueExecution( - absl::Span argument_handles, int replica, - int partition, int executable_idx, const RunId& run_id, - const ExecuteOptions& options, Device* device, - std::vector* device_buffers) const { +StatusOr PjRtExecutable::EnqueueExecution( + absl::Span argument_handles, int replica, int partition, + int executable_idx, const RunId& run_id, const ExecuteOptions& options, + Device* device, std::vector* device_buffers) const { int device_ordinal = device->local_device_state()->device_ordinal(); tensorflow::profiler::TraceMe traceme([&] { return absl::StrCat("LocalExecutable::Execute#run_id=", run_id.ToInt(), @@ -1321,14 +1306,14 @@ StatusOr PyLocalExecutable::EnqueueExecution( VLOG(3) << "Replica " << replica << ", partition " << partition << " mapped to device ordinal for execution: " << device_ordinal; - absl::flat_hash_set events; + absl::flat_hash_set events; std::vector argument_host_shapes; std::vector execution_inputs; device_buffers->reserve(argument_handles.size()); const absl::flat_hash_set& parameters_that_must_be_donated = parameters_that_must_be_donated_[executable_idx]; for (int i = 0; i < argument_handles.size(); ++i) { - PyLocalBuffer* handle = argument_handles[i]; + PjRtBuffer* handle = argument_handles[i]; if (handle->device() != device) { return InvalidArgument( "Buffer passed to Execute() as argument %d to replica %d is on " @@ -1338,9 +1323,9 @@ StatusOr PyLocalExecutable::EnqueueExecution( bool must_donate = parameters_that_must_be_donated.find(i) != parameters_that_must_be_donated.end(); device_buffers->emplace_back(handle->GetBufferWithHold( - must_donate ? PyLocalBuffer::ScopedHold::kDonation - : PyLocalBuffer::ScopedHold::kUsage)); - PyLocalBuffer::ScopedHold& device_buffer = device_buffers->back(); + must_donate ? 
PjRtBuffer::ScopedHold::kDonation + : PjRtBuffer::ScopedHold::kUsage)); + PjRtBuffer::ScopedHold& device_buffer = device_buffers->back(); if (!device_buffer.ok()) { return InvalidArgument( "Invalid buffer passed to Execute() as argument %d to replica %d: " @@ -1356,9 +1341,23 @@ StatusOr PyLocalExecutable::EnqueueExecution( &events); } + if (options.arguments_are_tupled) { + if (!parameter_is_tupled_arguments_) { + return InvalidArgument( + "Arguments may only be supplied as a tuple when the executable was " + "compiled with a single tupled parameter"); + } + if (argument_handles.size() != 1) { + return InvalidArgument( + "Option arguments_are_tupled was true but %d buffers were passed to " + "execution", + argument_handles.size()); + } + } + LocalDeviceState* device_state = &client_->device_state(device_ordinal); TupleHandle tuple_handle; - if (tuple_arguments_) { + if (parameter_is_tupled_arguments_ && !options.arguments_are_tupled) { TF_ASSIGN_OR_RETURN(tuple_handle, MakeTupleHelper(client_, device_state, argument_handles, *device_buffers, device_ordinal)); @@ -1369,10 +1368,10 @@ StatusOr PyLocalExecutable::EnqueueExecution( argument_host_shapes.reserve(argument_handles.size()); execution_inputs.reserve(argument_handles.size()); for (int i = 0; i < argument_handles.size(); ++i) { - PyLocalBuffer* handle = argument_handles[i]; + PjRtBuffer* handle = argument_handles[i]; argument_host_shapes.push_back(&handle->on_host_shape()); - const PyLocalBuffer::ScopedHold& device_buffer = (*device_buffers)[i]; + const PjRtBuffer::ScopedHold& device_buffer = (*device_buffers)[i]; // Make an ExecutionInput from the device buffer. execution_inputs.emplace_back(handle->on_device_shape()); ExecutionInput& execution_input = execution_inputs.back(); @@ -1386,7 +1385,7 @@ StatusOr PyLocalExecutable::EnqueueExecution( } } - for (BufferDefinitionEvent* event : events) { + for (BufferSequencingEvent* event : events) { event->WaitForEventOnStream(device_state->compute_stream()); } @@ -1459,10 +1458,10 @@ StatusOr PyLocalExecutable::EnqueueExecution( return result_buffer_or_status.ConsumeValueOrDie().ConsumeResult(); } -StatusOr>> -PyLocalExecutable::ExecuteHelper( - absl::Span argument_handles, int replica, - int partition, const RunId& run_id, const ExecuteOptions& options) const { +StatusOr>> +PjRtExecutable::ExecuteHelper(absl::Span argument_handles, + int replica, int partition, const RunId& run_id, + const ExecuteOptions& options) const { const int device_id = (*device_assignment_)(replica, partition); Device* device = LookupDevice(*client_, device_id); @@ -1475,7 +1474,7 @@ PyLocalExecutable::ExecuteHelper( // SPMD sharding produces a single executable for multiple partitions. int executable_idx = executables_.size() > 1 ? partition : 0; - std::vector device_buffers; + std::vector device_buffers; device_buffers.reserve(argument_handles.size()); StatusOr result_buffer_or_status = EnqueueExecution(argument_handles, replica, partition, executable_idx, @@ -1495,8 +1494,8 @@ PyLocalExecutable::ExecuteHelper( device_state->event_pool().ThenAllocateAndRecordEvent(stream); if (!event_or.ok()) { StallStreamOnError(device_state, stream); - for (PyLocalBuffer::ScopedHold& b : device_buffers) { - if (b.type() == PyLocalBuffer::ScopedHold::kDonation) { + for (PjRtBuffer::ScopedHold& b : device_buffers) { + if (b.type() == PjRtBuffer::ScopedHold::kDonation) { // Even though there was an error we need to call ConfirmDonation, which // renders b invalid, since the computation has been enqueued and b has // been donated. 
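The checks added above mean `arguments_are_tupled` is only legal when the executable was compiled with `parameter_is_tupled_arguments` and the caller passes exactly one buffer holding the whole argument tuple. A hedged usage sketch (the `RunTupled` name is hypothetical; `tupled_args` is assumed to already live on the correct device):

#include <memory>
#include <vector>

#include "tensorflow/compiler/xla/pjrt/pjrt_client.h"

// `executable` must have been compiled with
// CompileOptions::parameter_is_tupled_arguments = true.
xla::StatusOr<std::vector<std::unique_ptr<xla::PjRtBuffer>>> RunTupled(
    const xla::PjRtExecutable& executable, xla::PjRtBuffer* tupled_args) {
  xla::ExecuteOptions options;
  options.arguments_are_tupled = true;  // skip the internal tuple-table build
  options.untuple_result = true;        // destructure a tuple-shaped result
  std::vector<xla::PjRtBuffer*> args = {tupled_args};  // exactly one buffer
  return executable.Execute(args, options);
}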
@@ -1505,9 +1504,9 @@ PyLocalExecutable::ExecuteHelper( } return event_or.status(); } - auto definition_event = std::make_shared(); - definition_event->SetDefinitionEvent(event_or.ConsumeValueOrDie(), stream); - std::vector> outputs; + auto definition_event = std::make_shared(); + definition_event->SetSequencingEvent(event_or.ConsumeValueOrDie(), stream); + std::vector> outputs; if (options.untuple_result && result_buffer.on_host_shape().IsTuple()) { int tuple_count = result_buffer.on_host_shape().tuple_shapes_size(); outputs.reserve(tuple_count); @@ -1533,17 +1532,17 @@ PyLocalExecutable::ExecuteHelper( client_, device, device_state)); } - for (PyLocalBuffer::ScopedHold& b : device_buffers) { + for (PjRtBuffer::ScopedHold& b : device_buffers) { // prefer_to_retain_reference=false because when using the // ComputeSynchronized allocation model we don't need to retain a reference // to the device_buffer during execution because by definition the compute // stream is synchronized past the execution. - if (b.type() == PyLocalBuffer::ScopedHold::kUsage) { + if (b.type() == PjRtBuffer::ScopedHold::kUsage) { RecordUsage(std::move(b), device_state, device_state, definition_event, stream, /*prefer_to_retain_reference=*/false); } else { - CHECK(b.type() == PyLocalBuffer::ScopedHold::kDonation); + CHECK(b.type() == PjRtBuffer::ScopedHold::kDonation); b.ConfirmDonation(); } } @@ -1551,9 +1550,9 @@ PyLocalExecutable::ExecuteHelper( return outputs; } -StatusOr>> -PyLocalExecutable::Execute(absl::Span argument_handles, - const ExecuteOptions& options) const { +StatusOr>> PjRtExecutable::Execute( + absl::Span argument_handles, + const ExecuteOptions& options) const { if (num_replicas() != 1) { return InvalidArgument( "Attempted to execute computation with %d replicas using Execute()", @@ -1569,9 +1568,9 @@ PyLocalExecutable::Execute(absl::Span argument_handles, RunId(), options); } -StatusOr>> -PyLocalExecutable::ExecuteOnLocalDevice( - absl::Span argument_handles, Device* device, +StatusOr>> +PjRtExecutable::ExecuteOnLocalDevice( + absl::Span argument_handles, Device* device, const ExecuteOptions& options) const { for (int i = 0; i < local_devices_.size(); ++i) { if (local_devices_[i] == device) { @@ -1587,9 +1586,9 @@ PyLocalExecutable::ExecuteOnLocalDevice( device->id()); } -StatusOr>>> -PyLocalExecutable::ExecuteOnLocalDevices( - absl::Span> argument_handles, +StatusOr>>> +PjRtExecutable::ExecuteOnLocalDevices( + absl::Span> argument_handles, const ExecuteOptions& options) const { RunId run_id; tensorflow::profiler::TraceMe traceme([&] { @@ -1611,7 +1610,7 @@ PyLocalExecutable::ExecuteOnLocalDevices( << "; num_replicas=" << num_replicas() << " num_partitions=" << num_partitions() << " num_local_devices=" << num_local_devices; - std::vector>>> results( + std::vector>>> results( num_local_devices); if (num_local_devices == 1) { // Fast-path if there is only one device — run the computation on the @@ -1674,7 +1673,7 @@ PyLocalExecutable::ExecuteOnLocalDevices( } VLOG(1) << "Replicated execution complete."; - std::vector>> wrapped_results( + std::vector>> wrapped_results( num_local_devices); for (int i = 0; i < num_local_devices; ++i) { const int replica = local_logical_device_ids_[i].first; @@ -1693,9 +1692,96 @@ PyLocalExecutable::ExecuteOnLocalDevices( return wrapped_results; } -/*static*/ StatusOr> -PyLocalExecutable::Compile(const XlaComputation& computation, - PyLocalClient* client, CompileOptions options) { +namespace { + +StatusOr GetShardedShape(const Shape& shape, + const OpSharding& sharding) { 
+ if (sharding.type() == OpSharding::TUPLE) { + if (!shape.IsTuple()) { + return InvalidArgument( + "Got tuple OpSharding (%s) for non-tuple shape (%s)", + sharding.DebugString(), shape.ToString()); + } + if (sharding.tuple_shardings_size() != shape.tuple_shapes_size()) { + return InvalidArgument( + "Got mismatched OpSharding tuple size (%d) and shape tuple size (%d)." + " (OpSharding: %s, shape: %s)", + sharding.tuple_shardings_size(), shape.tuple_shapes_size(), + sharding.DebugString(), shape.ToString()); + } + std::vector sharded_subshapes; + for (int i = 0; i < shape.tuple_shapes_size(); ++i) { + TF_ASSIGN_OR_RETURN( + Shape sharded_subshape, + GetShardedShape(shape.tuple_shapes(i), sharding.tuple_shardings(i))); + sharded_subshapes.emplace_back(std::move(sharded_subshape)); + } + return ShapeUtil::MakeTupleShape(sharded_subshapes); + } + TF_ASSIGN_OR_RETURN(HloSharding hlo_sharding, + HloSharding::FromProto(sharding)); + return hlo_sharding.TileShape(shape); +} + +StatusOr GetShardedShape(const HloInstructionProto& instr) { + const Shape unsharded_shape(instr.shape()); + Shape sharded_shape; + if (instr.has_sharding()) { + TF_ASSIGN_OR_RETURN(sharded_shape, + GetShardedShape(unsharded_shape, instr.sharding())); + } else { + sharded_shape = unsharded_shape; + } + LayoutUtil::ClearLayout(&sharded_shape); + return sharded_shape; +} + +// Returns sharded (argument shapes, result shape) without layouts. +StatusOr, Shape>> GetShardedProgramShapes( + const XlaComputation& computation) { + TF_ASSIGN_OR_RETURN(ProgramShape program_shape, + computation.GetProgramShape()); + std::vector arg_shapes; + arg_shapes.resize(program_shape.parameters_size()); + Shape result_shape; + for (const HloComputationProto& comp : computation.proto().computations()) { + if (comp.id() != computation.proto().entry_computation_id()) { + continue; + } + for (const HloInstructionProto& instr : comp.instructions()) { + if (instr.opcode() == HloOpcodeString(HloOpcode::kParameter)) { + if (instr.parameter_number() >= program_shape.parameters_size()) { + return InvalidArgument( + "Got invalid parameter number %d, expected %d parameters", + instr.parameter_number(), program_shape.parameters_size()); + } + TF_ASSIGN_OR_RETURN(arg_shapes[instr.parameter_number()], + GetShardedShape(instr)); + } + if (instr.id() == comp.root_id()) { + if (result_shape.element_type() != PRIMITIVE_TYPE_INVALID) { + return InvalidArgument("Found multiple root instructions"); + } + TF_ASSIGN_OR_RETURN(result_shape, GetShardedShape(instr)); + } + } + } + for (int i = 0; i < arg_shapes.size(); ++i) { + if (arg_shapes[i].element_type() == PRIMITIVE_TYPE_INVALID) { + return InvalidArgument("Couldn't find parameter %d", i); + } + } + if (result_shape.element_type() == PRIMITIVE_TYPE_INVALID) { + return InvalidArgument("Couldn't find root instruction"); + } + return std::make_pair(arg_shapes, result_shape); +} + +} // namespace + +/*static*/ StatusOr> PjRtExecutable::Compile( + const XlaComputation& computation, PjRtClient* client, + CompileOptions options) { tensorflow::profiler::TraceMe traceme("LocalExecutable::Compile"); ExecutableBuildOptions& build_options = options.executable_build_options; @@ -1704,70 +1790,113 @@ PyLocalExecutable::Compile(const XlaComputation& computation, } if (!build_options.has_device_assignment()) { - VLOG(2) << "PyLocalExecutable::Compile using default device_assignment."; + VLOG(2) << "PjRtExecutable::Compile using default device_assignment."; TF_ASSIGN_OR_RETURN( DeviceAssignment device_assignment, 
client->GetDefaultDeviceAssignment(build_options.num_replicas(), build_options.num_partitions())); build_options.set_device_assignment(device_assignment); } - VLOG(2) << "PyLocalExecutable::Compile device_assignment:\n" + VLOG(2) << "PjRtExecutable::Compile device_assignment:\n" << build_options.device_assignment().ToString(); + TF_ASSIGN_OR_RETURN(ProgramShape program_shape, + computation.GetProgramShape()); if (!options.argument_layouts) { - TF_ASSIGN_OR_RETURN(ProgramShape program_shape, - computation.GetProgramShape()); options.argument_layouts = program_shape.parameters(); for (Shape& shape : *options.argument_layouts) { LayoutUtil::ClearLayout(&shape); } + } else if (options.argument_layouts->size() != + program_shape.parameters_size()) { + return InvalidArgument( + "CompileOptions specify %d argument layouts, but computation has %d " + "arguments", + options.argument_layouts->size(), program_shape.parameters_size()); } std::vector argument_layout_pointers; argument_layout_pointers.reserve(options.argument_layouts->size()); - // Assign a default layout to any array subshapes that are missing layouts. - auto assign_layouts = [client](Shape* shape) { + // Assign a default layout based on `sharded_shape` to any array subshapes in + // `dst_shape` that are missing layouts. + auto assign_layouts = [client](const Shape& sharded_shape, Shape* dst_shape) { return ShapeUtil::ForEachMutableSubshapeWithStatus( - shape, [&](Shape* subshape, const ShapeIndex&) { + dst_shape, [&](Shape* subshape, const ShapeIndex& idx) { if (subshape->IsArray() && !subshape->has_layout()) { + CHECK(ShapeUtil::IndexIsValid(sharded_shape, idx)); + const Shape& sharded_subshape = + ShapeUtil::GetSubshape(sharded_shape, idx); LayoutUtil::SetToDefaultLayout(subshape); - TF_ASSIGN_OR_RETURN(*subshape, - client->client() - ->backend() - .transfer_manager() - ->ChooseCompactLayoutForShape(*subshape)); + TF_ASSIGN_OR_RETURN(Shape layout, client->client() + ->backend() + .transfer_manager() + ->ChooseCompactLayoutForShape( + sharded_subshape)); + *subshape->mutable_layout() = layout.layout(); } return Status::OK(); }); }; + TF_ASSIGN_OR_RETURN(auto sharded_shapes, + GetShardedProgramShapes(computation)); - for (Shape& layout : *options.argument_layouts) { - argument_layout_pointers.push_back(&layout); - TF_RETURN_IF_ERROR(assign_layouts(&layout)); + CHECK_EQ(sharded_shapes.first.size(), options.argument_layouts->size()); + for (int i = 0; i < options.argument_layouts->size(); ++i) { + Shape* layout = &(*options.argument_layouts)[i]; + argument_layout_pointers.push_back(layout); + TF_RETURN_IF_ERROR(assign_layouts(sharded_shapes.first[i], layout)); } Shape result_layout; if (build_options.result_layout()) { result_layout = *build_options.result_layout(); } else { - TF_ASSIGN_OR_RETURN(ProgramShape program_shape, - computation.GetProgramShape()); result_layout = program_shape.result(); LayoutUtil::ClearLayout(&result_layout); } - TF_RETURN_IF_ERROR(assign_layouts(&result_layout)); + TF_RETURN_IF_ERROR(assign_layouts(sharded_shapes.second, &result_layout)); build_options.set_result_layout(result_layout); + const int num_replicas = build_options.device_assignment().replica_count(); + const int num_partitions = + build_options.device_assignment().computation_count(); + + std::vector> local_logical_device_ids; + std::vector local_devices; + for (int replica = 0; replica < num_replicas; ++replica) { + for (int partition = 0; partition < num_partitions; ++partition) { + int device_id = build_options.device_assignment()(replica, 
partition); + Device* device = LookupDevice(*client, device_id); + if (device->host_id() != client->host_id()) { + VLOG(3) << "Non-local device: " << device_id; + continue; + } + local_logical_device_ids.emplace_back(replica, partition); + local_devices.push_back(device); + } + } + if (local_devices.empty()) { + return InvalidArgument( + "Device assignment (%s) does not have any local devices.", + build_options.device_assignment().ToString()); + } + + if (build_options.device_ordinal() < 0) { + build_options.set_device_ordinal( + local_devices.front()->local_device_state()->device_ordinal()); + } + TF_ASSIGN_OR_RETURN( std::vector> local_executables, client->client()->Compile(computation, argument_layout_pointers, build_options)); - auto py_executable = absl::make_unique( - std::move(local_executables), options.tuple_arguments, - build_options.device_assignment(), client); - TF_RETURN_IF_ERROR( - py_executable->SetUpDonation(client, options.tuple_arguments)); + auto py_executable = absl::make_unique( + std::move(local_executables), options.parameter_is_tupled_arguments, + build_options.device_assignment(), std::move(local_logical_device_ids), + std::move(local_devices), client); + TF_RETURN_IF_ERROR(py_executable->SetUpDonation( + client, options.parameter_is_tupled_arguments)); return py_executable; } diff --git a/tensorflow/compiler/xla/python/local_client.h b/tensorflow/compiler/xla/pjrt/pjrt_client.h similarity index 78% rename from tensorflow/compiler/xla/python/local_client.h rename to tensorflow/compiler/xla/pjrt/pjrt_client.h index 2911ec12424..775b44c7073 100644 --- a/tensorflow/compiler/xla/python/local_client.h +++ b/tensorflow/compiler/xla/pjrt/pjrt_client.h @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TENSORFLOW_COMPILER_XLA_PYTHON_LOCAL_CLIENT_H_ -#define TENSORFLOW_COMPILER_XLA_PYTHON_LOCAL_CLIENT_H_ +#ifndef TENSORFLOW_COMPILER_XLA_PJRT_PJRT_CLIENT_H_ +#define TENSORFLOW_COMPILER_XLA_PJRT_PJRT_CLIENT_H_ #include #include @@ -29,8 +29,8 @@ limitations under the License. #include "tensorflow/compiler/xla/client/executable_build_options.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/client/xla_computation.h" -#include "tensorflow/compiler/xla/python/local_device_state.h" -#include "tensorflow/compiler/xla/python/shared_device_buffer.h" +#include "tensorflow/compiler/xla/pjrt/local_device_state.h" +#include "tensorflow/compiler/xla/pjrt/tracked_device_buffer.h" #include "tensorflow/compiler/xla/service/computation_placer.h" #include "tensorflow/compiler/xla/service/gpu/gpu_executable_run_options.h" #include "tensorflow/compiler/xla/service/shaped_buffer.h" @@ -43,20 +43,20 @@ limitations under the License. #include "tensorflow/core/platform/thread_annotations.h" // API notes: -// Despite having the name "PyLocalClient", it is intended that this API may -// also be consumed from C++. Python/pybind11/NumPy logic should therefore not -// be used in this API. +// PjRt stands for "Pretty much Just another RunTime". 
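A hedged sketch of how a client might drive the Compile path shown above: argument layouts are left unset so the sharded defaults computed in Compile are used, and the number of replicas is set through the embedded ExecutableBuildOptions. The helper name is hypothetical.

#include <memory>

#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/pjrt/pjrt_client.h"

xla::StatusOr<std::unique_ptr<xla::PjRtExecutable>> CompileForTwoReplicas(
    const xla::XlaComputation& computation, xla::PjRtClient* client) {
  xla::CompileOptions options;
  options.parameter_is_tupled_arguments = true;  // expect one tupled parameter
  options.executable_build_options.set_num_replicas(2);
  return xla::PjRtExecutable::Compile(computation, client, options);
}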
namespace xla { class Device { public: explicit Device(int id, std::unique_ptr local_device_state, - absl::string_view platform_name, int host_id = 0) + std::string platform_name, std::string device_kind, + int host_id = 0) : id_(id), local_device_state_(std::move(local_device_state)), host_id_(host_id), - platform_name_(platform_name) {} + platform_name_(std::move(platform_name)), + device_kind_(std::move(device_kind)) {} virtual ~Device() {} // The ID of this device. IDs are unique among devices of this type @@ -81,6 +81,9 @@ class Device { const std::string& platform_name() const { return platform_name_; } + // A vendor-dependent string that uniquely identifies the kind of device. + const std::string& device_kind() const { return device_kind_; } + virtual std::string DebugString() const; private: @@ -88,37 +91,38 @@ class Device { const std::unique_ptr local_device_state_; const int host_id_; const std::string platform_name_; + const std::string device_kind_; }; // Forward declaration. -class PyLocalBuffer; +class PjRtBuffer; // Helper struct for cross host transfers, returned by the callback from a call -// to PyLocalBuffer::MakeCrossHostReceiveBuffers. -struct PyLocalCrossHostRecvBuffer { +// to PjRtBuffer::MakeCrossHostReceiveBuffers. +struct PjRtCrossHostRecvBuffer { // serialized_descriptor should be transmitted to the sender and passed to a // call to src_buffer->CopyToRemoteDevice. std::string serialized_descriptor; // The buffer that will hold the result of the transfer. - std::unique_ptr buffer; + std::unique_ptr buffer; }; -using PyLocalCrossHostRecvNotifier = - std::function>&&)>; +using PjRtCrossHostRecvNotifier = + std::function>&&)>; // Encapsulates the state of Python session with XLA. // -// It is the responsibility of the client of this API to keep the PyLocalClient +// It is the responsibility of the client of this API to keep the PjRtClient // alive as long as any of the other runtime objects are alive. -class PyLocalClient : public std::enable_shared_from_this { +class PjRtClient : public std::enable_shared_from_this { public: // `allocator` may null, in which case the platform default allocator is used. - explicit PyLocalClient( + explicit PjRtClient( std::string platform_name, LocalClient* client, std::vector> devices, int host_id, std::unique_ptr allocator, std::unique_ptr host_memory_allocator, std::unique_ptr gpu_run_options); - virtual ~PyLocalClient() = default; + virtual ~PjRtClient() = default; virtual StatusOr GetDefaultDeviceAssignment( int num_replicas, int num_partitions) const; @@ -164,15 +168,15 @@ class PyLocalClient : public std::enable_shared_from_this { const LocalExecutable& executable, bool tuple_inputs) const; protected: - friend class PyLocalBuffer; + friend class PjRtBuffer; virtual void EnqueueCrossHostReceive( - std::vector>&& buffers, - PyLocalCrossHostRecvNotifier&& notifier) const { + std::vector>&& buffers, + PjRtCrossHostRecvNotifier&& notifier) const { notifier(Unimplemented("Cross host receives not implemented.")); } virtual Status CopyToRemoteDevice( - PyLocalBuffer* buffer, absl::string_view serialized_descriptor) const { + PjRtBuffer* buffer, absl::string_view serialized_descriptor) const { return Unimplemented("Cross host sends not implemented."); } @@ -205,24 +209,24 @@ class PyLocalClient : public std::enable_shared_from_this { StatusOr DevicesToDeviceAssignment( absl::Span> devices); -// Holds a reference from Python to a tuple of device buffers. A PyLocalBuffer +// Holds a reference from Python to a tuple of device buffers. 
A PjRtBuffer // can be either valid or invalid. An invalid buffer is one that has never been // initialized, or a buffer that has been deleted (e.g., by calling Delete, or // by donating it to a computation that aliases an input parameter to an -// output). We allow PyLocalBuffer objects to outlive the underlying device +// output). We allow PjRtBuffer objects to outlive the underlying device // buffers so we can decouple buffer lifetimes from the corresponding Python // references if needed. Thread-safe. -class PyLocalBuffer { +class PjRtBuffer { public: - // Helper class to retain a "hold" on a PyLocalBuffer. A ScopedHold may not - // outlive its parent PyLocalBuffer. + // Helper class to retain a "hold" on a PjRtBuffer. A ScopedHold may not + // outlive its parent PjRtBuffer. // // There are three types of hold, as follows: // // 1) Usage hold: a transient hold while an operation using the buffer is // being enqueued onto a stream. // A client acquires a usage hold by calling - // PyLocalBuffer::GetBufferWithHold(kUsage) or the convenience wrapper + // PjRtBuffer::GetBufferWithHold(kUsage) or the convenience wrapper // GetBufferWithUsageHold(). If the enqueue completes successfully the hold // should be released using a call to ConvertUsageHold. If the ScopedHold is // deleted without ConvertUsageHold being called, e.g., on error, the hold is @@ -233,16 +237,16 @@ class PyLocalBuffer { // 2) External hold: a potentially long-lived hold while the buffer is being // shared by an external framework, e.g., NumPy. // A client acquires an external hold by calling - // PyLocalBuffer::GetBufferWithHold(kExternal) or the convenience wrapper + // PjRtBuffer::GetBufferWithHold(kExternal) or the convenience wrapper // GetBufferWithExternalReference and releases it by deleting the ScopedHold. // The external framework should not modify the underlying buffer unless it is // confident via its own synchronization that modifications do not race with - // reads from the PyLocalBuffer. + // reads from the PjRtBuffer. // // 3) Donation hold: a transient hold while an execution that donates the // buffer is being enqueued onto the compute stream. // A client acquires a donation hold by calling - // PyLocalBuffer::GetBufferWithHold(kDonation). If the enqueue completes + // PjRtBuffer::GetBufferWithHold(kDonation). If the enqueue completes // successfully the hold should be released using a call to ConfirmDonation // after which the buffer is invalid. If the ScopedHold is deleted without // ConfirmDonation being called, e.g., on error, the hold is dropped and the @@ -256,8 +260,8 @@ class PyLocalBuffer { // will block if there are any outstanding usage holds until those holds are // dropped or converted. // - // Calls to PyLocalBuffer::Release (and transitively to - // PyLocalBuffer::Delete() and ~PyLocalBuffer()) will block until all usage + // Calls to PjRtBuffer::Release (and transitively to + // PjRtBuffer::Delete() and ~PjRtBuffer()) will block until all usage // and donation holds are either deleted or converted/confirmed. class ScopedHold { public: @@ -274,12 +278,12 @@ class PyLocalBuffer { bool ok() const { return buffer_or_.ok(); } // Access to the underlying device buffer storage. Requires this->ok(). 
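Of the three hold types described above, the external hold is the only potentially long-lived one. A hedged sketch of acquiring and releasing it via the convenience wrapper named in the comments; the framework-side handoff is only indicated in comments and the function name is hypothetical:

#include "tensorflow/compiler/xla/pjrt/pjrt_client.h"

// Keep the device memory alive while an external consumer reads it; the hold
// is dropped when `hold` is destroyed at the end of the scope.
void ShareWithExternalFramework(xla::PjRtBuffer* buffer) {
  xla::PjRtBuffer::ScopedHold hold = buffer->GetBufferWithExternalReference();
  if (!hold.ok()) {
    return;  // the buffer was already deleted or donated
  }
  // hold.buffer() exposes the TrackedDeviceBuffer; hand its device memory to
  // the external framework here. The framework must not keep using that
  // memory after `hold` is gone unless it takes its own reference.
}  // external hold released here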
- const std::shared_ptr& buffer() const { + const std::shared_ptr& buffer() const { CHECK_NE(buffer_or_.ValueOrDie(), nullptr); return buffer_or_.ValueOrDie(); } - SharedDeviceBuffer* operator->() const { return buffer().get(); } - const SharedDeviceBuffer& operator*() const { return *buffer(); } + TrackedDeviceBuffer* operator->() const { return buffer().get(); } + const TrackedDeviceBuffer& operator*() const { return *buffer(); } // Converts the hold into a usage event. Only valid for holds of type // kUsage. @@ -292,7 +296,7 @@ class PyLocalBuffer { // the host is sure that the usage (transfer or execution) // has completed. void ConvertUsageHold(se::Stream* usage_stream, - std::shared_ptr event, + std::shared_ptr event, bool reference_held); // Confirms that the buffer was successfully donated to an execution. @@ -304,7 +308,7 @@ class PyLocalBuffer { // buffers to an ExecutionInput. We require but do not verify that // 'iterator' when passed in is pointing to a sub-tuple of the // ExecutionInput whose on_device_shape matches that of the - // SharedDeviceBuffer. 'end' is used to check that 'iterator' doesn't run + // TrackedDeviceBuffer. 'end' is used to check that 'iterator' doesn't run // out of bounds. Donates the device buffers if the hold type is kDonation, // otherwise retains ownership of the device buffers. void AddToInput(ShapeTree::iterator* iterator, @@ -313,16 +317,15 @@ class PyLocalBuffer { se::DeviceMemoryAllocator* allocator) const; private: - friend class PyLocalBuffer; + friend class PjRtBuffer; // Helper struct that makes it possible to move a ScopedHold through a // closure. using ForClosure = - std::tuple>>; + std::tuple>>; - ScopedHold(PyLocalBuffer* parent, Type type) - : parent_(parent), type_(type) { + ScopedHold(PjRtBuffer* parent, Type type) : parent_(parent), type_(type) { SetError(InvalidArgument("Buffer has not been initialized")); } explicit ScopedHold(const ForClosure& closure_helper) @@ -337,18 +340,18 @@ class PyLocalBuffer { void SetError(Status s) { buffer_or_ = s; } // Sets buffer_or_. Called by parent_ to initialize the hold. - void Acquire(StatusOr>&& buffer_or); + void Acquire(StatusOr>&& buffer_or); // Releases the contents of *this, so *this can subsequently be // deleted without releasing the parent's hold. Should be passed to the // appropriate constructor of another ScopedHold, e.g., when a hold must be // passed through a closure that is incompatible with std::move. ForClosure ToClosure(); - PyLocalBuffer* const parent_; + PjRtBuffer* const parent_; const Type type_; // There is an invariant that if buffer_or_.ok() then // buffer_or_.ValueOrDie() != nullptr. - StatusOr> buffer_or_; + StatusOr> buffer_or_; }; // If `force_copy` is true, forces a copy of the input buffer on CPU. @@ -356,45 +359,45 @@ class PyLocalBuffer { // `buffer_reference` is an optional shared pointer that should be kept alive // by the runtime as long as the contents of `data` may still be accessed by // the runtime (may be nullptr). - static StatusOr> FromHostBuffer( + static StatusOr> FromHostBuffer( const void* data, const Shape& shape, bool force_copy, - std::shared_ptr buffer_reference, PyLocalClient* client, + std::shared_ptr buffer_reference, PjRtClient* client, Device* device); // Note that literal must remain in scope until the transfer has completed, so // the caller should, for example, wait for BlockHostUntilReady() completes on // the return value before letting literal go out of scope. 
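Per the comment above, the literal passed to FromHostLiteral must stay alive until the transfer has completed. A hedged sketch that blocks before the literal goes out of scope (the `UploadVector` name is hypothetical):

#include <memory>
#include <utility>

#include "tensorflow/compiler/xla/literal_util.h"
#include "tensorflow/compiler/xla/pjrt/pjrt_client.h"

xla::StatusOr<std::unique_ptr<xla::PjRtBuffer>> UploadVector(
    xla::PjRtClient* client, xla::Device* device) {
  xla::Literal literal = xla::LiteralUtil::CreateR1<float>({1.f, 2.f, 3.f});
  xla::StatusOr<std::unique_ptr<xla::PjRtBuffer>> buffer_or =
      xla::PjRtBuffer::FromHostLiteral(literal, client, device);
  if (!buffer_or.ok()) return buffer_or.status();
  std::unique_ptr<xla::PjRtBuffer> buffer = buffer_or.ConsumeValueOrDie();
  // `literal` is destroyed at the end of this function, so wait for the
  // host-to-device transfer to finish before returning.
  xla::Status ready = buffer->BlockHostUntilReady();
  if (!ready.ok()) return ready;
  return std::move(buffer);
}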
- static StatusOr> FromHostLiteral( - const LiteralSlice& literal, PyLocalClient* client, Device* device); + static StatusOr> FromHostLiteral( + const LiteralSlice& literal, PjRtClient* client, Device* device); - // Asynchronously makes a vector of PyLocalBuffers that can be used to receive + // Asynchronously makes a vector of PjRtBuffers that can be used to receive // cross host transfers using `client` on `device'. `shapes` must be the exact // shapes, with identical layouts, corresponding to the buffers that will be // sent. When resources for the transfer are available, notifier will be - // called with a vector of PyLocalCrossHostRecvBuffer structs, one for each + // called with a vector of PjRtCrossHostRecvBuffer structs, one for each // shape in `shapes`. Each struct contains a buffer that will contain the // received value, and an opaque string that should be transmitted to the // sending host and used in a call to CopyToRemoteDevice. None of the recv // buffers will become ready until *all* of the sends have completed. - static void MakeCrossHostReceiveBuffers( - absl::Span shapes, PyLocalClient* client, Device* device, - PyLocalCrossHostRecvNotifier&& notifier); + static void MakeCrossHostReceiveBuffers(absl::Span shapes, + PjRtClient* client, Device* device, + PjRtCrossHostRecvNotifier&& notifier); - PyLocalBuffer(Shape on_host_shape, Shape on_device_shape, - std::shared_ptr device_buffer, - PyLocalClient* client, Device* device); - ~PyLocalBuffer(); + PjRtBuffer(Shape on_host_shape, Shape on_device_shape, + std::shared_ptr device_buffer, + PjRtClient* client, Device* device); + ~PjRtBuffer(); - PyLocalBuffer(const PyLocalBuffer&) = delete; - PyLocalBuffer(PyLocalBuffer&&) = delete; - PyLocalBuffer& operator=(const PyLocalBuffer&) = delete; - PyLocalBuffer& operator=(PyLocalBuffer&&) = delete; + PjRtBuffer(const PjRtBuffer&) = delete; + PjRtBuffer(PjRtBuffer&&) = delete; + PjRtBuffer& operator=(const PjRtBuffer&) = delete; + PjRtBuffer& operator=(PjRtBuffer&&) = delete; const Shape& on_host_shape() const { return on_host_shape_; } const Shape& on_device_shape() const { return on_device_shape_; } Device* device() const { return device_; } const std::string& platform_name() const { return client_->platform_name(); } - PyLocalClient* client() const { return client_; } + PjRtClient* client() const { return client_; } bool IsEmptyTuple() const { return on_host_shape_.IsTuple() && on_host_shape_.tuple_shapes_size() == 0; } @@ -415,14 +418,14 @@ class PyLocalBuffer { // semantics of the underlying platform. Delete may briefly block if another // thread is in the process of enqueuing an operation on this buffer, but it // will never block for a stream operation to complete. If an external - // framework holds a reference to the SharedDeviceBuffer via + // framework holds a reference to the TrackedDeviceBuffer via // GetBufferWithExternalReference, the memory will not be freed until the // external framework drops the reference. void Delete(); // Similar to Delete, drops the buffer's reference to its associated device // memory, leaving the buffer in an invalid state, but returns the - // SharedDeviceBuffer rather than freeing the device memory, so that another + // TrackedDeviceBuffer rather than freeing the device memory, so that another // framework can take ownership of it. The buffer returned from Release may // be safely dropped at any time even if it still has pending async // operations. 
The client should call BlockHostUntilReady before calling @@ -434,17 +437,17 @@ class PyLocalBuffer { // If the buffer was shared via an external reference it is the client's // responsibility that accesses via that reference do not interfere with // accesses via the buffer returned from Release. - StatusOr> Release( + StatusOr> Release( bool wait_for_operations_to_complete); // True if and only if Delete or Release has previously been called. bool IsDeleted(); - // Returns a view of the PyLocalBuffer device memory as a ShapedBuffer. The - // PyLocalBuffer retains ownership of the device buffers. + // Returns a view of the PjRtBuffer device memory as a ShapedBuffer. The + // PjRtBuffer retains ownership of the device buffers. StatusOr AsShapedBuffer() const; - // Returns a hold on the SharedDeviceBuffer holding the device + // Returns a hold on the TrackedDeviceBuffer holding the device // buffers. See comment on ScopedHold. ScopedHold GetBufferWithHold(ScopedHold::Type type); ScopedHold GetBufferWithUsageHold() { @@ -456,7 +459,7 @@ class PyLocalBuffer { // Copies the buffer to device `dst_device`. Returns an error if the buffer is // already on dst_device. - StatusOr> CopyToDevice(Device* dst_device); + StatusOr> CopyToDevice(Device* dst_device); // Copies the buffer to the remote device encoded in serialized_descriptor. // This call must be preceded by a call to MakeCrossHostReceiveBuffers on the @@ -474,10 +477,10 @@ class PyLocalBuffer { Status BlockHostUntilReady(); private: - friend class PyLocalClient; + friend class PjRtClient; // The cached value of the buffer on the host, produced either from a call to // CopyToHost or from a call to ToLiteral. Once a value has been fetched to - // the host, it persists Delete() is called or the PyLocalBuffer is destroyed. + // the host, it persists Delete() is called or the PjRtBuffer is destroyed. struct HostValue { absl::Notification ready; // status and value are valid for reading only after `ready` has been @@ -495,7 +498,7 @@ class PyLocalBuffer { // Adds a hold of 'type' and returns device_buffer_. Returns an error if // device_buffer_ is null, or if a donation hold was requested when there is // an outstanding external hold. - StatusOr> GetBufferForHoldLocked( + StatusOr> GetBufferForHoldLocked( ScopedHold::Type type) ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_); // Adds a hold of hold->type() and initializes `hold` with device_buffer_. @@ -506,33 +509,33 @@ class PyLocalBuffer { // Drops a usage hold and calls device_buffer_->AddUsageEvent. Does a sanity // check that buffer==device_buffer_ or device_buffer_==nullptr. Called after // device_buffer_ was successfully enqueued on a stream. - void ConvertUsageHold(SharedDeviceBuffer* buffer, se::Stream* usage_stream, - std::shared_ptr event, + void ConvertUsageHold(TrackedDeviceBuffer* buffer, se::Stream* usage_stream, + std::shared_ptr event, bool reference_held); // Drops a donation hold and makes *this invalid for further use. Does a // sanity check that buffer==device_buffer_. Called after device_buffer_ was // successfully donated to an execution. - void ConfirmDonation(SharedDeviceBuffer* device_buffer); + void ConfirmDonation(TrackedDeviceBuffer* device_buffer); // Drops a hold without taking any other action. Does a sanity check that // buffer==device_buffer_ or device_buffer_==nullptr. 
- void DropHold(ScopedHold::Type type, SharedDeviceBuffer* buffer); + void DropHold(ScopedHold::Type type, TrackedDeviceBuffer* buffer); - StatusOr, - std::shared_ptr>> + StatusOr, + std::shared_ptr>> CopyToDeviceHelper(Device* dst_device, LocalDeviceState* dst_local_device, LocalDeviceState* transfer_local_device, se::Stream* transfer_stream, - std::shared_ptr src_device_buffer); + std::shared_ptr src_device_buffer); - PyLocalClient* const client_; + PjRtClient* const client_; const Shape on_host_shape_; const Shape on_device_shape_; Device* const device_; mutable absl::Mutex mu_; - std::shared_ptr device_buffer_ TF_GUARDED_BY(mu_); + std::shared_ptr device_buffer_ TF_GUARDED_BY(mu_); std::shared_ptr host_value_ TF_GUARDED_BY(mu_); // Count of holds on the buffer. std::array holds_ TF_GUARDED_BY(mu_); @@ -544,15 +547,20 @@ struct CompileOptions { // The layouts of the arguments that the computation should expect. absl::optional> argument_layouts; - // If true, the arguments to the computation will be wrapped in a tuple and - // passed as a single parameter. - bool tuple_arguments = false; + // If true, the supplied computation expects its arguments to be wrapped in a + // tuple and passed as a single parameter. + bool parameter_is_tupled_arguments = false; // XLA's compilation time options. ExecutableBuildOptions executable_build_options; }; struct ExecuteOptions { + // If true, the client must pass a single PjRtBuffer which contains all of + // the arguments as a single XLA tuple, otherwise each argument must be + // passed in its own PjRtBuffer. May only be true if the executable was + // compiled with parameter_is_tupled_arguments==true. + bool arguments_are_tupled = false; // If true, the computation must return a tuple, which will be destructured // into its elements. bool untuple_result = false; @@ -563,17 +571,19 @@ struct ExecuteOptions { // partition, as specified by the build options). If any input/output alias // has been specified in the computation, the parameter containing the input // buffer will be donated when passed to the execution. -class PyLocalExecutable { +class PjRtExecutable { public: - static StatusOr> Compile( - const XlaComputation& computation, PyLocalClient* client, + static StatusOr> Compile( + const XlaComputation& computation, PjRtClient* client, CompileOptions options); - PyLocalExecutable(std::vector> executables, - bool tuple_arguments, DeviceAssignment device_assignment, - PyLocalClient* client); + PjRtExecutable(std::vector> executables, + bool parameter_is_tupled_arguments, + DeviceAssignment device_assignment, + std::vector> local_logical_device_ids, + std::vector local_devices, PjRtClient* client); - PyLocalClient* client() const { return client_; } + PjRtClient* client() const { return client_; } int num_replicas() const { return executables_[0]->build_options().num_replicas(); @@ -605,21 +615,21 @@ class PyLocalExecutable { const std::vector& local_devices() const { return local_devices_; } - StatusOr>> Execute( - absl::Span argument_handles, + StatusOr>> Execute( + absl::Span argument_handles, const ExecuteOptions& options) const; - StatusOr>> ExecuteOnLocalDevice( - absl::Span argument_handles, Device* device, + StatusOr>> ExecuteOnLocalDevice( + absl::Span argument_handles, Device* device, const ExecuteOptions& options) const; // Execute on local devices. Takes a sequence of argument lists (one argument // list per local device) and returns a tuple of results (one result per local // device). 
The number of argument lists must be equal to the local device // count. - StatusOr>>> + StatusOr>>> ExecuteOnLocalDevices( - absl::Span> argument_handles, + absl::Span> argument_handles, const ExecuteOptions& options) const; void Delete() { executables_.clear(); } @@ -629,20 +639,20 @@ class PyLocalExecutable { private: // Initializes information about which arguments to which executables must be // donated due to aliases that were specified by the computation. - Status SetUpDonation(PyLocalClient* client, bool tuple_inputs); + Status SetUpDonation(PjRtClient* client, bool tuple_inputs); StatusOr EnqueueExecution( - absl::Span argument_handles, int replica, + absl::Span argument_handles, int replica, int partition, int executable_idx, const RunId& run_id, const ExecuteOptions& options, Device* device, - std::vector* device_buffers) const; - StatusOr>> ExecuteHelper( - absl::Span argument_handles, int replica, + std::vector* device_buffers) const; + StatusOr>> ExecuteHelper( + absl::Span argument_handles, int replica, int partition, const RunId& run_id, const ExecuteOptions& options) const; // Create shared pointers so we can free them after the execution: with // asynchronous execution, the process being executed can outlive the // executable itself. - PyLocalClient* const client_; + PjRtClient* const client_; // One executable per partition. std::vector> executables_; // Per-executable set of parameters that have any aliased buffers and thus @@ -652,7 +662,7 @@ class PyLocalExecutable { // True if the executables were compiled expecting arguments in a single // tuple. - const bool tuple_arguments_; + const bool parameter_is_tupled_arguments_; // The replica and partition indices of device_assignment_ to be run by this // client. On single-host platforms without partitioning, this is all replicas @@ -671,4 +681,4 @@ class PyLocalExecutable { } // namespace xla -#endif // TENSORFLOW_COMPILER_XLA_PYTHON_LOCAL_CLIENT_H_ +#endif // TENSORFLOW_COMPILER_XLA_PJRT_PJRT_CLIENT_H_ diff --git a/tensorflow/compiler/xla/python/semaphore.cc b/tensorflow/compiler/xla/pjrt/semaphore.cc similarity index 97% rename from tensorflow/compiler/xla/python/semaphore.cc rename to tensorflow/compiler/xla/pjrt/semaphore.cc index 5926618bddc..c1df52acc61 100644 --- a/tensorflow/compiler/xla/python/semaphore.cc +++ b/tensorflow/compiler/xla/pjrt/semaphore.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/compiler/xla/python/semaphore.h" +#include "tensorflow/compiler/xla/pjrt/semaphore.h" #include "tensorflow/core/platform/logging.h" diff --git a/tensorflow/compiler/xla/python/semaphore.h b/tensorflow/compiler/xla/pjrt/semaphore.h similarity index 92% rename from tensorflow/compiler/xla/python/semaphore.h rename to tensorflow/compiler/xla/pjrt/semaphore.h index 7d3e9ce6271..45345becf74 100644 --- a/tensorflow/compiler/xla/python/semaphore.h +++ b/tensorflow/compiler/xla/pjrt/semaphore.h @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#ifndef TENSORFLOW_COMPILER_XLA_PYTHON_SEMAPHORE_H_ -#define TENSORFLOW_COMPILER_XLA_PYTHON_SEMAPHORE_H_ +#ifndef TENSORFLOW_COMPILER_XLA_PJRT_SEMAPHORE_H_ +#define TENSORFLOW_COMPILER_XLA_PJRT_SEMAPHORE_H_ #include "absl/synchronization/mutex.h" #include "tensorflow/compiler/xla/types.h" @@ -65,4 +65,4 @@ class Semaphore { } // namespace xla -#endif // TENSORFLOW_COMPILER_XLA_PYTHON_SEMAPHORE_H_ +#endif // TENSORFLOW_COMPILER_XLA_PJRT_SEMAPHORE_H_ diff --git a/tensorflow/compiler/xla/python/semaphore_test.cc b/tensorflow/compiler/xla/pjrt/semaphore_test.cc similarity index 97% rename from tensorflow/compiler/xla/python/semaphore_test.cc rename to tensorflow/compiler/xla/pjrt/semaphore_test.cc index 5ef59618b8b..56f7e8c9a05 100644 --- a/tensorflow/compiler/xla/python/semaphore_test.cc +++ b/tensorflow/compiler/xla/pjrt/semaphore_test.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/compiler/xla/python/semaphore.h" +#include "tensorflow/compiler/xla/pjrt/semaphore.h" #include "absl/synchronization/notification.h" #include "tensorflow/compiler/xla/test.h" diff --git a/tensorflow/compiler/xla/python/shared_device_buffer.cc b/tensorflow/compiler/xla/pjrt/tracked_device_buffer.cc similarity index 78% rename from tensorflow/compiler/xla/python/shared_device_buffer.cc rename to tensorflow/compiler/xla/pjrt/tracked_device_buffer.cc index e4f57752dcc..32ca4e4550c 100644 --- a/tensorflow/compiler/xla/python/shared_device_buffer.cc +++ b/tensorflow/compiler/xla/pjrt/tracked_device_buffer.cc @@ -13,13 +13,13 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/compiler/xla/python/shared_device_buffer.h" +#include "tensorflow/compiler/xla/pjrt/tracked_device_buffer.h" #include #include #include "absl/synchronization/mutex.h" -#include "tensorflow/compiler/xla/python/local_device_state.h" +#include "tensorflow/compiler/xla/pjrt/local_device_state.h" #include "tensorflow/compiler/xla/service/shaped_buffer.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/stream_executor/device_memory.h" @@ -29,7 +29,7 @@ limitations under the License. namespace xla { -void BufferDefinitionEvent::SetDefinitionEvent(EventPool::Handle event, +void BufferSequencingEvent::SetSequencingEvent(EventPool::Handle event, se::Stream* stream) { absl::MutexLock lock(&mu_); CHECK(!event_.event()); @@ -38,23 +38,23 @@ void BufferDefinitionEvent::SetDefinitionEvent(EventPool::Handle event, streams_defined_on_.push_back(stream); } -bool BufferDefinitionEvent::EventHasBeenRecorded() const { +bool BufferSequencingEvent::EventHasBeenRecorded() const { return event_.event() != nullptr; } -uint64 BufferDefinitionEvent::sequence_number() const { +uint64 BufferSequencingEvent::sequence_number() const { absl::MutexLock lock(&mu_); CHECK(EventHasBeenRecorded()); return event_.sequence_number(); } -void BufferDefinitionEvent::WaitForEventOnStream(se::Stream* stream) { +void BufferSequencingEvent::WaitForEventOnStream(se::Stream* stream) { absl::MutexLock lock(&mu_); // We cannot wait for an event until ThenRecordEvent has been called; on GPU // newly created events are deemed to have already happened past. 
mu_.Await( - absl::Condition(this, &BufferDefinitionEvent::EventHasBeenRecorded)); + absl::Condition(this, &BufferSequencingEvent::EventHasBeenRecorded)); // The set of defined streams is expected to be very small indeed (usually // 1-2), so a simple linear scan should be fast enough. @@ -68,13 +68,13 @@ void BufferDefinitionEvent::WaitForEventOnStream(se::Stream* stream) { streams_defined_on_.push_back(stream); } -bool BufferDefinitionEvent::DefinedOn(se::Stream* stream) { +bool BufferSequencingEvent::DefinedOn(se::Stream* stream) { absl::MutexLock lock(&mu_); // We cannot wait for an event until ThenRecordEvent has been called; on GPU // newly created events are deemed to have already happened past. mu_.Await( - absl::Condition(this, &BufferDefinitionEvent::EventHasBeenRecorded)); + absl::Condition(this, &BufferSequencingEvent::EventHasBeenRecorded)); // The set of defined streams is expected to be very small indeed (usually // 1-2), so a simple linear scan should be fast enough. @@ -82,21 +82,21 @@ bool BufferDefinitionEvent::DefinedOn(se::Stream* stream) { stream) != streams_defined_on_.end(); } -bool BufferDefinitionEvent::IsComplete() { +bool BufferSequencingEvent::IsComplete() { absl::MutexLock lock(&mu_); // We cannot wait for an event until ThenRecordEvent has been called; on // GPU newly created events are deemed to have already happened past. mu_.Await( - absl::Condition(this, &BufferDefinitionEvent::EventHasBeenRecorded)); + absl::Condition(this, &BufferSequencingEvent::EventHasBeenRecorded)); return event_.event()->PollForStatus() == se::Event::Status::kComplete; } -/* static */ std::shared_ptr -SharedDeviceBuffer::FromScopedShapedBuffer( +/* static */ std::shared_ptr +TrackedDeviceBuffer::FromScopedShapedBuffer( ScopedShapedBuffer* shaped_buffer, - absl::Span> + absl::Span> definition_events) { ShapeTree::iterator iterator = shaped_buffer->buffers().begin(); @@ -111,15 +111,15 @@ SharedDeviceBuffer::FromScopedShapedBuffer( ++iterator; }); CHECK(iterator == shaped_buffer->buffers().end()); - return std::make_shared( + return std::make_shared( shaped_buffer->memory_allocator(), shaped_buffer->device_ordinal(), absl::Span(buffers), definition_events, /*on_delete_callback=*/nullptr); } -ShapedBuffer SharedDeviceBuffer::AsShapedBuffer(const Shape& on_host_shape, - const Shape& on_device_shape, - se::Platform* platform) const { +ShapedBuffer TrackedDeviceBuffer::AsShapedBuffer(const Shape& on_host_shape, + const Shape& on_device_shape, + se::Platform* platform) const { ShapedBuffer shaped_buffer(on_host_shape, on_device_shape, platform, device_ordinal_); ShapeTree::iterator iterator = @@ -136,7 +136,7 @@ ShapedBuffer SharedDeviceBuffer::AsShapedBuffer(const Shape& on_host_shape, // See comment on ExecutionInput in xla/service/executable.h to understand // the meaning of owned/unowned in that class. 
-void SharedDeviceBuffer::AddToInputAsImmutable( +void TrackedDeviceBuffer::AddToInputAsImmutable( ShapeTree::iterator* iterator, const ShapeTree::iterator& end) const { for (const se::DeviceMemoryBase& buf : device_memory_) { @@ -147,7 +147,7 @@ void SharedDeviceBuffer::AddToInputAsImmutable( } } -void SharedDeviceBuffer::AddToInputAsDonated( +void TrackedDeviceBuffer::AddToInputAsDonated( ShapeTree::iterator* iterator, const ShapeTree::iterator& end, ExecutionInput* execution_input, @@ -165,14 +165,14 @@ void SharedDeviceBuffer::AddToInputAsDonated( namespace { using MoveIterator = - absl::Span>::iterator; + absl::Span>::iterator; } // namespace -SharedDeviceBuffer::SharedDeviceBuffer( +TrackedDeviceBuffer::TrackedDeviceBuffer( se::DeviceMemoryAllocator* allocator, int device_ordinal, absl::Span device_memory, - absl::Span> definition_events, + absl::Span> definition_events, std::function on_delete_callback) : allocator_(allocator), device_ordinal_(device_ordinal), @@ -183,7 +183,7 @@ SharedDeviceBuffer::SharedDeviceBuffer( in_use_(true), on_delete_callback_(std::move(on_delete_callback)) {} -SharedDeviceBuffer::~SharedDeviceBuffer() { +TrackedDeviceBuffer::~TrackedDeviceBuffer() { if (allocator_) { for (const se::DeviceMemoryBase& buffer : device_memory_) { Status status = allocator_->Deallocate(device_ordinal_, buffer); @@ -197,8 +197,8 @@ SharedDeviceBuffer::~SharedDeviceBuffer() { } } -void SharedDeviceBuffer::AddUsageEvent( - se::Stream* usage_stream, std::shared_ptr event, +void TrackedDeviceBuffer::AddUsageEvent( + se::Stream* usage_stream, std::shared_ptr event, bool reference_held) { CHECK(in_use_); @@ -214,16 +214,16 @@ void SharedDeviceBuffer::AddUsageEvent( usage_events_.push_back({usage_stream, event, reference_held}); } -SharedDeviceBuffer::StreamAndEventContainer -SharedDeviceBuffer::LockUseAndTransferUsageEvents() { +TrackedDeviceBuffer::StreamAndEventContainer +TrackedDeviceBuffer::LockUseAndTransferUsageEvents() { CHECK(in_use_); in_use_ = false; return std::move(usage_events_); } void GetDeviceBufferEvents( - const SharedDeviceBuffer& buffer, bool get_usage_events, - absl::flat_hash_set* events) { + const TrackedDeviceBuffer& buffer, bool get_usage_events, + absl::flat_hash_set* events) { if (get_usage_events) { for (const auto& e : buffer.usage_events()) { events->insert(e.event.get()); @@ -235,11 +235,11 @@ void GetDeviceBufferEvents( } } -void WaitForBufferDefinitionEventsOnStream(const SharedDeviceBuffer& buffer, +void WaitForBufferDefinitionEventsOnStream(const TrackedDeviceBuffer& buffer, se::Stream* stream) { - absl::flat_hash_set events; + absl::flat_hash_set events; GetDeviceBufferEvents(buffer, /*get_usage_events=*/false, &events); - for (BufferDefinitionEvent* event : events) { + for (BufferSequencingEvent* event : events) { event->WaitForEventOnStream(stream); } } diff --git a/tensorflow/compiler/xla/python/shared_device_buffer.h b/tensorflow/compiler/xla/pjrt/tracked_device_buffer.h similarity index 76% rename from tensorflow/compiler/xla/python/shared_device_buffer.h rename to tensorflow/compiler/xla/pjrt/tracked_device_buffer.h index 4a5f8d82abd..562cb2f913e 100644 --- a/tensorflow/compiler/xla/python/shared_device_buffer.h +++ b/tensorflow/compiler/xla/pjrt/tracked_device_buffer.h @@ -13,14 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#ifndef TENSORFLOW_COMPILER_XLA_PYTHON_SHARED_DEVICE_BUFFER_H_ -#define TENSORFLOW_COMPILER_XLA_PYTHON_SHARED_DEVICE_BUFFER_H_ +#ifndef TENSORFLOW_COMPILER_XLA_PJRT_TRACKED_DEVICE_BUFFER_H_ +#define TENSORFLOW_COMPILER_XLA_PJRT_TRACKED_DEVICE_BUFFER_H_ #include #include "absl/container/flat_hash_set.h" -#include "tensorflow/compiler/xla/python/event_pool.h" -#include "tensorflow/compiler/xla/python/local_device_state.h" +#include "tensorflow/compiler/xla/pjrt/event_pool.h" +#include "tensorflow/compiler/xla/pjrt/local_device_state.h" #include "tensorflow/compiler/xla/service/shaped_buffer.h" #include "tensorflow/compiler/xla/service/transfer_manager.h" #include "tensorflow/compiler/xla/shape.h" @@ -31,8 +31,8 @@ limitations under the License. namespace xla { -// A BufferDefinitionEvent describes whether a buffer is valid from the -// viewpoint of each of stream that may access it. +// A BufferSequencingEvent keeps track of dependencies of a buffer on each +// stream it has been used on. // // Each logical buffer in an XLA computation may be defined (i.e., written to) // at most once. We call the operation that writes the buffer's value on some @@ -42,6 +42,9 @@ namespace xla { // 'stream', RecordOnStream(stream) should also be called to trigger the // definition event after the operation has completed. // +// After the buffer is read on 'stream' another event should be added so that +// it is possible to sequence buffer donation after all reads have completed. +// // Since different streams are not necessarily synchronized with one another, // if we wish to consume the value of the buffer on a different stream, we // should first call WaitForEventOnStream(stream), which add a cross-stream @@ -53,17 +56,14 @@ namespace xla { // The dependency logic caches the set of streams at the tail of which the // definition event is known to have occurred; waiting for the same event on the // same stream causes no additional waiting. -// -// TODO(misard) Rename this BufferSequencingEvent now that it is used for Usage -// events as well. -class BufferDefinitionEvent { +class BufferSequencingEvent { public: - BufferDefinitionEvent() = default; + BufferSequencingEvent() = default; - // Sets the definition event of the buffer to 'event', which is recorded - // on 'stream'. Must be called at most once. Unblocks any other host threads - // are blocked in WaitForEventOnStream. - void SetDefinitionEvent(EventPool::Handle event, se::Stream* stream); + // Sets the sequencing event to 'event', which is recorded on 'stream'. Must + // be called at most once. Unblocks any other host threads that are blocked in + // WaitForEventOnStream. + void SetSequencingEvent(EventPool::Handle event, se::Stream* stream); // Adds synchronization events to 'stream' that wait for this event to be // defined on 'stream'. Does nothing if the event is already known to have @@ -83,16 +83,16 @@ class BufferDefinitionEvent { // Compares the sequence numbers of two recorded events. It is illegal to call // the comparison operators unless both events have been recorded. 
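
// Illustrative sketch, not part of the patch: the cross-stream ordering pattern
// that BufferSequencingEvent provides. The EventPool helper name
// (ThenAllocateAndRecordEvent) is an assumption taken from event_pool.h; the
// other calls use the methods declared above.
#include <memory>
#include "tensorflow/compiler/xla/pjrt/event_pool.h"
#include "tensorflow/compiler/xla/pjrt/tracked_device_buffer.h"

namespace xla {

// Records a sequencing event on the stream that produced a buffer and makes a
// second stream wait for it before consuming the buffer's contents.
void SequenceAcrossStreams(EventPool* event_pool, se::Stream* producer_stream,
                           se::Stream* consumer_stream) {
  auto event = std::make_shared<BufferSequencingEvent>();
  // After enqueueing the producing operation on producer_stream:
  EventPool::Handle handle =
      event_pool->ThenAllocateAndRecordEvent(producer_stream)
          .ConsumeValueOrDie();
  event->SetSequencingEvent(std::move(handle), producer_stream);

  // Before enqueueing any consumer on consumer_stream. This is a no-op if the
  // event is already known to have occurred at the tail of consumer_stream.
  event->WaitForEventOnStream(consumer_stream);
}

}  // namespace xla
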
- inline bool operator<(const BufferDefinitionEvent& rhs) const { + inline bool operator<(const BufferSequencingEvent& rhs) const { return sequence_number() < rhs.sequence_number(); } - inline bool operator>(const BufferDefinitionEvent& rhs) const { + inline bool operator>(const BufferSequencingEvent& rhs) const { return rhs < *this; } - inline bool operator<=(const BufferDefinitionEvent& rhs) const { + inline bool operator<=(const BufferSequencingEvent& rhs) const { return !(*this > rhs); } - inline bool operator>=(const BufferDefinitionEvent& rhs) const { + inline bool operator>=(const BufferSequencingEvent& rhs) const { return !(*this < rhs); } @@ -100,9 +100,10 @@ class BufferDefinitionEvent { bool EventHasBeenRecorded() const TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); uint64 sequence_number() const; - // An event that is triggered when the content of one or more buffers is - // ready. If this event is nullptr, it is assumed that the buffer's content is - // always defined. + // An event that is triggered when the content of one or more buffers has been + // read or written. If this event is used as a definition event and is + // nullptr, it is assumed that the buffer's content is always defined for + // example because it uses storage borrowed from elsewhere. EventPool::Handle event_; mutable absl::Mutex mu_; @@ -115,7 +116,7 @@ class BufferDefinitionEvent { // owns all of the device memory in the tuple. It also tracks the definition and // usage of the memory on streams, to allow for synchronized usage and deletion // of memory under all of the allocation model semantics. -class SharedDeviceBuffer { +class TrackedDeviceBuffer { public: // Helper object to keep track of usage of the buffer on streams. struct StreamAndEvent { @@ -123,17 +124,17 @@ class SharedDeviceBuffer { se::Stream* stream; // An event that is later than the most recent usage of the buffer on // stream. - std::shared_ptr event; + std::shared_ptr event; // True if and only if a reference to the buffer is kept live until after // the host knows that event is complete. bool reference_held; }; - // Converts a ScopedShapedBuffer into a SharedDeviceBuffer. Takes ownership of - // the buffers of the shaped_buffer. - static std::shared_ptr FromScopedShapedBuffer( + // Converts a ScopedShapedBuffer into a TrackedDeviceBuffer. Takes ownership + // of the buffers of the shaped_buffer. + static std::shared_ptr FromScopedShapedBuffer( ScopedShapedBuffer* shaped_buffer, - absl::Span> + absl::Span> definition_events); // Builds a ShapedBuffer view onto the buffers of 'tree'. We require but do @@ -146,7 +147,7 @@ class SharedDeviceBuffer { // Adds the owned device buffers in order to 'iterator'. Used to add the // buffers to an ExecutionInput. We require but do not verify that 'iterator' // when passed in is pointing to a sub-tuple of the ExecutionInput whose - // on_device_shape matches that of the SharedDeviceBuffer. 'end' is used to + // on_device_shape matches that of the TrackedDeviceBuffer. 'end' is used to // check that 'iterator' doesn't run out of bounds. void AddToInputAsImmutable( ShapeTree::iterator* iterator, @@ -158,7 +159,7 @@ class SharedDeviceBuffer { // this->ReleaseDeviceMemory() must be called to avoid freeing the device // memory twice. We require but do not verify that 'iterator' when passed in // is pointing to a sub-tuple of execution_input whose on_device_shape matches - // that of the SharedDeviceBuffer. 'end' is used to check that 'iterator' + // that of the TrackedDeviceBuffer. 
'end' is used to check that 'iterator' // doesn't run out of bounds. void AddToInputAsDonated( ShapeTree::iterator* iterator, @@ -174,7 +175,7 @@ class SharedDeviceBuffer { const absl::InlinedVector& device_memory() const { return device_memory_; } - absl::Span> definition_events() + absl::Span> definition_events() const { return definition_events_; } @@ -196,7 +197,7 @@ class SharedDeviceBuffer { // is sure that the usage (transfer or execution) has // completed. void AddUsageEvent(se::Stream* usage_stream, - std::shared_ptr event, + std::shared_ptr event, bool reference_held); using StreamAndEventContainer = absl::InlinedVector; @@ -206,13 +207,13 @@ class SharedDeviceBuffer { // any stream and, e.g. AddUsageHold will CHECK fail. StreamAndEventContainer LockUseAndTransferUsageEvents(); - SharedDeviceBuffer() : in_use_(true) {} - SharedDeviceBuffer(se::DeviceMemoryAllocator* allocator, int device_ordinal, - absl::Span device_memory, - absl::Span> - definition_events, - std::function on_delete_callback); - ~SharedDeviceBuffer(); + TrackedDeviceBuffer() : in_use_(true) {} + TrackedDeviceBuffer(se::DeviceMemoryAllocator* allocator, int device_ordinal, + absl::Span device_memory, + absl::Span> + definition_events, + std::function on_delete_callback); + ~TrackedDeviceBuffer(); private: // Are the buffers in device_memory_ owned? If so, which allocator and device @@ -228,32 +229,32 @@ class SharedDeviceBuffer { // single-stream execution case where events are not necessary for buffer // event sequencing. All events must be triggered before the buffers can be // used. - absl::InlinedVector, 2> + absl::InlinedVector, 2> definition_events_; // in_use_ starts out true, and is set to false when the buffer is released - // from its owning PyLocalBuffer. Once in_use_ is false, the buffer may no + // from its owning PjRtBuffer. Once in_use_ is false, the buffer may no // longer be used on any stream. bool in_use_; // Set of streams that the buffer has ever been used on, see comment on // StreamAndEvent. StreamAndEventContainer usage_events_; - // A callback to call when the SharedDeviceBuffer is about to be destroyed. + // A callback to call when the TrackedDeviceBuffer is about to be destroyed. std::function on_delete_callback_; }; // Populates 'events' with the set of buffer events for buffer. If // get_usage_events=true populates with the latest usage events, otherwise // populates with the definition events. -void GetDeviceBufferEvents(const SharedDeviceBuffer& buffer, +void GetDeviceBufferEvents(const TrackedDeviceBuffer& buffer, bool get_usage_events, - absl::flat_hash_set* events); + absl::flat_hash_set* events); // Waits for all of the definition events in a buffer on 'stream'. 
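
// Illustrative sketch, not part of the patch: the typical lifecycle of a
// TrackedDeviceBuffer as used by the functions above. The stream and event
// plumbing around it is simplified and assumed; the signatures are those
// declared in this header.
#include <memory>
#include "tensorflow/compiler/xla/pjrt/tracked_device_buffer.h"

namespace xla {

void UseBufferOnStream(ScopedShapedBuffer* shaped_buffer,
                       std::shared_ptr<BufferSequencingEvent> definition_event,
                       se::Stream* usage_stream,
                       std::shared_ptr<BufferSequencingEvent> usage_event) {
  // Take ownership of the device memory; the buffer is defined once
  // definition_event has been recorded on the producing stream.
  std::shared_ptr<TrackedDeviceBuffer> buffer =
      TrackedDeviceBuffer::FromScopedShapedBuffer(shaped_buffer,
                                                  {definition_event});

  // Make usage_stream wait until the contents are valid, enqueue the work that
  // reads the buffer, then record the read so that a later donation can be
  // sequenced after it.
  WaitForBufferDefinitionEventsOnStream(*buffer, usage_stream);
  // ... enqueue the read of buffer->device_memory() on usage_stream ...
  buffer->AddUsageEvent(usage_stream, std::move(usage_event),
                        /*reference_held=*/false);
}

}  // namespace xla
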
-void WaitForBufferDefinitionEventsOnStream(const SharedDeviceBuffer& buffer, +void WaitForBufferDefinitionEventsOnStream(const TrackedDeviceBuffer& buffer, se::Stream* stream); } // namespace xla -#endif // TENSORFLOW_COMPILER_XLA_PYTHON_SHARED_DEVICE_BUFFER_H_ +#endif // TENSORFLOW_COMPILER_XLA_PJRT_TRACKED_DEVICE_BUFFER_H_ diff --git a/tensorflow/compiler/xla/python/shared_device_buffer_test.cc b/tensorflow/compiler/xla/pjrt/tracked_device_buffer_test.cc similarity index 88% rename from tensorflow/compiler/xla/python/shared_device_buffer_test.cc rename to tensorflow/compiler/xla/pjrt/tracked_device_buffer_test.cc index ddf02dcb2de..9373b57e7d1 100644 --- a/tensorflow/compiler/xla/python/shared_device_buffer_test.cc +++ b/tensorflow/compiler/xla/pjrt/tracked_device_buffer_test.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/compiler/xla/python/shared_device_buffer.h" +#include "tensorflow/compiler/xla/pjrt/tracked_device_buffer.h" #include @@ -27,8 +27,8 @@ limitations under the License. namespace xla { namespace { -StatusOr> MakeArray(const Shape& shape, - LocalClient* client) { +StatusOr> MakeArray(const Shape& shape, + LocalClient* client) { std::vector device_buffers; TF_RETURN_IF_ERROR(ShapeUtil::ForEachSubshapeWithStatus( client->backend().transfer_manager()->HostShapeToDeviceShape(shape), @@ -42,13 +42,13 @@ StatusOr> MakeArray(const Shape& shape, device_buffers.push_back(device_memory.Release()); return Status::OK(); })); - return std::make_shared( + return std::make_shared( client->backend().memory_allocator(), /*device_ordinal=*/0, device_buffers, - absl::Span>(), nullptr); + absl::Span>(), nullptr); } -TEST(SharedDeviceBufferTest, AsShapedBuffer) { +TEST(TrackedDeviceBufferTest, AsShapedBuffer) { LocalClient* client = ClientLibrary::LocalClientOrDie(); Shape a_shape = ShapeUtil::MakeShape(F32, {3, 101, 4}); @@ -98,7 +98,7 @@ TEST(SharedDeviceBufferTest, AsShapedBuffer) { EXPECT_TRUE(expected_it == expected_buffer_sequence.end()); } -TEST(SharedDeviceBufferTest, FromScopedShapedBuffer) { +TEST(TrackedDeviceBufferTest, FromScopedShapedBuffer) { LocalClient* client = ClientLibrary::LocalClientOrDie(); Literal literal = LiteralUtil::MakeTupleOwned( @@ -108,8 +108,8 @@ TEST(SharedDeviceBufferTest, FromScopedShapedBuffer) { TF_ASSERT_OK_AND_ASSIGN( ScopedShapedBuffer shaped_buffer, client->LiteralToShapedBuffer(literal, /*device_ordinal=*/0)); - std::shared_ptr device_buffer = - SharedDeviceBuffer::FromScopedShapedBuffer(&shaped_buffer, {}); + std::shared_ptr device_buffer = + TrackedDeviceBuffer::FromScopedShapedBuffer(&shaped_buffer, {}); EXPECT_EQ(device_buffer->device_memory().size(), ShapeUtil::SubshapeCount( diff --git a/tensorflow/compiler/xla/python/worker_thread.cc b/tensorflow/compiler/xla/pjrt/worker_thread.cc similarity index 96% rename from tensorflow/compiler/xla/python/worker_thread.cc rename to tensorflow/compiler/xla/pjrt/worker_thread.cc index d3fb02023a5..e8194534aef 100644 --- a/tensorflow/compiler/xla/python/worker_thread.cc +++ b/tensorflow/compiler/xla/pjrt/worker_thread.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include "tensorflow/compiler/xla/python/worker_thread.h" +#include "tensorflow/compiler/xla/pjrt/worker_thread.h" namespace xla { diff --git a/tensorflow/compiler/xla/python/worker_thread.h b/tensorflow/compiler/xla/pjrt/worker_thread.h similarity index 90% rename from tensorflow/compiler/xla/python/worker_thread.h rename to tensorflow/compiler/xla/pjrt/worker_thread.h index 598f7b1d4ae..4fd2baa4cda 100644 --- a/tensorflow/compiler/xla/python/worker_thread.h +++ b/tensorflow/compiler/xla/pjrt/worker_thread.h @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TENSORFLOW_COMPILER_XLA_PYTHON_WORKER_THREAD_H_ -#define TENSORFLOW_COMPILER_XLA_PYTHON_WORKER_THREAD_H_ +#ifndef TENSORFLOW_COMPILER_XLA_PJRT_WORKER_THREAD_H_ +#define TENSORFLOW_COMPILER_XLA_PJRT_WORKER_THREAD_H_ #include #include @@ -51,4 +51,4 @@ class WorkerThread { } // namespace xla -#endif // TENSORFLOW_COMPILER_XLA_PYTHON_WORKER_THREAD_H_ +#endif // TENSORFLOW_COMPILER_XLA_PJRT_WORKER_THREAD_H_ diff --git a/tensorflow/compiler/xla/python/BUILD b/tensorflow/compiler/xla/python/BUILD index 7c1109166b6..863296c681c 100644 --- a/tensorflow/compiler/xla/python/BUILD +++ b/tensorflow/compiler/xla/python/BUILD @@ -1,7 +1,5 @@ load("//tensorflow/core/platform:build_config.bzl", "pyx_library") load("//tensorflow/compiler/xla:xla.bzl", "xla_py_test_deps") -load("//tensorflow:tensorflow.bzl", "py_test", "tf_cc_test") -load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda") # buildifier: disable=same-origin-load load("//tensorflow:tensorflow.bzl", "pybind_extension") @@ -25,9 +23,25 @@ pyx_library( srcs = ["custom_call_for_test.pyx"], ) -py_test( +py_library( name = "xla_client_test", + testonly = 1, srcs = ["xla_client_test.py"], + srcs_version = "PY3", + deps = [ + ":custom_call_for_test", + ":xla_client", + ":xla_extension", + "@absl_py//absl/flags", + "@absl_py//absl/testing:absltest", + "@absl_py//absl/testing:parameterized", + ], +) + +py_test( + name = "xla_client_test_cpu", + srcs = ["xla_client_test.py"], + args = ["--backend=cpu"], main = "xla_client_test.py", python_version = "PY3", srcs_version = "PY3", @@ -36,19 +50,30 @@ py_test( ":custom_call_for_test", ":xla_client", ":xla_extension", + "@absl_py//absl/flags", "@absl_py//absl/testing:absltest", "@absl_py//absl/testing:parameterized", ] + xla_py_test_deps(), ) -cc_library( - name = "worker_thread", - srcs = ["worker_thread.cc"], - hdrs = ["worker_thread.h"], +py_test( + name = "xla_client_test_gpu", + srcs = ["xla_client_test.py"], + args = ["--backend=gpu"], + main = "xla_client_test.py", + python_version = "PY3", + srcs_version = "PY3", + tags = [ + "no_oss", + "requires-gpu-nvidia", + ], # TODO(phawkins): This test passes, but requires --config=monolithic. 
deps = [ - "//tensorflow/core:lib", - "@com_google_absl//absl/synchronization", - ], + ":xla_client", + ":xla_extension", + "@absl_py//absl/flags", + "@absl_py//absl/testing:absltest", + "@absl_py//absl/testing:parameterized", + ] + xla_py_test_deps(), ) cc_library( @@ -62,7 +87,6 @@ cc_library( features = ["-use_header_modules"], deps = [ ":bfloat16", - ":local_client", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status", @@ -70,6 +94,7 @@ cc_library( "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:xla_data_proto_cc", + "//tensorflow/compiler/xla/pjrt:pjrt_client", "//tensorflow/core:lib", "//third_party/py/numpy:headers", "@com_google_absl//absl/container:flat_hash_map", @@ -79,146 +104,6 @@ cc_library( ], ) -cc_library( - name = "event_pool", - srcs = ["event_pool.cc"], - hdrs = ["event_pool.h"], - deps = [ - "//tensorflow/compiler/xla:status_macros", - "//tensorflow/compiler/xla:statusor", - "//tensorflow/compiler/xla:types", - "//tensorflow/core:lib", - "//tensorflow/core:stream_executor", - "@com_google_absl//absl/memory", - "@com_google_absl//absl/synchronization", - ], -) - -cc_library( - name = "semaphore", - srcs = ["semaphore.cc"], - hdrs = ["semaphore.h"], - deps = [ - "//tensorflow/compiler/xla:types", - "//tensorflow/core:lib", - "@com_google_absl//absl/base:core_headers", - "@com_google_absl//absl/synchronization", - ], -) - -tf_cc_test( - name = "semaphore_test", - srcs = ["semaphore_test.cc"], - deps = [ - ":semaphore", - "//tensorflow/compiler/xla:test", - "//tensorflow/core:lib", - "//tensorflow/core:test_main", - "@com_google_absl//absl/synchronization", - ], -) - -cc_library( - name = "shared_device_buffer", - srcs = ["shared_device_buffer.cc"], - hdrs = ["shared_device_buffer.h"], - deps = [ - ":event_pool", - ":local_device_state", - "//tensorflow/compiler/xla:shape_util", - "//tensorflow/compiler/xla:types", - "//tensorflow/compiler/xla/service:shaped_buffer", - "//tensorflow/compiler/xla/service:transfer_manager", - "//tensorflow/core:lib", - "//tensorflow/stream_executor:device_memory", - "//tensorflow/stream_executor:device_memory_allocator", - "//tensorflow/stream_executor:event", - "@com_google_absl//absl/container:flat_hash_set", - "@com_google_absl//absl/synchronization", - ], -) - -tf_cc_test( - name = "shared_device_buffer_test", - srcs = ["shared_device_buffer_test.cc"], - deps = [ - ":shared_device_buffer", - "//tensorflow/compiler/xla:literal_util", - "//tensorflow/compiler/xla:shape_util", - "//tensorflow/compiler/xla:status_macros", - "//tensorflow/compiler/xla:test", - "//tensorflow/compiler/xla/client:client_library", - "//tensorflow/compiler/xla/service:cpu_plugin", - "//tensorflow/core:test_main", - "//tensorflow/stream_executor:device_memory", - "//tensorflow/stream_executor:device_memory_allocator", - ], -) - -cc_library( - name = "local_device_state", - srcs = ["local_device_state.cc"], - hdrs = ["local_device_state.h"], - deps = [ - ":event_pool", - ":semaphore", - ":worker_thread", - "//tensorflow/compiler/xla:status", - "//tensorflow/compiler/xla:util", - "//tensorflow/compiler/xla/client:local_client", - "//tensorflow/core:lib", - "//tensorflow/core:stream_executor", - "//tensorflow/stream_executor:event", - "@com_google_absl//absl/memory", - "@com_google_absl//absl/synchronization", - ], -) - -cc_library( - name = "local_client", - srcs = ["local_client.cc"], - hdrs = ["local_client.h"], - visibility = 
["//tensorflow/compiler/xla:friends"], - deps = [ - ":event_pool", - ":local_device_state", - ":shared_device_buffer", - "//tensorflow/compiler/xla:executable_run_options", - "//tensorflow/compiler/xla:literal", - "//tensorflow/compiler/xla:literal_util", - "//tensorflow/compiler/xla:shape_util", - "//tensorflow/compiler/xla:status", - "//tensorflow/compiler/xla:statusor", - "//tensorflow/compiler/xla:util", - "//tensorflow/compiler/xla:xla_data_proto_cc", - "//tensorflow/compiler/xla/client:executable_build_options", - "//tensorflow/compiler/xla/client:local_client", - "//tensorflow/compiler/xla/client:xla_computation", - "//tensorflow/compiler/xla/python/distributed:protocol_proto_cc", - "//tensorflow/compiler/xla/service:computation_placer", - "//tensorflow/compiler/xla/service:executable", - "//tensorflow/compiler/xla/service:hlo", - "//tensorflow/compiler/xla/service:maybe_owning_device_memory", - "//tensorflow/compiler/xla/service:shaped_buffer", - "//tensorflow/compiler/xla/service/gpu:gpu_executable_run_options", - "//tensorflow/core:allocator", - "//tensorflow/core:lib", - "//tensorflow/core/profiler/lib:traceme", - "//tensorflow/stream_executor:event", - "//tensorflow/stream_executor:stream", - "//tensorflow/stream_executor/lib", - "@com_google_absl//absl/base", - "@com_google_absl//absl/container:flat_hash_set", - "@com_google_absl//absl/container:inlined_vector", - "@com_google_absl//absl/memory", - "@com_google_absl//absl/strings", - "@com_google_absl//absl/strings:str_format", - "@com_google_absl//absl/synchronization", - "@com_google_absl//absl/time", - "@com_google_absl//absl/types:span", - ], -) - cc_library( name = "python_ref_manager", srcs = ["python_ref_manager.cc"], @@ -283,10 +168,10 @@ cc_library( ], features = ["-use_header_modules"], deps = [ - ":local_client", - ":shared_device_buffer", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla/pjrt:pjrt_client", + "//tensorflow/compiler/xla/pjrt:tracked_device_buffer", "//tensorflow/stream_executor:device_memory", "//tensorflow/stream_executor:platform", "//tensorflow/stream_executor/cuda:cuda_platform_id", @@ -301,37 +186,6 @@ cc_library( ], ) -cc_library( - name = "cpu_device", - srcs = ["cpu_device.cc"], - hdrs = ["cpu_device.h"], - deps = [ - ":local_client", - "//tensorflow/compiler/xla:statusor", - "//tensorflow/compiler/xla/client:client_library", - "//tensorflow/compiler/xla/service:platform_util", - ], -) - -cc_library( - name = "nvidia_gpu_device", - srcs = ["nvidia_gpu_device.cc"], - hdrs = ["nvidia_gpu_device.h"], - copts = if_cuda(["-DNCCL_ENABLED=1"]), - deps = [ - ":local_client", - "//tensorflow/compiler/xla/service/gpu:gpu_executable_run_options", - "//tensorflow/compiler/xla:statusor", - "//tensorflow/compiler/xla/client:client_library", - "//tensorflow/compiler/xla/python/distributed:client", - "//tensorflow/compiler/xla/service:platform_util", - "//tensorflow/compiler/xla:util", - "//tensorflow/core/common_runtime:bfc_allocator", - "//tensorflow/core/common_runtime/gpu:gpu_mem_allocator", - "//tensorflow/stream_executor:tf_allocator_adapter", - ] + if_cuda(["@local_config_nccl//:nccl"]), -) - config_setting( name = "enable_gpu", values = {"define": "xla_python_enable_gpu=true"}, @@ -350,11 +204,7 @@ pybind_extension( module_name = "xla_extension", deps = [ ":bfloat16", - ":cpu_device", ":dlpack", - ":local_client", - ":nvidia_gpu_device", - ":shared_device_buffer", ":python_ref_manager", ":types", "@com_google_absl//absl/base", @@ -384,9 +234,13 @@ 
pybind_extension( "//tensorflow/compiler/xla/client/lib:self_adjoint_eig", "//tensorflow/compiler/xla/client/lib:sorting", "//tensorflow/compiler/xla/client/lib:svd", - "//tensorflow/compiler/xla/python/distributed", - "//tensorflow/compiler/xla/python/distributed:client", - "//tensorflow/compiler/xla/python/distributed:service", + "//tensorflow/compiler/xla/pjrt:cpu_device", + "//tensorflow/compiler/xla/pjrt:nvidia_gpu_device", + "//tensorflow/compiler/xla/pjrt:pjrt_client", + "//tensorflow/compiler/xla/pjrt:tracked_device_buffer", + "//tensorflow/compiler/xla/pjrt/distributed", + "//tensorflow/compiler/xla/pjrt/distributed:client", + "//tensorflow/compiler/xla/pjrt/distributed:service", "//tensorflow/compiler/xla/service:computation_placer", "//tensorflow/compiler/xla/service:custom_call_target_registry", "//tensorflow/compiler/xla/service:hlo", @@ -406,8 +260,8 @@ pybind_extension( "//tensorflow/core:lib_internal_impl", # buildcleaner: keep "//tensorflow/core/profiler/lib:profiler_backends", "//tensorflow/core/profiler/lib:profiler_session", - "//tensorflow/core/profiler/lib:traceme", "//tensorflow/core/profiler/rpc:profiler_server", + "//tensorflow/python/profiler/internal:traceme_context_manager", "//tensorflow/stream_executor:device_memory_allocator", "//tensorflow/stream_executor:platform", ] + select({ @@ -415,25 +269,3 @@ pybind_extension( "//conditions:default": [], }), ) - -tf_cc_test( - name = "gpu_multistream_test", - srcs = ["gpu_multistream_test.cc"], - tags = [ - # TODO(phawkins): figure out TF test infra such that this only runs under GPU. - "no_oss", - "requires-gpu-nvidia", - ], - deps = [ - ":local_client", - ":nvidia_gpu_device", - "//tensorflow/compiler/xla:test", - "//tensorflow/compiler/xla/client:executable_build_options", - "//tensorflow/compiler/xla/client:xla_builder", - "//tensorflow/compiler/xla/service:gpu_plugin", - "//tensorflow/compiler/xla/tests:literal_test_util", - "//tensorflow/core:lib", - "//tensorflow/core:test_main", - "//tensorflow/core/platform:random", - ], -) diff --git a/tensorflow/compiler/xla/python/bfloat16.cc b/tensorflow/compiler/xla/python/bfloat16.cc index 2f288094ecd..e48475b7a85 100644 --- a/tensorflow/compiler/xla/python/bfloat16.cc +++ b/tensorflow/compiler/xla/python/bfloat16.cc @@ -46,52 +46,15 @@ Safe_PyObjectPtr make_safe(PyObject* object) { return Safe_PyObjectPtr(object); } -// Workarounds for Python 2 vs 3 API differences. -#if PY_MAJOR_VERSION < 3 - -PyObject* MakePyString(const string& s) { - return PyString_FromString(s.c_str()); -} - -typedef long HashType; // NOLINT - -bool TfPyInt_Check(PyObject* object) { return PyInt_Check(object); } - -PyObject* TfPyInt_FromLong(long x) { // NOLINT - return PyInt_FromLong(x); -} - -long TfPyInt_AsLong(PyObject* x) { // NOLINT - return PyInt_AsLong(x); -} - -#else // PY_MAJOR_VERSION < 3 - -PyObject* MakePyString(const string& s) { - return PyUnicode_FromString(s.c_str()); -} - -bool TfPyInt_Check(PyObject* object) { +bool PyLong_CheckNoOverflow(PyObject* object) { if (!PyLong_Check(object)) { - return 0; + return false; } int overflow = 0; PyLong_AsLongAndOverflow(object, &overflow); return (overflow == 0); } -PyObject* TfPyInt_FromLong(long x) { // NOLINT - return PyLong_FromLong(x); -} - -long TfPyInt_AsLong(PyObject* x) { // NOLINT - return PyLong_AsLong(x); -} - -typedef Py_hash_t HashType; - -#endif // PY_MAJOR_VERSION < 3 - // Registered numpy type ID. Global variable populated by the registration code. // Protected by the GIL. 
int npy_bfloat16 = -1; @@ -143,8 +106,8 @@ bool CastToBfloat16(PyObject* arg, bfloat16* output) { *output = bfloat16(d); return true; } - if (TfPyInt_Check(arg)) { - long l = TfPyInt_AsLong(arg); // NOLINT + if (PyLong_CheckNoOverflow(arg)) { + long l = PyLong_AsLong(arg); // NOLINT if (PyErr_Occurred()) { return false; } @@ -205,7 +168,7 @@ PyObject* PyBfloat16_Float(PyObject* self) { PyObject* PyBfloat16_Int(PyObject* self) { bfloat16 x = PyBfloat16_Bfloat16(self); long y = static_cast(x); // NOLINT - return TfPyInt_FromLong(y); + return PyLong_FromLong(y); } // Negates a PyBfloat16. @@ -243,11 +206,7 @@ PyObject* PyBfloat16_TrueDivide(PyObject* a, PyObject* b) { if (SafeCastToBfloat16(a, &x) && SafeCastToBfloat16(b, &y)) { return PyBfloat16_FromBfloat16(x / y).release(); } -#if PY_MAJOR_VERSION < 3 - return PyArray_Type.tp_as_number->nb_divide(a, b); -#else return PyArray_Type.tp_as_number->nb_true_divide(a, b); -#endif } // Python number methods for PyBfloat16 objects. @@ -255,9 +214,6 @@ PyNumberMethods PyBfloat16_AsNumber = { PyBfloat16_Add, // nb_add PyBfloat16_Subtract, // nb_subtract PyBfloat16_Multiply, // nb_multiply -#if PY_MAJOR_VERSION < 3 - PyBfloat16_TrueDivide, // nb_divide -#endif nullptr, // nb_remainder nullptr, // nb_divmod nullptr, // nb_power @@ -271,27 +227,13 @@ PyNumberMethods PyBfloat16_AsNumber = { nullptr, // nb_and nullptr, // nb_xor nullptr, // nb_or -#if PY_MAJOR_VERSION < 3 - nullptr, // nb_coerce -#endif PyBfloat16_Int, // nb_int -#if PY_MAJOR_VERSION < 3 - PyBfloat16_Int, // nb_long -#else nullptr, // reserved -#endif PyBfloat16_Float, // nb_float -#if PY_MAJOR_VERSION < 3 - nullptr, // nb_oct - nullptr, // nb_hex -#endif nullptr, // nb_inplace_add nullptr, // nb_inplace_subtract nullptr, // nb_inplace_multiply -#if PY_MAJOR_VERSION < 3 - nullptr, // nb_inplace_divide -#endif nullptr, // nb_inplace_remainder nullptr, // nb_inplace_power nullptr, // nb_inplace_lshift @@ -376,36 +318,35 @@ PyObject* PyBfloat16_RichCompare(PyObject* a, PyObject* b, int op) { // Implementation of repr() for PyBfloat16. PyObject* PyBfloat16_Repr(PyObject* self) { bfloat16 x = reinterpret_cast(self)->value; - string v = absl::StrCat(static_cast(x)); - return MakePyString(v); + std::string v = absl::StrCat(static_cast(x)); + return PyUnicode_FromString(v.c_str()); } // Implementation of str() for PyBfloat16. PyObject* PyBfloat16_Str(PyObject* self) { bfloat16 x = reinterpret_cast(self)->value; - string v = absl::StrCat(static_cast(x)); - return MakePyString(v); + std::string v = absl::StrCat(static_cast(x)); + return PyUnicode_FromString(v.c_str()); } // Hash function for PyBfloat16. We use the identity function, which is a weak // hash function. -HashType PyBfloat16_Hash(PyObject* self) { +Py_hash_t PyBfloat16_Hash(PyObject* self) { bfloat16 x = reinterpret_cast(self)->value; return x.value; } // Python type for PyBfloat16 objects. 
PyTypeObject PyBfloat16_Type = { -#if PY_MAJOR_VERSION < 3 - PyObject_HEAD_INIT(nullptr) 0, // ob_size + PyVarObject_HEAD_INIT(nullptr, 0) "bfloat16", // tp_name + sizeof(PyBfloat16), // tp_basicsize + 0, // tp_itemsize + nullptr, // tp_dealloc +#if PY_VERSION_HEX < 0x03080000 + nullptr, // tp_print #else - PyVarObject_HEAD_INIT(nullptr, 0) + 0, // tp_vectorcall_offset #endif - "bfloat16", // tp_name - sizeof(PyBfloat16), // tp_basicsize - 0, // tp_itemsize - nullptr, // tp_dealloc - 0, // tp_print NOLINT nullptr, // tp_getattr nullptr, // tp_setattr nullptr, // tp_compare / tp_reserved @@ -420,11 +361,7 @@ PyTypeObject PyBfloat16_Type = { nullptr, // tp_setattro nullptr, // tp_as_buffer // tp_flags -#if PY_MAJOR_VERSION < 3 - Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_CHECKTYPES, -#else Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, -#endif "bfloat16 floating-point values", // tp_doc nullptr, // tp_traverse nullptr, // tp_clear @@ -1287,7 +1224,7 @@ bool Initialize() { import_array1(false); import_umath1(false); - Safe_PyObjectPtr numpy_str = make_safe(MakePyString("numpy")); + Safe_PyObjectPtr numpy_str = make_safe(PyUnicode_FromString("numpy")); if (!numpy_str) { return false; } diff --git a/tensorflow/compiler/xla/python/dlpack.cc b/tensorflow/compiler/xla/python/dlpack.cc index 103d2ba5a59..d37d480607a 100644 --- a/tensorflow/compiler/xla/python/dlpack.cc +++ b/tensorflow/compiler/xla/python/dlpack.cc @@ -23,7 +23,7 @@ limitations under the License. #include "absl/strings/str_join.h" #include "absl/types/span.h" #include "include/dlpack/dlpack.h" // from @dlpack -#include "tensorflow/compiler/xla/python/shared_device_buffer.h" +#include "tensorflow/compiler/xla/pjrt/tracked_device_buffer.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/stream_executor/cuda/cuda_platform_id.h" @@ -39,7 +39,7 @@ namespace { const char* const kDlTensorCapsuleName = "dltensor"; struct DLPackTensor { - std::shared_ptr buffer; + std::shared_ptr buffer; std::vector shape; std::vector strides; DLManagedTensor tensor; @@ -210,7 +210,7 @@ StatusOr DLContextForDevice(const Device& device) { return context; } -StatusOr DeviceForDLContext(const PyLocalClient& client, +StatusOr DeviceForDLContext(const PjRtClient& client, const DLContext& context) { se::Platform::Id platform_id; switch (context.device_type) { @@ -239,11 +239,11 @@ StatusOr DeviceForDLContext(const PyLocalClient& client, } // namespace -StatusOr BufferToDLPackManagedTensor(PyLocalBuffer* buffer) { +StatusOr BufferToDLPackManagedTensor(PjRtBuffer* buffer) { auto pack = absl::make_unique(); // Block on outstanding operations, so that it is safe to read or mutate the // returned buffer. - StatusOr> buffer_or = + StatusOr> buffer_or = buffer->Release(/*wait_for_operations_to_complete=*/true); if (!buffer_or.ok()) { return InvalidArgument( @@ -293,8 +293,8 @@ StatusOr BufferToDLPackManagedTensor(PyLocalBuffer* buffer) { return capsule; } -StatusOr> DLPackManagedTensorToBuffer( - const pybind11::capsule& tensor, PyLocalClient* client) { +StatusOr> DLPackManagedTensorToBuffer( + const pybind11::capsule& tensor, PjRtClient* client) { if (absl::string_view(tensor.name()) != kDlTensorCapsuleName) { return InvalidArgument( "DLPack tensor must be a capsule with name \"dltensor\", got \"%s\". 
" @@ -334,8 +334,8 @@ StatusOr> DLPackManagedTensorToBuffer( if (dlmt->deleter) { on_delete_callback = [dlmt]() { dlmt->deleter(dlmt); }; } - absl::Span> definition_events; - auto device_buffer = std::make_shared( + absl::Span> definition_events; + auto device_buffer = std::make_shared( /*allocator=*/nullptr, dlmt->dl_tensor.ctx.device_id, std::initializer_list{buffer}, definition_events, std::move(on_delete_callback)); @@ -344,8 +344,8 @@ StatusOr> DLPackManagedTensorToBuffer( // capsule it cannot be used again. PyCapsule_SetName(tensor.ptr(), "used_dltensor"); PyCapsule_SetDestructor(tensor.ptr(), nullptr); - return absl::make_unique( - shape, shape, std::move(device_buffer), client, device); + return absl::make_unique(shape, shape, std::move(device_buffer), + client, device); } } // namespace xla diff --git a/tensorflow/compiler/xla/python/dlpack.h b/tensorflow/compiler/xla/python/dlpack.h index 88548ba5b2a..6766bbe93b1 100644 --- a/tensorflow/compiler/xla/python/dlpack.h +++ b/tensorflow/compiler/xla/python/dlpack.h @@ -17,14 +17,14 @@ limitations under the License. #define TENSORFLOW_COMPILER_XLA_PYTHON_DLPACK_H_ #include "pybind11/pybind11.h" -#include "tensorflow/compiler/xla/python/local_client.h" +#include "tensorflow/compiler/xla/pjrt/pjrt_client.h" namespace xla { -StatusOr BufferToDLPackManagedTensor(PyLocalBuffer* buffer); +StatusOr BufferToDLPackManagedTensor(PjRtBuffer* buffer); -StatusOr> DLPackManagedTensorToBuffer( - const pybind11::capsule& tensor, PyLocalClient* client); +StatusOr> DLPackManagedTensorToBuffer( + const pybind11::capsule& tensor, PjRtClient* client); } // namespace xla diff --git a/tensorflow/compiler/xla/python/tpu_driver/client/BUILD b/tensorflow/compiler/xla/python/tpu_driver/client/BUILD index b5f1a831d4a..c460cc36f08 100644 --- a/tensorflow/compiler/xla/python/tpu_driver/client/BUILD +++ b/tensorflow/compiler/xla/python/tpu_driver/client/BUILD @@ -19,8 +19,8 @@ cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto_cc", "//tensorflow/compiler/xla/client:executable_build_options", - "//tensorflow/compiler/xla/python:local_client", - "//tensorflow/compiler/xla/python:semaphore", + "//tensorflow/compiler/xla/pjrt:pjrt_client", + "//tensorflow/compiler/xla/pjrt:semaphore", "//tensorflow/compiler/xla/python/tpu_driver", "//tensorflow/compiler/xla/python/tpu_driver:direct_tpu_driver", "//tensorflow/compiler/xla/python/tpu_driver:grpc_tpu_driver", diff --git a/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client.cc b/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client.cc index 1089b3cc8e5..e78f04ff980 100644 --- a/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client.cc +++ b/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client.cc @@ -24,7 +24,7 @@ limitations under the License. 
#include "absl/time/time.h" #include "absl/types/span.h" #include "tensorflow/compiler/xla/literal.h" -#include "tensorflow/compiler/xla/python/semaphore.h" +#include "tensorflow/compiler/xla/pjrt/semaphore.h" #include "tensorflow/compiler/xla/python/tpu_driver/tpu_driver.h" #include "tensorflow/compiler/xla/service/computation_placer.h" #include "tensorflow/compiler/xla/shape_util.h" @@ -37,7 +37,8 @@ namespace xla { TpuDevice::TpuDevice(int id, int host_id, const std::array& coords, int core_on_chip) - : xla::Device(id, /*local_device_state=*/nullptr, kTpuPlatform, host_id), + : xla::Device(id, /*local_device_state=*/nullptr, kTpuPlatform, + /*device_kind=*/"Cloud TPU", host_id), coords_(coords), core_on_chip_(core_on_chip) {} @@ -749,8 +750,7 @@ PyTpuExecutable::ExecuteOnLocalDevices( const XlaComputation& computation, absl::optional> argument_layouts, const ExecutableBuildOptions* build_options, - std::shared_ptr client, - absl::optional device_assignment, bool tuple_arguments) { + std::shared_ptr client, bool tuple_arguments) { tensorflow::profiler::TraceMe traceme("PyTpuExecutable::Compile"); VLOG(1) << "Compile: " @@ -762,21 +762,23 @@ PyTpuExecutable::ExecuteOnLocalDevices( if (build_options != nullptr) { options = *build_options; } + absl::optional device_assignment; // For POD use case, the device_assignment.num_replicas() may be greater than // the number of available local devices, where applicable the non-local // devices must be filtered out from participating local computation. - if (device_assignment) { - if (device_assignment->replica_count() != options.num_replicas()) { + if (options.has_device_assignment()) { + if (options.device_assignment().replica_count() != options.num_replicas()) { return InvalidArgument( "Mismatched number of replicas for device " "assignment and computation (%d vs %d).", - device_assignment->replica_count(), options.num_replicas()); - } else if (device_assignment->computation_count() != 1) { + options.device_assignment().replica_count(), options.num_replicas()); + } else if (options.device_assignment().computation_count() != 1) { return Unimplemented( "Only 1 computation per replica supported, %d requested.", - device_assignment->computation_count()); + options.device_assignment().computation_count()); } + device_assignment = options.device_assignment(); } else { TF_ASSIGN_OR_RETURN(device_assignment, client->GetDefaultDeviceAssignment( diff --git a/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client.h b/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client.h index f30ce4fda17..4c45df181db 100644 --- a/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client.h +++ b/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client.h @@ -24,7 +24,7 @@ limitations under the License. 
#include "absl/synchronization/notification.h" #include "absl/types/span.h" #include "tensorflow/compiler/xla/client/executable_build_options.h" -#include "tensorflow/compiler/xla/python/local_client.h" +#include "tensorflow/compiler/xla/pjrt/pjrt_client.h" #include "tensorflow/compiler/xla/python/tpu_driver/tpu_driver.h" #include "tensorflow/compiler/xla/python/tpu_driver/tpu_driver.pb.h" #include "tensorflow/compiler/xla/service/shaped_buffer.h" @@ -267,8 +267,7 @@ class PyTpuExecutable { const XlaComputation& computation, absl::optional> argument_layouts, const ExecutableBuildOptions* build_options, - std::shared_ptr client, - absl::optional device_assignment, bool tuple_arguments); + std::shared_ptr client, bool tuple_arguments); PyTpuExecutable( std::unique_ptr compiled_program, @@ -285,6 +284,8 @@ class PyTpuExecutable { PyTpuExecutable& operator=(const PyTpuExecutable&) = delete; PyTpuExecutable& operator=(PyTpuExecutable&&) = delete; + std::shared_ptr client() const { return client_; } + int num_replicas() const { return device_assignment_.replica_count(); } int num_partitions() const { return device_assignment_.computation_count(); } diff --git a/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client.py b/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client.py index 89338934904..6d4482af43f 100644 --- a/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client.py +++ b/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client.py @@ -20,28 +20,21 @@ from __future__ import print_function from absl import logging -from tensorflow.compiler.xla.python import xla_client -from tensorflow.compiler.xla.python import xla_extension as _xla +# Import xla_client to load shared C++ extensions (just CompileOptions at the +# time of writing). +from tensorflow.compiler.xla.python import xla_client # pylint: disable=unused-import from tensorflow.compiler.xla.python.tpu_driver.client import tpu_client_extension as _tpu_client -class TpuBackend(xla_client.Backend): +class TpuBackend(object): """XLA backend implemented using the Tpu driver API.""" # Cache the backends to prevent double driver initializations. _local_backend = None - def __init__(self, client): - """Creates a new TpuBackend. - - Args: - client: A _tpu_client.TpuClient object. - """ - super(TpuBackend, self).__init__('tpu') - self.client = client - @staticmethod def create(worker=None, force=False): + """Constructs a Cloud TPU backend.""" # `force` == True will skip caching any backends (if applicable) and will # always try to create a new client. if worker is None: @@ -56,63 +49,11 @@ class TpuBackend(xla_client.Backend): if worker == 'local': worker = 'local://' if force: - return TpuBackend(_tpu_client.TpuClient.Get(worker)) + return _tpu_client.TpuClient.Get(worker) if TpuBackend._local_backend is None: logging.info('Starting the local TPU driver.') - TpuBackend._local_backend = TpuBackend( - _tpu_client.TpuClient.Get(worker)) + TpuBackend._local_backend = _tpu_client.TpuClient.Get(worker) return TpuBackend._local_backend else: # We do not cache for non-local backends. 
- return TpuBackend(_tpu_client.TpuClient.Get(worker)) - - def device_count(self): - return self.client.device_count() - - def local_device_count(self): - return self.client.local_device_count() - - def local_devices(self): - return self.client.local_devices() - - def devices(self): - return self.client.devices() - - def host_id(self): - return self.client.host_id() - - def buffer_from_pyval(self, pyval, device=None, force_copy=False): - if device is None: - device = self.client.local_devices()[0] - return _tpu_client.PyTpuBuffer.from_python(pyval, self.client, device) - - def compile(self, c_computation, compile_options): - options = _xla.ExecutableBuildOptions() - options.num_replicas = compile_options.num_replicas - options.num_partitions = compile_options.num_partitions - if compile_options.result_layout: - options.result_layout = compile_options.result_layout - options.debug_options.xla_cpu_fast_math_honor_infs = True - options.debug_options.xla_cpu_fast_math_honor_nans = True - options.debug_options.xla_cpu_fast_math_honor_division = True - options.debug_options.xla_cpu_fast_math_honor_functions = True - options.debug_options.xla_gpu_enable_fast_min_max = False - return _tpu_client.TpuExecutable.Compile(c_computation, - compile_options.argument_layouts, - options, self.client, - compile_options.device_assignment, - compile_options.tuple_arguments) - - def get_default_device_assignment(self, num_replicas, num_partitions=None): - if num_partitions is not None: - return self.client.GetDefaultDeviceAssignment(num_replicas, - num_partitions) - else: - # TODO(henrytan): delete this case after all callers can handle 2D output - return self.client.GetDefaultDeviceAssignment(num_replicas) - - def serialize(self, executable): - return self.client.SerializeExecutable(executable) - - def deserialize(self, serialized_executable): - return self.client.DeserializeExecutable(serialized_executable, self.client) + return _tpu_client.TpuClient.Get(worker) diff --git a/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client_extension.cc b/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client_extension.cc index 83a3e5b3db9..f44d69656e6 100644 --- a/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client_extension.cc +++ b/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client_extension.cc @@ -32,12 +32,13 @@ PYBIND11_MODULE(tpu_client_extension, m) { py::class_>(m, "TpuClient") .def_static("Get", &PyTpuClient::Get, py::arg("worker")) + .def_property_readonly("platform", &PyTpuClient::platform_name) .def("device_count", &PyTpuClient::device_count) .def("local_device_count", &PyTpuClient::local_device_count) .def("devices", &PyTpuClient::devices) .def("local_devices", &PyTpuClient::local_devices) .def("host_id", &PyTpuClient::host_id) - .def("GetDefaultDeviceAssignment", + .def("get_default_device_assignment", [](PyTpuClient* client, int num_replicas, int num_partitions) -> StatusOr>>> { TF_ASSIGN_OR_RETURN(DeviceAssignment device_assignment, @@ -57,7 +58,7 @@ PYBIND11_MODULE(tpu_client_extension, m) { return result; }) // TODO(skye): delete after all callers can handle 2D output - .def("GetDefaultDeviceAssignment", + .def("get_default_device_assignment", [](PyTpuClient* client, int num_replicas) -> StatusOr>> { TF_ASSIGN_OR_RETURN(DeviceAssignment device_assignment, @@ -72,14 +73,14 @@ PYBIND11_MODULE(tpu_client_extension, m) { } return result; }) - .def("TransferToInfeed", + .def("transfer_to_infeed", [](PyTpuClient* client, const LiteralSlice& literal, int device_ordinal) { 
GlobalPyRefManager()->CollectGarbage(); py::gil_scoped_release gil_release; return client->TransferToInfeed(literal, device_ordinal); }) - .def("TransferFromOutfeed", + .def("transfer_from_outfeed", [](PyTpuClient* client, const Shape& shape, int device_ordinal) -> StatusOr { GlobalPyRefManager()->CollectGarbage(); @@ -91,16 +92,16 @@ PYBIND11_MODULE(tpu_client_extension, m) { literal_shared = std::make_shared(std::move(literal)); } return LiteralToPython(std::move(literal_shared)); - }); - - py::class_(m, "PyTpuBuffer") - .def_static( - "from_python", - [](const pybind11::object& argument, - std::shared_ptr client, - std::shared_ptr device) - -> StatusOr> { - CHECK(device != nullptr); + }) + .def( + "buffer_from_pyval", + [](std::shared_ptr client, + const pybind11::object& argument, std::shared_ptr device, + bool force_copy) -> StatusOr> { + if (device == nullptr) { + TF_RET_CHECK(!client->local_devices().empty()); + device = client->local_devices().front(); + } auto iter = client->id_to_device().find(device->id()); if (iter->second != device) { return InvalidArgument( @@ -124,7 +125,25 @@ PYBIND11_MODULE(tpu_client_extension, m) { return PyTpuBuffer::FromLiterals( std::move(leaves), tree.shape, std::move(py_buffer_ref), std::move(client), std::move(device)); - }) + }, + py::arg("argument"), py::arg("device") = nullptr, + py::arg("force_copy") = false) + .def( + "compile", + [](std::shared_ptr client, + const XlaComputation& computation, CompileOptions options) + -> StatusOr> { + py::gil_scoped_release gil_release; + return PyTpuExecutable::Compile( + computation, options.argument_layouts, + &options.executable_build_options, client, + options.parameter_is_tupled_arguments); + }, + py::arg("computation"), + py::arg("compile_options") = CompileOptions()); + + py::class_(m, "PyTpuBuffer") + .def_property_readonly("client", &PyTpuBuffer::client) .def("copy_to_device", [](PyTpuBuffer* buffer, std::shared_ptr dst_device) { CHECK(dst_device != nullptr); @@ -159,37 +178,21 @@ PYBIND11_MODULE(tpu_client_extension, m) { }); py::class_(m, "TpuExecutable") - .def_static("Compile", &PyTpuExecutable::Compile, - py::call_guard()) - .def_static("Compile", - [](const XlaComputation& computation, - absl::optional> argument_layouts, - const ExecutableBuildOptions* build_options, - std::shared_ptr client, - absl::optional>> - device_assignment, - bool tuple_arguments) - -> StatusOr> { - py::gil_scoped_release gil_release; - absl::optional xla_device_assignment; - if (device_assignment) { - TF_ASSIGN_OR_RETURN( - xla_device_assignment, - DevicesToDeviceAssignment(*device_assignment)); - } - return PyTpuExecutable::Compile( - computation, argument_layouts, build_options, client, - std::move(xla_device_assignment), tuple_arguments); - }) .def("local_logical_device_ids", &PyTpuExecutable::local_logical_device_ids) .def("local_devices", &PyTpuExecutable::local_devices) - .def("SizeOfGeneratedCodeInBytes", + .def_property_readonly("client", &PyTpuExecutable::client) + .def("size_of_generated_code_in_bytes", &PyTpuExecutable::SizeOfGeneratedCodeInBytes) .def("Delete", &PyTpuExecutable::Delete) .def("Execute", &PyTpuExecutable::Execute, py::call_guard(), py::arg("arguments")) .def("ExecuteOnLocalDevices", &PyTpuExecutable::ExecuteOnLocalDevices, + py::call_guard(), py::arg("arguments")) + .def("delete", &PyTpuExecutable::Delete) + .def("execute", &PyTpuExecutable::Execute, + py::call_guard(), py::arg("arguments")) + .def("execute_on_local_devices", &PyTpuExecutable::ExecuteOnLocalDevices, py::call_guard(), 
py::arg("arguments")); py::class_>(m, "TpuDevice") diff --git a/tensorflow/compiler/xla/python/types.h b/tensorflow/compiler/xla/python/types.h index 7a29f9dca28..673f403d91e 100644 --- a/tensorflow/compiler/xla/python/types.h +++ b/tensorflow/compiler/xla/python/types.h @@ -26,7 +26,7 @@ limitations under the License. #include "pybind11/pybind11.h" #include "pybind11/stl.h" #include "tensorflow/compiler/xla/literal.h" -#include "tensorflow/compiler/xla/python/local_client.h" +#include "tensorflow/compiler/xla/pjrt/pjrt_client.h" #include "tensorflow/compiler/xla/shape.h" #include "tensorflow/compiler/xla/status.h" #include "tensorflow/compiler/xla/statusor.h" @@ -38,16 +38,16 @@ namespace xla { // Custom holder types. // -// We must keep the PyLocalClient object alive as long as any of the runtime +// We must keep the PjRtClient object alive as long as any of the runtime // objects are alive. Since we don't have a lot of control over Python -// destructor ordering, we keep the PyLocalClient object as a std::shared_ptr<>, +// destructor ordering, we keep the PjRtClient object as a std::shared_ptr<>, // and ensure that each Python runtime object holds a reference to the -// PyLocalClient. An alternative design would be to keep a single global -// singleton PyLocalClient, although this seems less flexible, especially for +// PjRtClient. An alternative design would be to keep a single global +// singleton PjRtClient, although this seems less flexible, especially for // writing tests. // -// To maintain PyLocalClient references, we define pybind11 holder classes that -// are custom smart pointers that also keep a reference to a PyLocalClient. +// To maintain PjRtClient references, we define pybind11 holder classes that +// are custom smart pointers that also keep a reference to a PjRtClient. // pybind11 has a `keep_alive` feature that has a similar goal, but it doesn't // seem sufficiently flexible to describe ownership relationships in cases where // the ownership doesn't pertain to a direct argument or return value of a @@ -55,7 +55,7 @@ namespace xla { // objects that contain both a reference and a runtime class; holder classes // seem less tedious to define. -// A pair of a PyLocalClient reference and an unowned pointer to T. +// A pair of a PjRtClient reference and an unowned pointer to T. template struct ClientAndPtr { ClientAndPtr() = default; @@ -70,7 +70,7 @@ struct ClientAndPtr { ClientAndPtr& operator=(const ClientAndPtr&) = default; ClientAndPtr& operator=(ClientAndPtr&&) = default; - std::shared_ptr client; + std::shared_ptr client; T* contents; T* get() const { return contents; } @@ -81,7 +81,7 @@ struct ClientAndPtr { // By defining a templated helper function, we can use return type deduction // and avoid specifying types at the caller. template -ClientAndPtr WrapWithClient(std::shared_ptr client, +ClientAndPtr WrapWithClient(std::shared_ptr client, T* contents) { ClientAndPtr result; result.client = std::move(client); @@ -89,7 +89,7 @@ ClientAndPtr WrapWithClient(std::shared_ptr client, return result; } -// A pair of a PyLocalClient reference and an owned pointer to T. +// A pair of a PjRtClient reference and an owned pointer to T. 
template struct ClientAndUniquePtr { ClientAndUniquePtr() = default; @@ -103,7 +103,7 @@ struct ClientAndUniquePtr { ClientAndUniquePtr& operator=(const ClientAndUniquePtr&) = delete; ClientAndUniquePtr& operator=(ClientAndUniquePtr&&) = default; - std::shared_ptr client; + std::shared_ptr client; std::unique_ptr contents; T* get() const { return contents.get(); } @@ -112,7 +112,7 @@ struct ClientAndUniquePtr { }; template -ClientAndUniquePtr WrapWithClient(std::shared_ptr client, +ClientAndUniquePtr WrapWithClient(std::shared_ptr client, std::unique_ptr contents) { ClientAndUniquePtr result; result.client = std::move(client); diff --git a/tensorflow/compiler/xla/python/xla.cc b/tensorflow/compiler/xla/python/xla.cc index 1cdff854f21..c75586c92a7 100644 --- a/tensorflow/compiler/xla/python/xla.cc +++ b/tensorflow/compiler/xla/python/xla.cc @@ -24,6 +24,7 @@ limitations under the License. #include "absl/synchronization/mutex.h" #include "absl/types/optional.h" #include "absl/types/span.h" +#include "pybind11/attr.h" #include "pybind11/cast.h" #include "pybind11/numpy.h" #include "pybind11/pybind11.h" @@ -39,14 +40,14 @@ limitations under the License. #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/layout_util.h" +#include "tensorflow/compiler/xla/pjrt/cpu_device.h" +#include "tensorflow/compiler/xla/pjrt/distributed/client.h" +#include "tensorflow/compiler/xla/pjrt/distributed/distributed.h" +#include "tensorflow/compiler/xla/pjrt/distributed/service.h" +#include "tensorflow/compiler/xla/pjrt/nvidia_gpu_device.h" +#include "tensorflow/compiler/xla/pjrt/pjrt_client.h" #include "tensorflow/compiler/xla/python/bfloat16.h" -#include "tensorflow/compiler/xla/python/cpu_device.h" -#include "tensorflow/compiler/xla/python/distributed/client.h" -#include "tensorflow/compiler/xla/python/distributed/distributed.h" -#include "tensorflow/compiler/xla/python/distributed/service.h" #include "tensorflow/compiler/xla/python/dlpack.h" -#include "tensorflow/compiler/xla/python/local_client.h" -#include "tensorflow/compiler/xla/python/nvidia_gpu_device.h" #include "tensorflow/compiler/xla/python/python_ref_manager.h" #include "tensorflow/compiler/xla/python/types.h" #include "tensorflow/compiler/xla/service/custom_call_target_registry.h" @@ -62,15 +63,16 @@ limitations under the License. #include "tensorflow/compiler/xla/util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/platform/errors.h" -#include "tensorflow/core/profiler/lib/traceme.h" #include "tensorflow/core/profiler/rpc/profiler_server.h" +#include "tensorflow/python/profiler/internal/traceme_context_manager.h" #include "tensorflow/stream_executor/platform.h" namespace xla { +namespace { namespace py = pybind11; -namespace { +using ::tensorflow::profiler::TraceMeContextManager; struct Uniquer { absl::Mutex mu; @@ -161,21 +163,21 @@ Status PyRegisterCustomCallTarget(const std::string& fn_name, // Extra data to be kept alive by the consumer of the buffer protocol. struct ExtraBufferInfo { - explicit ExtraBufferInfo(PyLocalBuffer::ScopedHold device_buffer) + explicit ExtraBufferInfo(PjRtBuffer::ScopedHold device_buffer) : device_buffer(std::move(device_buffer)) {} std::string format; std::vector strides; - // We keep a reference to the SharedDeviceBuffer that backs the PyLocalBuffer. - // This prevents a use-after-free in the event that Delete() is called on - // a buffer with an live buffer protocol view. 
It does however mean that - // Delete() sometimes won't actually delete immediately. - PyLocalBuffer::ScopedHold device_buffer; + // We keep a reference to the TrackedDeviceBuffer that backs the + // PjRtBuffer. This prevents a use-after-free in the event that Delete() is + // called on a buffer with an live buffer protocol view. It does however mean + // that Delete() sometimes won't actually delete immediately. + PjRtBuffer::ScopedHold device_buffer; }; -int PyLocalBufferGetBuffer(PyObject* exporter, Py_buffer* view, int flags) { +int PjRtBufferGetBuffer(PyObject* exporter, Py_buffer* view, int flags) { auto& buffer = - py::reinterpret_borrow(exporter).cast(); + py::reinterpret_borrow(exporter).cast(); Status status = [&]() { // Py_buffer objects are POD C structures, so we don't need to hold the GIL. // Additionally we call BlockHostUntilReady() below, which may block. @@ -200,7 +202,7 @@ int PyLocalBufferGetBuffer(PyObject* exporter, Py_buffer* view, int flags) { if ((flags & PyBUF_WRITEABLE) == PyBUF_WRITEABLE) { return InvalidArgument("XLA buffers are read-only."); } - PyLocalBuffer::ScopedHold device_buffer( + PjRtBuffer::ScopedHold device_buffer( buffer.GetBufferWithExternalReference()); if (!device_buffer.status().ok()) { return InvalidArgument("Deleted buffer used in buffer protocol."); @@ -257,22 +259,21 @@ int PyLocalBufferGetBuffer(PyObject* exporter, Py_buffer* view, int flags) { return 0; } -void PyLocalBufferReleaseBuffer(PyObject*, Py_buffer* buffer) { +void PjRtBufferReleaseBuffer(PyObject*, Py_buffer* buffer) { auto extra = static_cast(buffer->internal); delete extra; } -PyBufferProcs PyLocalBufferProcs = []() { +PyBufferProcs PjRtBufferProcs = []() { PyBufferProcs procs; - procs.bf_getbuffer = &PyLocalBufferGetBuffer; - procs.bf_releasebuffer = &PyLocalBufferReleaseBuffer; + procs.bf_getbuffer = &PjRtBufferGetBuffer; + procs.bf_releasebuffer = &PjRtBufferReleaseBuffer; return procs; }(); // Implementation of the CUDA array interface for sharing GPU buffers with other // Python libraries. -StatusOr PyLocalBufferCudaArrayInterface( - const PyLocalBuffer& buffer) { +StatusOr PjRtBufferCudaArrayInterface(const PjRtBuffer& buffer) { if (buffer.device()->local_device_state()->executor()->platform_kind() != se::PlatformKind::kCuda) { return InvalidArgument( @@ -310,36 +311,61 @@ void BuildOpsSubmodule(py::module* m) { // XlaBuilder. 
py::module ops = m->def_submodule("ops", "XLA operations"); - ops.def("AfterAll", &AfterAll); + py::enum_( + ops, "TriangularSolveOptions_Transpose") + .value("TRANSPOSE_INVALID", TriangularSolveOptions::TRANSPOSE_INVALID) + .value("NO_TRANSPOSE", TriangularSolveOptions::NO_TRANSPOSE) + .value("TRANSPOSE", TriangularSolveOptions::TRANSPOSE) + .value("ADJOINT", TriangularSolveOptions::ADJOINT); + + ops.def("AfterAll", &AfterAll, py::arg("builder"), py::arg("tokens")); ops.def( "AllReduce", static_cast, const absl::optional&, const absl::optional&)>( - &AllReduce)); - ops.def("AllToAll", &AllToAll); - ops.def("CollectivePermute", &CollectivePermute); - ops.def("CreateToken", &CreateToken); + &AllReduce), + py::arg("operand"), py::arg("computation"), + py::arg("replica_groups") = py::list(), + py::arg("channel_id") = absl::nullopt, + py::arg("shape_with_layout") = absl::nullopt); + ops.def("AllToAll", &AllToAll, py::arg("operand"), py::arg("split_dimension"), + py::arg("concat_dimension"), py::arg("split_count"), + py::arg("replica_groups") = py::list(), + py::arg("layout") = absl::nullopt); + ops.def("CollectivePermute", &CollectivePermute, py::arg("operand"), + py::arg("source_target_pairs")); + ops.def("CreateToken", &CreateToken, py::arg("builder")); ops.def("CrossReplicaSum", static_cast)>( - &CrossReplicaSum)); + &CrossReplicaSum), + py::arg("operand"), py::arg("replica_groups") = py::list()); ops.def("BitcastConvertType", &BitcastConvertType, py::arg("operand"), py::arg("new_element_type")); ops.def("Broadcast", &Broadcast, py::arg("operand"), py::arg("sizes")); ops.def("BroadcastInDim", &BroadcastInDim, py::arg("operand"), py::arg("shape"), py::arg("broadcast_dimensions")); - ops.def("Call", &Call); + ops.def("Call", &Call, py::arg("builder"), py::arg("computation"), + py::arg("operands")); ops.def("Cholesky", &Cholesky, py::arg("a"), py::arg("lower") = true); - ops.def("Clamp", &Clamp); + ops.def("Clamp", &Clamp, py::arg("min"), py::arg("operand"), py::arg("max")); ops.def("Collapse", &Collapse, py::arg("operand"), py::arg("dimensions")); - ops.def("ConcatInDim", &ConcatInDim); + ops.def("ConcatInDim", &ConcatInDim, py::arg("builder"), py::arg("operands"), + py::arg("dimension")); ops.def("Conditional", static_cast, - absl::Span)>(&Conditional)); + absl::Span)>(&Conditional), + py::arg("branch_index"), py::arg("branch_computations"), + py::arg("branch_operands")); ops.def("Conditional", static_cast(&Conditional)); - ops.def("ConstantLiteral", &ConstantLiteral); + const XlaComputation&)>(&Conditional), + py::arg("predicate"), py::arg("true_operand"), + py::arg("true_computation"), py::arg("false_operand"), + py::arg("false_computation")); + ops.def("Constant", &ConstantLiteral, py::arg("builder"), py::arg("literal")); + ops.def("ConstantLiteral", &ConstantLiteral, py::arg("builder"), + py::arg("literal")); ops.def("ConvGeneralDilated", &ConvGeneralDilated, py::arg("lhs"), py::arg("rhs"), py::arg("window_strides"), py::arg("padding"), py::arg("lhs_dilation"), py::arg("rhs_dilation"), @@ -348,48 +374,80 @@ void BuildOpsSubmodule(py::module* m) { py::arg("precision_config") = nullptr); ops.def("ConvertElementType", &ConvertElementType, py::arg("operand"), py::arg("new_element_type")); - ops.def("CustomCall", &CustomCall); - ops.def("CustomCallWithLayout", &CustomCallWithLayout); + ops.def( + "CustomCall", + [](XlaBuilder* builder, const py::bytes& call_target_name, + absl::Span operands, const Shape& shape, + const py::bytes& opaque) -> XlaOp { + return CustomCall(builder, 
call_target_name, operands, shape, opaque); + }, + py::arg("builder"), py::arg("call_target_name"), py::arg("operands"), + py::arg("shape"), py::arg("opaque") = py::bytes("")); + ops.def( + "CustomCallWithLayout", + [](XlaBuilder* builder, const py::bytes& call_target_name, + absl::Span operands, const Shape& shape_with_layout, + absl::Span operand_shapes_with_layout, + const py::bytes& opaque) -> XlaOp { + return CustomCallWithLayout(builder, call_target_name, operands, + shape_with_layout, + operand_shapes_with_layout, opaque); + }, + py::arg("builder"), py::arg("call_target_name"), py::arg("operands"), + py::arg("shape_with_layout"), py::arg("operand_shapes_with_layout"), + py::arg("opaque") = py::bytes("")); ops.def("Dot", &Dot, py::arg("lhs"), py::arg("rhs"), py::arg("precision_config") = nullptr); ops.def("DotGeneral", &DotGeneral, py::arg("lhs"), py::arg("rhs"), py::arg("dimension_numbers"), py::arg("precision_config") = nullptr); ops.def("DynamicSlice", static_cast, - absl::Span)>(&DynamicSlice)); + absl::Span)>(&DynamicSlice), + py::arg("operand"), py::arg("start_indices"), py::arg("slice_sizes")); ops.def("DynamicUpdateSlice", static_cast)>( - &DynamicUpdateSlice)); + &DynamicUpdateSlice), + py::arg("operand"), py::arg("update"), py::arg("start_indices")); - ops.def("Fft", &Fft); + ops.def("Fft", &Fft, py::arg("operand"), py::arg("fft_type"), + py::arg("fft_length")); ops.def("Gather", &Gather, py::arg("a"), py::arg("start_indices"), py::arg("dimension_numbers"), py::arg("slice_sizes"), - py::arg("indices_are_sorted")); - ops.def("GetTupleElement", &GetTupleElement); + py::arg("indices_are_sorted") = false); + ops.def("GetTupleElement", &GetTupleElement, py::arg("tuple_data"), + py::arg("index")); ops.def("InfeedWithToken", &InfeedWithToken, py::arg("token"), py::arg("shape"), py::arg("config") = ""); ops.def("Iota", - static_cast(&Iota)); + static_cast(&Iota), + py::arg("builder"), py::arg("shape"), py::arg("iota_dimension")); ops.def("Iota", - static_cast(&Iota)); - ops.def("Map", &Map); - ops.def("NextAfter", &NextAfter); + static_cast(&Iota), + py::arg("builder"), py::arg("type"), py::arg("size")); + ops.def("Map", &Map, py::arg("builder"), py::arg("operands"), + py::arg("computation"), py::arg("dimensions"), + py::arg("static_operands") = py::list()); + ops.def("NextAfter", &NextAfter, py::arg("from"), py::arg("to")); ops.def("OutfeedWithToken", &OutfeedWithToken, py::arg("operand"), py::arg("token"), py::arg("shape_with_layout"), py::arg("outfeed_config") = ""); - ops.def("Pad", &Pad); - ops.def("Parameter", static_cast(&Parameter)); + ops.def("Pad", &Pad, py::arg("operand"), py::arg("padding_value"), + py::arg("padding_config")); ops.def("Parameter", static_cast&)>( - &Parameter)); - ops.def("QR", - [](XlaOp a, bool full_matrices) -> StatusOr> { - TF_ASSIGN_OR_RETURN(auto qr, QRDecomposition(a, full_matrices)); - return std::make_pair(qr.q, qr.r); - }); + &Parameter), + py::arg("builder"), py::arg("parameter_number"), py::arg("shape"), + py::arg("name") = "", + py::arg("replicated_at_leaf_buffers") = std::vector()); + ops.def( + "QR", + [](XlaOp a, bool full_matrices) -> StatusOr> { + TF_ASSIGN_OR_RETURN(auto qr, QRDecomposition(a, full_matrices)); + return std::make_pair(qr.q, qr.r); + }, + py::arg("operand"), py::arg("full_matrices")); ops.def( "Eigh", [](XlaOp a, bool lower, int64 max_iter, @@ -410,29 +468,49 @@ void BuildOpsSubmodule(py::module* m) { ops.def("Reduce", static_cast, absl::Span, const XlaComputation&, - absl::Span)>(&Reduce)); + absl::Span)>(&Reduce), + 
py::arg("builder"), py::arg("operands"), py::arg("init_values"), + py::arg("computation"), py::arg("dimensions_to_reduce")); ops.def("ReducePrecision", &ReducePrecision, py::arg("operand"), py::arg("exponent_bits"), py::arg("mantissa_bits")); - ops.def("ReduceWindowWithGeneralPadding", &ReduceWindowWithGeneralPadding); - ops.def("ReplicaId", &ReplicaId); - ops.def("Reshape", static_cast, - absl::Span)>(&Reshape)); + ops.def("ReduceWindowWithGeneralPadding", &ReduceWindowWithGeneralPadding, + py::arg("operand"), py::arg("init_value"), py::arg("computation"), + py::arg("window_dimensions"), py::arg("window_strides"), + py::arg("base_dilations"), py::arg("window_dilations"), + py::arg("padding")); + ops.def("ReplicaId", &ReplicaId, py::arg("builder")); ops.def("Reshape", - static_cast)>(&Reshape)); + static_cast, + absl::Span)>(&Reshape), + py::arg("operand"), py::arg("dimensions"), py::arg("new_sizes")); + ops.def("Reshape", + static_cast)>(&Reshape), + py::arg("operand"), py::arg("new_sizes")); ops.def("Rev", &Rev, py::arg("operand"), py::arg("dimensions")); - ops.def("RngNormal", &RngNormal); - ops.def("RngUniform", &RngUniform); - ops.def("Scatter", &Scatter); - ops.def("Select", &Select); + ops.def("RngNormal", &RngNormal, py::arg("mu"), py::arg("sigma"), + py::arg("shape")); + ops.def("RngUniform", &RngUniform, py::arg("a"), py::arg("b"), + py::arg("shape")); + ops.def("Scatter", &Scatter, py::arg("input"), py::arg("scatter_indices"), + py::arg("updates"), py::arg("update_computation"), + py::arg("dimension_numbers"), py::arg("indices_are_sorted") = false, + py::arg("unique_indices") = false); + ops.def("Select", &Select, py::arg("pred"), py::arg("on_true"), + py::arg("on_false")); ops.def("SelectAndScatterWithGeneralPadding", - &SelectAndScatterWithGeneralPadding); - ops.def("Slice", &Slice); + &SelectAndScatterWithGeneralPadding, py::arg("operand"), + py::arg("select"), py::arg("window_dimensions"), + py::arg("window_strides"), py::arg("padding"), py::arg("source"), + py::arg("init_value"), py::arg("scatter")); + ops.def("Slice", &Slice, py::arg("operand"), py::arg("start_indices"), + py::arg("limit_indices"), py::arg("strides")); ops.def("SliceInDim", &SliceInDim, py::arg("operand"), py::arg("start_index"), py::arg("limit_index"), py::arg("stride"), py::arg("dimno")); ops.def( "Sort", - [](XlaBuilder* builder, absl::Span operands, int64 dimension, - absl::optional comparator) -> XlaOp { + [](XlaBuilder* builder, absl::Span operands, + absl::optional comparator, int64 dimension, + bool is_stable) -> XlaOp { return builder->ReportErrorOrReturn([&]() -> StatusOr { std::vector operand_types; for (const auto& operand : operands) { @@ -441,27 +519,32 @@ void BuildOpsSubmodule(py::module* m) { } if (comparator) { - return Sort(operands, **comparator, dimension); + return Sort(operands, **comparator, dimension, is_stable); } else { return Sort(operands, CreateScalarLtComputation(operand_types, builder), - dimension); + dimension, is_stable); } }); }, - py::arg("builder"), py::arg("operands"), py::arg("dimension") = -1, - py::arg("comparator") = absl::nullopt); + py::arg("builder"), py::arg("operands"), + py::arg("comparator") = absl::nullopt, py::arg("dimension") = -1, + py::arg("is_stable") = false); ops.def("TopK", &TopK, py::arg("input"), py::arg("k")); - ops.def("Transpose", &Transpose); - ops.def("TriangularSolve", &TriangularSolve); - ops.def("Tuple", &Tuple); - ops.def("While", &While); + ops.def("Transpose", &Transpose, py::arg("operand"), py::arg("permutation")); + 
ops.def("TriangularSolve", &TriangularSolve, py::arg("a"), py::arg("b"), + py::arg("left_side"), py::arg("lower"), py::arg("unit_diagonal"), + py::arg("transpose_a")); + ops.def("Tuple", &Tuple, py::arg("builder"), py::arg("elements")); + ops.def("While", &While, py::arg("condition"), py::arg("body"), + py::arg("init")); - ops.def("Igamma", &Igamma); - ops.def("Igammac", &Igammac); - ops.def("IgammaGradA", &IgammaGradA); - ops.def("RandomGammaGrad", &RandomGammaGrad); - ops.def("RegularizedIncompleteBeta", &RegularizedIncompleteBeta); + ops.def("Igamma", &Igamma, py::arg("a"), py::arg("x")); + ops.def("Igammac", &Igammac, py::arg("a"), py::arg("x")); + ops.def("IgammaGradA", &IgammaGradA, py::arg("a"), py::arg("x")); + ops.def("RandomGammaGrad", &RandomGammaGrad, py::arg("a"), py::arg("x")); + ops.def("RegularizedIncompleteBeta", &RegularizedIncompleteBeta, py::arg("a"), + py::arg("b"), py::arg("x")); #define BINARY_OP(op) \ ops.def( \ @@ -539,43 +622,6 @@ void BuildOpsSubmodule(py::module* m) { #undef UNARY_OP } -// Helper to implement TraceMe as a context manager in Python. -class TraceMeContextManager { - public: - explicit TraceMeContextManager(py::str name, py::kwargs kwargs) - : name_(std::move(name)), kwargs_(std::move(kwargs)) {} - - void Enter() { - if (IsEnabled()) { - std::string name(name_); - if (!kwargs_.empty()) { - absl::StrAppend(&name, "#"); - bool first = true; - for (const auto& entry : kwargs_) { - absl::StrAppend(&name, first ? "" : ",", - std::string(py::str(entry.first)), "=", - std::string(py::str(entry.second))); - first = false; - } - absl::StrAppend(&name, "#"); - } - traceme_.emplace(std::move(name)); - } - } - py::object Exit(const py::object& ex_type, const py::object& ex_value, - const py::object& traceback) { - traceme_.reset(); - return py::none(); - } - - static bool IsEnabled() { return tensorflow::profiler::TraceMe::Active(); } - - private: - py::str name_; - py::kwargs kwargs_; - absl::optional traceme_; -}; - void BuildProfilerSubmodule(py::module* m) { py::module profiler = m->def_submodule("profiler", "TensorFlow profiler integration"); @@ -591,11 +637,23 @@ void BuildProfilerSubmodule(py::module* m) { }, py::arg("port")); - py::class_ traceme_class(profiler, "TraceMe"); + py::class_ traceme_class(profiler, "TraceMe", + py::module_local()); traceme_class.def(py::init()) - .def("__enter__", &TraceMeContextManager::Enter) - .def("__exit__", &TraceMeContextManager::Exit) - .def_static("IsEnabled", &TraceMeContextManager::IsEnabled); + .def("__enter__", + [](py::object self) -> py::object { + py::cast(self)->Enter(); + return self; + }) + .def("__exit__", + [](py::object self, const py::object& ex_type, + const py::object& ex_value, + const py::object& traceback) -> py::object { + py::cast(self)->Exit(); + return py::none(); + }) + .def("set_metadata", &TraceMeContextManager::SetMetadata) + .def_static("is_enabled", &TraceMeContextManager::IsEnabled); } } // namespace @@ -784,6 +842,53 @@ PYBIND11_MODULE(xla_extension, m) { .def("computation_count", &DeviceAssignment::computation_count) .def("__repr__", &DeviceAssignment::ToString); + py::class_ compile_options(m, "CompileOptions"); + compile_options + .def(py::init([]() -> CompileOptions { + CompileOptions options; + DebugOptions* debug_options = + options.executable_build_options.mutable_debug_options(); + // Sets fast-math-disabling default options expected by JAX. 
+ debug_options->set_xla_cpu_enable_fast_min_max(false); + debug_options->set_xla_gpu_enable_fast_min_max(false); + return options; + })) + .def_readwrite("argument_layouts", &CompileOptions::argument_layouts) + .def_readwrite("parameter_is_tupled_arguments", + &CompileOptions::parameter_is_tupled_arguments) + .def_readonly("executable_build_options", + &CompileOptions::executable_build_options) + // TODO(phawkins): the following fields exist for backward compatibility. + // Remove them after JAX has been updated not to use them. + .def_readwrite("tuple_arguments", + &CompileOptions::parameter_is_tupled_arguments) + .def_property( + "num_replicas", + [](const CompileOptions& options) { + return options.executable_build_options.num_replicas(); + }, + [](CompileOptions& options, int num_replicas) { + options.executable_build_options.set_num_replicas(num_replicas); + }) + .def_property( + "num_partitions", + [](const CompileOptions& options) { + return options.executable_build_options.num_partitions(); + }, + [](CompileOptions& options, int num_partitions) { + options.executable_build_options.set_num_partitions(num_partitions); + }) + .def_property( + "device_assignment", + [](const CompileOptions& options) { + return options.executable_build_options.device_assignment(); + }, + [](CompileOptions& options, + const DeviceAssignment& device_assignment) { + options.executable_build_options.set_device_assignment( + device_assignment); + }); + py::class_>( m, "Device", "A descriptor of an available device.\n\nSubclasses are used to " @@ -797,8 +902,12 @@ PYBIND11_MODULE(xla_extension, m) { "Integer ID of this device's host.\n\n" "This is always 0 except on multi-host platforms.") .def_property_readonly("platform", &Device::platform_name) + .def_property_readonly("device_kind", &Device::device_kind) + .def_property_readonly( + "client", + [](const ClientAndPtr& device) { return device.client; }) .def("__str__", &Device::DebugString) - .def("TransferToInfeed", + .def("transfer_to_infeed", [](const Device& device, const LiteralSlice& literal) { GlobalPyRefManager()->CollectGarbage(); py::gil_scoped_release gil_release; @@ -808,7 +917,7 @@ PYBIND11_MODULE(xla_extension, m) { literal, local_device->device_ordinal()); }) .def( - "TransferFromOutfeed", + "transfer_from_outfeed", [](const Device& device, const Shape& shape) -> StatusOr { GlobalPyRefManager()->CollectGarbage(); std::shared_ptr literal_shared; @@ -816,10 +925,17 @@ PYBIND11_MODULE(xla_extension, m) { py::gil_scoped_release gil_release; TF_ASSIGN_OR_RETURN(LocalDeviceState * local_device, device.GetLocalDeviceState()); + Shape shape_with_layout = shape; + ShapeUtil::ForEachMutableSubshape( + &shape_with_layout, [](Shape* subshape, const ShapeIndex&) { + if (!subshape->has_layout()) { + LayoutUtil::SetToDefaultLayout(subshape); + } + }); TF_ASSIGN_OR_RETURN( Literal literal, local_device->client()->TransferFromOutfeedLocal( - shape, local_device->device_ordinal())); + shape_with_layout, local_device->device_ordinal())); literal_shared = std::make_shared(std::move(literal)); } @@ -839,7 +955,7 @@ PYBIND11_MODULE(xla_extension, m) { // Local XLA client methods. // Custom-call targets. 
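CompileOptions is now constructed on the C++ side with the fast-min-max flags disabled by default, and the replica/partition counts and device assignment forward to executable_build_options. A short sketch of driving it from Python, using only the properties bound above (illustrative, not part of the patch):

from tensorflow.compiler.xla.python import xla_extension as _xla

options = _xla.CompileOptions()
options.num_replicas = 1                       # stored on executable_build_options
options.num_partitions = 1
options.parameter_is_tupled_arguments = False  # 'tuple_arguments' is the legacy alias
# The initializer above already set xla_cpu/gpu_enable_fast_min_max to False.
build_options = options.executable_build_options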
- m.def("RegisterCustomCallTarget", &PyRegisterCustomCallTarget); + m.def("register_custom_call_target", &PyRegisterCustomCallTarget); py::class_ alloc_config(m, "GpuAllocatorConfig"); alloc_config.def(py::init<>()) @@ -851,11 +967,13 @@ PYBIND11_MODULE(xla_extension, m) { .value("PLATFORM", GpuAllocatorConfig::Kind::kPlatform) .value("BFC", GpuAllocatorConfig::Kind::kBFC); - py::class_>(m, "LocalClient") - .def("device_count", &PyLocalClient::device_count) - .def("local_device_count", &PyLocalClient::local_device_count) + py::class_> py_local_client( + m, "LocalClient"); + py_local_client.def_property_readonly("platform", &PjRtClient::platform_name) + .def("device_count", &PjRtClient::device_count) + .def("local_device_count", &PjRtClient::local_device_count) .def("devices", - [](std::shared_ptr client) { + [](std::shared_ptr client) { std::vector> devices; devices.reserve(client->devices().size()); for (const auto& device : client->devices()) { @@ -864,7 +982,7 @@ PYBIND11_MODULE(xla_extension, m) { return devices; }) .def("local_devices", - [](std::shared_ptr client) { + [](std::shared_ptr client) { std::vector> devices; devices.reserve(client->local_devices().size()); for (Device* device : client->local_devices()) { @@ -872,9 +990,9 @@ PYBIND11_MODULE(xla_extension, m) { } return devices; }) - .def("host_id", &PyLocalClient::host_id) - .def("GetDefaultDeviceAssignment", - [](std::shared_ptr client, int num_replicas, + .def("host_id", &PjRtClient::host_id) + .def("get_default_device_assignment", + [](std::shared_ptr client, int num_replicas, int num_partitions) -> StatusOr>>> { TF_ASSIGN_OR_RETURN(DeviceAssignment device_assignment, @@ -894,8 +1012,8 @@ PYBIND11_MODULE(xla_extension, m) { return result; }) // TODO(skye): delete after all callers can handle 2D output - .def("GetDefaultDeviceAssignment", - [](std::shared_ptr client, + .def("get_default_device_assignment", + [](std::shared_ptr client, int num_replicas) -> StatusOr>> { TF_ASSIGN_OR_RETURN(DeviceAssignment device_assignment, client->GetDefaultDeviceAssignment( @@ -909,17 +1027,67 @@ PYBIND11_MODULE(xla_extension, m) { } return result; }) - .def("CreateChannelHandle", - [](PyLocalClient* client) { + .def("create_channel_handle", + [](PjRtClient* client) { return client->client()->CreateChannelHandle(); }) - .def("CreateDeviceToHostChannelHandle", - [](PyLocalClient* client) { + .def("create_device_to_host_channel_handle", + [](PjRtClient* client) { return client->client()->CreateDeviceToHostChannelHandle(); }) - .def("CreateHostToDeviceChannelHandle", [](PyLocalClient* client) { + .def("create_host_to_device_channel_handle", [](PjRtClient* client) { return client->client()->CreateHostToDeviceChannelHandle(); }); + py_local_client.def( + "buffer_from_pyval", + [](std::shared_ptr client, const pybind11::object& argument, + Device* device, + bool force_copy) -> StatusOr> { + if (device == nullptr) { + TF_RET_CHECK(!client->local_devices().empty()); + device = client->local_devices().front(); + } + CHECK(device != nullptr); + auto iter = client->id_to_device().find(device->id()); + if (iter->second != device) { + return InvalidArgument( + "Cannot copy value to device '%s' with '%s' backend", + device->DebugString(), client->platform_name()); + } + GlobalPyRefManager()->CollectGarbage(); + + absl::optional c = CastToArray(argument); + if (!c) { + return InvalidArgument("from_python argument must be an array."); + } + + TF_ASSIGN_OR_RETURN(PythonBufferTree tree, + GetPythonBufferTree(argument)); + std::shared_ptr py_buffer_ref = + 
GlobalPyRefManager()->ManageReference(std::move(c->array)); + + py::gil_scoped_release gil_release; + TF_ASSIGN_OR_RETURN( + std::unique_ptr buffer, + PjRtBuffer::FromHostBuffer(c->buf_ptr, c->shape, force_copy, + std::move(py_buffer_ref), client.get(), + device)); + return WrapWithClient(std::move(client), std::move(buffer)); + }, + py::arg("argument"), py::arg("device") = nullptr, + py::arg("force_copy") = false); + py_local_client.def( + "compile", + [](std::shared_ptr client, const XlaComputation& computation, + CompileOptions options) + -> StatusOr> { + py::gil_scoped_release gil_release; + TF_ASSIGN_OR_RETURN(std::unique_ptr executable, + PjRtExecutable::Compile(computation, client.get(), + std::move(options))); + return WrapWithClient(std::move(client), std::move(executable)); + }, + py::arg("computation"), py::arg("compile_options") = CompileOptions()); m.def("get_cpu_client", &GetCpuClient, py::arg("asynchronous") = true); m.def("get_nvidia_gpu_client", &GetNvidiaGpuClient, @@ -927,67 +1095,33 @@ PYBIND11_MODULE(xla_extension, m) { py::arg("allocator_config") = GpuAllocatorConfig(), py::arg("distributed_client") = nullptr, py::arg("node_id") = 0); - py::class_> buffer( + py::class_> buffer( m, "PyLocalBuffer"); buffer - .def_static( - "from_python", - [](const pybind11::object& argument, - std::shared_ptr client, Device* device, - bool force_copy) -> StatusOr> { - CHECK(device != nullptr); - auto iter = client->id_to_device().find(device->id()); - if (iter->second != device) { - return InvalidArgument( - "Cannot copy value to device '%s' with '%s' backend", - device->DebugString(), client->platform_name()); - } - GlobalPyRefManager()->CollectGarbage(); - - absl::optional c = CastToArray(argument); - if (!c) { - return InvalidArgument("from_python argument must be an array."); - } - - TF_ASSIGN_OR_RETURN(PythonBufferTree tree, - GetPythonBufferTree(argument)); - std::shared_ptr py_buffer_ref = - GlobalPyRefManager()->ManageReference(std::move(c->array)); - - py::gil_scoped_release gil_release; - TF_ASSIGN_OR_RETURN( - std::unique_ptr buffer, - PyLocalBuffer::FromHostBuffer(c->buf_ptr, c->shape, force_copy, - std::move(py_buffer_ref), - client.get(), device)); - return WrapWithClient(std::move(client), std::move(buffer)); - }, - py::arg("argument"), py::arg("client"), py::arg("device"), - py::arg("force_copy") = false) .def("copy_to_device", - [](PyLocalBuffer* buffer, const ClientAndPtr& dst_device) - -> StatusOr> { + [](PjRtBuffer* buffer, const ClientAndPtr& dst_device) + -> StatusOr> { CHECK(dst_device.get() != nullptr); GlobalPyRefManager()->CollectGarbage(); py::gil_scoped_release gil_release; - TF_ASSIGN_OR_RETURN(std::unique_ptr out, + TF_ASSIGN_OR_RETURN(std::unique_ptr out, buffer->CopyToDevice(dst_device.get())); return WrapWithClient(dst_device.client, std::move(out)); }) - .def("delete", &PyLocalBuffer::Delete) + .def("delete", &PjRtBuffer::Delete) .def("block_host_until_ready", - [](PyLocalBuffer* buffer) { + [](PjRtBuffer* buffer) { GlobalPyRefManager()->CollectGarbage(); py::gil_scoped_release gil_release; return buffer->BlockHostUntilReady(); }) - .def("copy_to_host_async", &PyLocalBuffer::CopyToHostAsync, + .def("copy_to_host_async", &PjRtBuffer::CopyToHostAsync, py::call_guard()) .def( "to_py", [](py::object buffer_obj) -> StatusOr { GlobalPyRefManager()->CollectGarbage(); - PyLocalBuffer* buffer = buffer_obj.cast(); + PjRtBuffer* buffer = buffer_obj.cast(); LocalDeviceState* state = buffer->device()->local_device_state(); if (state->executor()->platform_kind() == 
se::PlatformKind::kHost && buffer->on_device_shape().IsArray() && @@ -1005,17 +1139,20 @@ PYBIND11_MODULE(xla_extension, m) { } return LiteralToPython(std::move(literal)); }) - .def("shape", &PyLocalBuffer::on_host_shape) + .def("shape", &PjRtBuffer::on_host_shape) + .def_property_readonly("client", + [](const PjRtBuffer& buffer) { + return buffer.client()->shared_from_this(); + }) .def("device", - [](const PyLocalBuffer& buffer) { + [](const PjRtBuffer& buffer) { return WrapWithClient(buffer.client()->shared_from_this(), buffer.device()); }) - .def("platform", &PyLocalBuffer::platform_name) - .def("is_deleted", - [](PyLocalBuffer* buffer) { return buffer->IsDeleted(); }) + .def("platform", &PjRtBuffer::platform_name) + .def("is_deleted", [](PjRtBuffer* buffer) { return buffer->IsDeleted(); }) .def("unsafe_buffer_pointer", - [](const PyLocalBuffer& buffer) -> StatusOr { + [](const PjRtBuffer& buffer) -> StatusOr { TF_ASSIGN_OR_RETURN(ShapedBuffer shaped_buffer, buffer.AsShapedBuffer()); if (shaped_buffer.on_device_shape().IsTuple()) { @@ -1027,76 +1164,24 @@ PYBIND11_MODULE(xla_extension, m) { shaped_buffer.root_buffer().opaque()); }) .def_property_readonly("__cuda_array_interface__", - &PyLocalBufferCudaArrayInterface); + &PjRtBufferCudaArrayInterface); // pybind11's implementation of the buffer protocol doesn't allow for correct // error handling. We bypass it and implement the buffer protocol ourselves. PyTypeObject* buffer_type = reinterpret_cast(buffer.ptr()); - buffer_type->tp_as_buffer = &PyLocalBufferProcs; + buffer_type->tp_as_buffer = &PjRtBufferProcs; - py::class_> - executable(m, "LocalExecutable"); + py::class_> executable( + m, "LocalExecutable"); executable - .def_static("Compile", - [](const XlaComputation& computation, - absl::optional> argument_layouts, - const ExecutableBuildOptions* build_options, - std::shared_ptr client, - absl::optional device_assignment, - bool tuple_arguments) - -> StatusOr> { - py::gil_scoped_release gil_release; - CompileOptions options; - options.argument_layouts = std::move(argument_layouts); - if (build_options) { - options.executable_build_options = *build_options; - } - options.tuple_arguments = tuple_arguments; - if (device_assignment) { - options.executable_build_options.set_device_assignment( - *device_assignment); - } - TF_ASSIGN_OR_RETURN( - std::unique_ptr executable, - PyLocalExecutable::Compile(computation, client.get(), - std::move(options))); - return WrapWithClient(std::move(client), - std::move(executable)); - }) - .def_static("Compile", - [](const XlaComputation& computation, - absl::optional> argument_layouts, - const ExecutableBuildOptions* build_options, - std::shared_ptr client, - absl::optional>> - device_assignment, - bool tuple_arguments) - -> StatusOr> { - py::gil_scoped_release gil_release; - CompileOptions options; - options.argument_layouts = std::move(argument_layouts); - if (build_options) { - options.executable_build_options = *build_options; - } - options.tuple_arguments = tuple_arguments; - if (device_assignment) { - TF_ASSIGN_OR_RETURN( - DeviceAssignment xla_assignment, - DevicesToDeviceAssignment(*device_assignment)); - options.executable_build_options.set_device_assignment( - xla_assignment); - } - TF_ASSIGN_OR_RETURN( - std::unique_ptr executable, - PyLocalExecutable::Compile(computation, client.get(), - std::move(options))); - return WrapWithClient(std::move(client), - std::move(executable)); - }) + .def_property_readonly("client", + [](const PjRtExecutable& executable) { + return 
executable.client()->shared_from_this(); + }) .def("local_logical_device_ids", - &PyLocalExecutable::local_logical_device_ids) + &PjRtExecutable::local_logical_device_ids) .def("local_devices", - [](const PyLocalExecutable& executable) { + [](const PjRtExecutable& executable) { std::vector> devices; devices.reserve(executable.local_devices().size()); for (Device* device : executable.local_devices()) { @@ -1105,21 +1190,21 @@ PYBIND11_MODULE(xla_extension, m) { } return devices; }) - .def("SizeOfGeneratedCodeInBytes", - &PyLocalExecutable::SizeOfGeneratedCodeInBytes) - .def("Delete", &PyLocalExecutable::Delete) + .def("size_of_generated_code_in_bytes", + &PjRtExecutable::SizeOfGeneratedCodeInBytes) + .def("delete", &PjRtExecutable::Delete) .def( - "Execute", - [](const PyLocalExecutable& executable, - absl::Span args) - -> StatusOr>> { + "execute", + [](const PjRtExecutable& executable, + absl::Span args) + -> StatusOr>> { py::gil_scoped_release gil_release; ExecuteOptions options; options.untuple_result = true; TF_ASSIGN_OR_RETURN( - std::vector> output_buffers, + std::vector> output_buffers, executable.Execute(args, options)); - std::vector> outputs; + std::vector> outputs; outputs.reserve(output_buffers.size()); for (auto& buffer : output_buffers) { outputs.push_back(WrapWithClient( @@ -1129,19 +1214,19 @@ PYBIND11_MODULE(xla_extension, m) { }, py::arg("arguments")) .def( - "ExecuteOnLocalDevices", - [](const PyLocalExecutable& executable, - absl::Span> args) + "execute_on_local_devices", + [](const PjRtExecutable& executable, + absl::Span> args) -> StatusOr< - std::vector>>> { + std::vector>>> { py::gil_scoped_release gil_release; ExecuteOptions options; options.untuple_result = true; TF_ASSIGN_OR_RETURN( - std::vector>> + std::vector>> output_buffers, executable.ExecuteOnLocalDevices(args, options)); - std::vector>> outputs; + std::vector>> outputs; outputs.resize(output_buffers.size()); for (int computation = 0; computation < output_buffers.size(); ++computation) { @@ -1155,8 +1240,8 @@ PYBIND11_MODULE(xla_extension, m) { }, py::arg("arguments")) .def( - "get_hlo_modules", - [](const PyLocalExecutable& executable) + "hlo_modules", + [](const PjRtExecutable& executable) -> StatusOr>> { std::vector> modules; modules.reserve(executable.executables().size()); @@ -1170,6 +1255,7 @@ PYBIND11_MODULE(xla_extension, m) { }); py::class_(m, "DebugOptions") + .def("__repr__", &DebugOptions::DebugString) .def_property("xla_cpu_enable_fast_math", &DebugOptions::xla_cpu_enable_fast_math, &DebugOptions::set_xla_cpu_enable_fast_math) @@ -1191,6 +1277,7 @@ PYBIND11_MODULE(xla_extension, m) { py::class_(m, "ExecutableBuildOptions") .def(py::init<>()) + .def("__repr__", &ExecutableBuildOptions::ToString) .def_property( "result_layout", [](const ExecutableBuildOptions& options) -> absl::optional { @@ -1205,7 +1292,20 @@ PYBIND11_MODULE(xla_extension, m) { &ExecutableBuildOptions::set_num_partitions) .def_property_readonly( "debug_options", &ExecutableBuildOptions::mutable_debug_options, - py::return_value_policy::reference, py::keep_alive<1, 0>()); + py::return_value_policy::reference, py::keep_alive<1, 0>()) + .def_property( + "device_assignment", + [](const ExecutableBuildOptions& options) + -> absl::optional { + return options.has_device_assignment() + ? 
absl::optional( + options.device_assignment()) + : absl::nullopt; + }, + &ExecutableBuildOptions::set_device_assignment) + .def_property("use_spmd_partitioning", + &ExecutableBuildOptions::use_spmd_partitioning, + &ExecutableBuildOptions::set_use_spmd_partitioning); py::class_(m, "XlaComputation") .def(py::init([](const py::bytes& serialized_hlo_module_proto) @@ -1214,12 +1314,13 @@ PYBIND11_MODULE(xla_extension, m) { proto.ParseFromString(serialized_hlo_module_proto); return absl::make_unique(proto); })) - .def("GetProgramShape", &XlaComputation::GetProgramShape) - .def("GetSerializedProto", &GetComputationSerializedProto) - .def("GetHloText", &GetComputationHloText) - .def("GetHloDotGraph", &GetComputationHloDotGraph) - .def("Hash", &HashComputation) - .def("get_hlo_module", &GetHloModule); + .def("get_hlo_module", &GetHloModule) + .def("program_shape", &XlaComputation::GetProgramShape) + .def("as_serialized_hlo_module_proto", &GetComputationSerializedProto) + .def("as_hlo_text", &GetComputationHloText) + .def("as_hlo_dot_graph", &GetComputationHloDotGraph) + .def("hash", &HashComputation) + .def("as_hlo_module", &GetHloModule); py::class_ hlo_print_options_class(m, "HloPrintOptions"); hlo_print_options_class.def(py::init<>()) @@ -1297,6 +1398,7 @@ PYBIND11_MODULE(xla_extension, m) { .def(py::init([](const std::string& name) -> std::unique_ptr { return absl::make_unique(UniquifyName(name)); })) + // TODO(phawkins): delete capitalized names after updating callers. .def( "Build", [](XlaBuilder& builder, absl::optional root) { @@ -1304,38 +1406,47 @@ PYBIND11_MODULE(xla_extension, m) { }, "Builds a computation from the contents of the builder.", py::arg("root") = absl::nullopt) - .def("ClearOpMetadata", &XlaBuilder::ClearOpMetadata) .def("GetShape", &XlaBuilder::GetShape) .def( - "GetProgramShape", + "build", + [](XlaBuilder& builder, absl::optional root) { + return root ? builder.Build(*root) : builder.Build(); + }, + "Builds a computation from the contents of the builder.", + py::arg("root") = absl::nullopt) + .def("clear_op_metadata", &XlaBuilder::ClearOpMetadata) + .def("get_shape", &XlaBuilder::GetShape) + .def( + "get_program_shape", [](const XlaBuilder& builder, absl::optional root) -> StatusOr { return root ? 
builder.GetProgramShape(*root) : builder.GetProgramShape(); }, py::arg("root") = absl::nullopt) - .def("IsConstant", &XlaBuilder::IsConstant) - .def("SetOpMetadata", &XlaBuilder::SetOpMetadata) - .def("SetSharding", &XlaBuilder::SetSharding) - .def("ClearSharding", &XlaBuilder::ClearSharding); + .def("is_constant", &XlaBuilder::IsConstant) + .def("set_op_metadata", &XlaBuilder::SetOpMetadata) + .def("set_sharding", &XlaBuilder::SetSharding) + .def("clear_sharding", &XlaBuilder::ClearSharding) + .def("setup_alias", + [](XlaBuilder& builder, const std::vector& output_index, + int64 param_number, const std::vector& param_index) { + builder.SetUpAlias( + ShapeIndex(output_index.begin(), output_index.end()), + param_number, + ShapeIndex(param_index.begin(), param_index.end())); + }); - m.def("BufferToDLPackManagedTensor", BufferToDLPackManagedTensor); - m.def("DLPackManagedTensorToBuffer", - [](const py::capsule& tensor, std::shared_ptr client) - -> StatusOr> { + m.def("buffer_to_dlpack_managed_tensor", BufferToDLPackManagedTensor); + m.def("dlpack_managed_tensor_to_buffer", + [](const py::capsule& tensor, std::shared_ptr client) + -> StatusOr> { TF_ASSIGN_OR_RETURN( - std::unique_ptr buffer, + std::unique_ptr buffer, DLPackManagedTensorToBuffer(tensor, client.get())); return WrapWithClient(std::move(client), std::move(buffer)); }); - py::enum_( - m, "TriangularSolveOptions_Transpose") - .value("TRANSPOSE_INVALID", TriangularSolveOptions::TRANSPOSE_INVALID) - .value("NO_TRANSPOSE", TriangularSolveOptions::NO_TRANSPOSE) - .value("TRANSPOSE", TriangularSolveOptions::TRANSPOSE) - .value("ADJOINT", TriangularSolveOptions::ADJOINT); - py::enum_(m, "PrecisionConfig_Precision") .value("DEFAULT", PrecisionConfig::DEFAULT) .value("HIGH", PrecisionConfig::HIGH) diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py index c036f3a59e6..76c3bc33a91 100644 --- a/tensorflow/compiler/xla/python/xla_client.py +++ b/tensorflow/compiler/xla/python/xla_client.py @@ -19,12 +19,11 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import abc import collections import enum # pylint: disable=g-bad-import-order import inspect -import itertools import os +from typing import List, Sequence, Tuple, Union from absl import logging import numpy as np @@ -35,130 +34,18 @@ import numpy as np # and TensorFlow may fail with duplicate protocol buffer message definitions. from tensorflow.compiler.xla.python import xla_extension as _xla -from tensorflow.compiler.xla.python.xla_extension import ops -# Most functions are snake_case for consistency with other modules, whereas -# method names of ComputationBuilder and Computation are CamelCase for -# consistency with XLA. +# Most functions are snake_case for consistency with other modules, some +# method names are CamelCase for consistency with XLA. # pylint: disable=invalid-name +# Pylint has false positives for type annotations. +# pylint: disable=invalid-sequence-index + +ops = _xla.ops profiler = _xla.profiler -class Backend(object, metaclass=abc.ABCMeta): - """Abstract base class for XLA backends.""" - - def __init__(self, platform): - """Creates a new Backend. - - Args: - platform: A string naming the platform; for example 'gpu'. 
- """ - self.platform = platform - - @abc.abstractmethod - def device_count(self): - """Returns the number of devices known to the backend.""" - - @abc.abstractmethod - def local_device_count(self): - """Returns the number of devices local to this host.""" - - @abc.abstractmethod - def devices(self): - """Returns a list of `device_count()` Device subclasses.""" - - @abc.abstractmethod - def host_id(self): - """Returns the integer ID of this host.""" - - @abc.abstractmethod - def buffer_from_pyval(self, pyval, device=None, force_copy=False): - """Allocates a fresh buffer and populates it with `pyval`.""" - - @abc.abstractmethod - def compile(self, computation, compile_options): - """Compiles a computation. Returns an executable.""" - - @abc.abstractmethod - def get_default_device_assignment(self, num_replicas, num_partitions): - """Returns the default device assignment that `compile` would use. - - If `compile_options.device_assignment` isn't set, `compile` will pick a - deterministic device assignment based on the number of replicas and - partitions, possibly optimizing for device locality. This method returns - that assignment, which is useful for e.g. manually replicating a value - before passing it to a compiled executable. - - Args: - num_replicas: the number of replicas needed. - num_partitions: the number of partitions needed. - - Returns: - A list of list of Devices of size `(num_replicas, num_partitions)`. - """ - - -class LocalBackend(Backend): - """XLA backend implemented using the in-process xla::LocalClient API.""" - - def __init__(self, platform, client): - """Creates a new LocalBackend. - - Args: - platform: A string; the user-visible platform name, e.g. 'gpu'. - client: An _xla.PyLocalClient object. - """ - super(LocalBackend, self).__init__(platform) - self.client = client - - def device_count(self): - return self.client.device_count() - - def local_device_count(self): - return self.client.local_device_count() - - def devices(self): - return self.client.devices() - - def local_devices(self): - return self.client.local_devices() - - def host_id(self): - return self.client.host_id() - - def buffer_from_pyval(self, pyval, device=None, force_copy=False): - if device is None: - device = self.local_devices()[0] - return _xla.PyLocalBuffer.from_python(pyval, self.client, device, - force_copy) - - def compile(self, c_computation, compile_options): - options = _xla.ExecutableBuildOptions() - options.num_replicas = compile_options.num_replicas - options.num_partitions = compile_options.num_partitions - if compile_options.result_layout: - options.result_layout = compile_options.result_layout - options.debug_options.xla_cpu_fast_math_honor_infs = True - options.debug_options.xla_cpu_fast_math_honor_nans = True - options.debug_options.xla_cpu_fast_math_honor_division = True - options.debug_options.xla_cpu_fast_math_honor_functions = True - options.debug_options.xla_gpu_enable_fast_min_max = False - return _xla.LocalExecutable.Compile(c_computation, - compile_options.argument_layouts, - options, self.client, - compile_options.device_assignment, - compile_options.tuple_arguments) - - def get_default_device_assignment(self, num_replicas, num_partitions=None): - if num_partitions is not None: - return self.client.GetDefaultDeviceAssignment(num_replicas, - num_partitions) - else: - # TODO(skye): delete this case after all callers can handle 2D output - return self.client.GetDefaultDeviceAssignment(num_replicas) - - xla_platform_names = { 'cpu': 'Host', 'gpu': 'CUDA', @@ -166,8 +53,7 @@ 
xla_platform_names = { def _cpu_backend_factory(): - client = _xla.get_cpu_client(asynchronous=True) - return LocalBackend(platform='cpu', client=client) + return _xla.get_cpu_client(asynchronous=True) def _gpu_backend_factory(distributed_client=None, node_id=0): @@ -190,12 +76,11 @@ def _gpu_backend_factory(distributed_client=None, node_id=0): config.memory_fraction = float(memory_fraction) config.preallocate = preallocate not in ('0', 'false', 'False') - client = _xla.get_nvidia_gpu_client( + return _xla.get_nvidia_gpu_client( asynchronous=True, allocator_config=config, distributed_client=distributed_client, node_id=node_id) - return LocalBackend(platform='gpu', client=client) # Backend factories, keyed by user-visible name, in increasing priority order. @@ -376,44 +261,6 @@ class ProgramShape(object): """ -class Buffer(object): - """Represents a handle to data owned by XLA. - - The referent is ready for use in executing a local, compiled - Computation. On XLA platforms involving a device (e.g. GPU), this - means the referent is in device memory. - """ - - @staticmethod - def from_pyval(pyval, device=None, backend=None, force_copy=False): - """Copies the `pyval` to a freshly allocated on-device buffer.""" - backend = backend or get_local_backend() - return backend.buffer_from_pyval(pyval, device, force_copy=force_copy) - - # Buffer is not an instantiable type and exists only for its static methods. - # The underlying buffer objects are C++ object with the following - # API: - # def shape(self) -> Shape: - # def device(self) -> int: - # def delete(self): - # def is_deleted(self) -> bool: - # def block_host_until_ready(self): - # """Blocks the calling thread until the buffer is ready on device.""" - # def copy_to_host_async(self): - # """Requests a copy of the buffer to the host. - # - # Does not block waiting for the copy. Values fetched are available via - # `to_py()`; the purpose of `copy_to_host_async` is to prefetch values - # for subsequent `to_py()` calls, especially when requesting many values - # at once. - # """ - # def to_py(self): - # """Returns the value of the buffer as a Python tuple tree of ndarrays.""" - # - # TODO(phawkins): remove Buffer and its static methods completely, have - # clients call methods on Backend to create buffers. - - def shape_from_pyval(pyval): """Returns a Shape that describes a tuple-tree of Numpy arrays.""" @@ -426,43 +273,6 @@ def shape_from_pyval(pyval): return convert(pyval) -def transfer_to_infeed(value, device=None): - """Transfers the given value into the XLA infeed queue. - - XLA's infeed queue is a single queue that feeds the "XLA virtual machine" with - a totally ordered stream of values. This is dequeued from XLA computations via - the Infeed() operation. - - Args: - value: the value that the caller would like to enqueue into the XLA infeed - queue - device: the device to infeed the value to. Each device has a distinct infeed - queue. - """ - # TODO(phawkins): support non-default backends. - backend = get_local_backend() - device = device or backend.local_devices()[0] - device.TransferToInfeed(value) - - -def transfer_from_outfeed(shape, device=None): - """Transfers a literal of the given shape from `device`'s outfeed. - - Args: - shape: The shape of the value to transfer from outfeed. - device: The device from which to transfer the outfeed value. Each device has - a distinct outfeed queue.. - - Returns: - The literal value that is produced from the outfeed queue. - """ - # TODO(phawkins): support non-default backends. 
- backend = get_local_backend() - device = device or backend.local_devices()[0] - return device.TransferFromOutfeed( - shape.with_major_to_minor_layout_if_absent()) - - DeviceAssignment = _xla.DeviceAssignment DeviceAssignment.__doc__ = """ A DeviceAssignment is a C++ object with the following signature. @@ -484,112 +294,19 @@ def computation_count(): """ Device = _xla.Device - - -class CompileOptions(object): - """Python object for XLA compile options. - - These options can be passed to the 'compile' step when using a local XLA - client. - """ - - def __init__(self): - self.xla_dump_to = None - self.dump_hlo_pass_re = None - self.dump_hlo_module_re = None - self.dump_hlo_as_text = None - self.dump_hlo_as_proto = None - self.hlo_profile = None - self.num_replicas = 1 - self.num_partitions = 1 - self.argument_layouts = None - self.result_layout = None - self.device_assignment = None - self.tuple_arguments = False - - -class Computation(object): - """Python wrapper for an XLA Computation. - - A Computation can be compiled to form an Executable, or used as a - subcomputation in ComputationBuilder methods. - """ - - def __init__(self, c_computation, backend=None): - self._c_computation = c_computation - # The backend argument is deprecated. Pass a backend to Compile() instead. - self._backend = backend - - @property - def computation(self): - return self._c_computation - - def GetSerializedProto(self): - """Gets the serialized HloModuleProto proto object in this computation. - - Returns: - A string containing a serialized HloModuleProto proto containing the - computation and its dependencies. - """ - return self.computation.GetSerializedProto() - - def GetHloText(self): - """Get the textual HLO representation of this computation. - - Returns: - A string containing the textual HLO. - """ - return self.computation.GetHloText() - - def GetHloDotGraph(self): - """Get a Graphviz Dot representation of this computation. - - Returns: - A string containing the graphviz dot graph. - """ - return self.computation.GetHloDotGraph() - - def Compile(self, argument_shapes=None, compile_options=None, backend=None): - """Compiles a computation. - - Computations are the result of a "ComputationBuild'ing" process. - - Arguments: - argument_shapes: Deprecated. Use compile_options.argument_layouts instead. - compile_options: options to use for compilation, includes an optional laid - out result shape for the computation. - backend: a `Backend` for which an executable should be generated. - - Returns: - A Executable instance. 
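With the module-level transfer helpers removed, infeed and outfeed go through the Device methods bound earlier in this change (transfer_to_infeed / transfer_from_outfeed). A sketch of the device-level calls on a CPU client, illustrative only; note that transfer_from_outfeed blocks until a running computation actually outfeeds a value:

import numpy as np
from tensorflow.compiler.xla.python import xla_client
from tensorflow.compiler.xla.python import xla_extension as _xla

backend = _xla.get_cpu_client(asynchronous=True)
device = backend.local_devices()[0]
# Enqueue a literal on this device's infeed queue.
device.transfer_to_infeed(np.array([1, 2, 3], dtype=np.int32))
# Dequeue from the outfeed; the binding now fills in a default layout when the
# requested shape has none, so no explicit layout call is needed. Commented out
# here because it would block without a computation feeding the outfeed:
# value = device.transfer_from_outfeed(xla_client.shape_from_pyval(np.int32(0)))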
- """ - backend = backend or self._backend or get_local_backend() - - compile_options = compile_options or CompileOptions() - if argument_shapes: - compile_options.argument_layouts = argument_shapes - return backend.compile(self.computation, compile_options) - - def GetProgramShape(self): - return self._c_computation.GetProgramShape() - - def GetReturnValueShape(self): - return self._c_computation.GetProgramShape().result_shape() - - def Hash(self): - return self._c_computation.Hash() +CompileOptions = _xla.CompileOptions # An Executable is a C++ class that duck types with the following API: # class Executable(object): # def local_devices(self) -> [Device]: -# def Execute(self, arguments : [Buffer]) -> Buffer: +# def execute(self, arguments : [Buffer]) -> Buffer: # """Execute on one replica with Buffer arguments and return value.""" # -# def SizeOfGeneratedCodeInBytes(self) -> int: +# def size_of_generated_code_in_bytes(self) -> int: # """Return generated binary size, or -1 if not known.""" # -# def ExecuteOnLocalDevices(self, arguments: [[Buffer]]) -> [Buffer]: +# def execute_on_local_devices(self, arguments: [[Buffer]]) -> [Buffer]: # """Execute on many replicas with Buffer arguments and return value. # # Args: @@ -605,21 +322,18 @@ class Computation(object): # There are different implementations of Executable for different backends. -def execute_with_python_values(executable, arguments=(), backend=None): +def execute_with_python_values(executable, arguments, backend): """Execute on one replica with Python values as arguments and output.""" - backend = backend or get_local_backend() - def put(arg): - return Buffer.from_pyval( - arg, device=executable.local_devices()[0], backend=backend) + return backend.buffer_from_pyval(arg, device=executable.local_devices()[0]) arguments = [put(arg) for arg in arguments] - outputs = executable.Execute(arguments) + outputs = executable.execute(arguments) return [x.to_py() for x in outputs] -def execute_with_python_values_replicated(executable, arguments, backend=None): +def execute_with_python_values_replicated(executable, arguments, backend): """Execute on many replicas with Python values as arguments and output. Arguments: @@ -631,7 +345,6 @@ def execute_with_python_values_replicated(executable, arguments, backend=None): Returns: A list of python values, one per replica. """ - backend = backend or get_local_backend() devices = executable.local_devices() # pylint: disable=g-complex-comprehension flat_args = [(arg, devices[replica]) @@ -646,7 +359,7 @@ def execute_with_python_values_replicated(executable, arguments, backend=None): flat_arg_buffers = flat_arg_buffers[len(replica_args):] return [[x.to_py() for x in xs] - for xs in executable.ExecuteOnLocalDevices(arg_buffers)] + for xs in executable.execute_on_local_devices(arg_buffers)] class PaddingType(enum.Enum): @@ -654,8 +367,8 @@ class PaddingType(enum.Enum): SAME = 2 -def _convert_padding_type_to_pad_values(padding_type, lhs_dims, rhs_dims, - window_strides): +def window_padding_type_to_pad_values(padding_type, lhs_dims, rhs_dims, + window_strides): """Maps PaddingType or string to pad values (list of pairs of ints).""" if not isinstance(padding_type, (str, PaddingType)): msg = 'padding_type must be str or PaddingType, got {}.' @@ -685,1094 +398,10 @@ def _convert_padding_type_to_pad_values(padding_type, lhs_dims, rhs_dims, raise ValueError(msg.format(padding_type)) -class ComputationBuilder(object): - """XLA computation builder. 
- - Enqueues XLA ops in sequence and in order to build a - Computation, which in turn can be compiled into a - LocalExecutable, which in turn can be locally executed. - """ - - # The methods of this class map 1-to-1 onto the XLA C++ - # computation builder API. Therefore, there's no need to laboriously list - # arguments and return values for every method, especially where it's obvious. - # - # pylint: disable=g-doc-return-or-yield - # pylint: disable=g-doc-args - - def __init__(self, name): - self._builder = _xla.XlaBuilder(name) - self._parameter_numbering = itertools.count() - - def Build(self, root=None, backend=None): - """Builds a `Computation` from the contents of the builder. - - Args: - root: if not None, the operator containing the return value of the - computation. - - Returns: - A `Computation`. - """ - if root is not None: - return Computation(self._builder.Build(root), backend=backend) - else: - return Computation(self._builder.Build(), backend=backend) - - def GetShape(self, operand): - return self._builder.GetShape(operand) - - def SetOpMetadata(self, op_metadata): - """Set metadata for operations that are about to be enqueued.""" - self._builder.SetOpMetadata(op_metadata) - - def ClearOpMetadata(self): - """Clear metadata for operations that are about to be enqueued.""" - self._builder.ClearOpMetadata() - - def SetSharding(self, sharding): - """Set sharding that will be attached to all instructions until cleared.""" - self._builder.SetSharding(sharding) - - def ClearSharding(self): - """Clears the sharding. - - Ops will be sharded according to the default placement policy. - """ - self._builder.ClearSharding() - - def CreateToken(self): - """Enqueues a CreateToken op onto the computation. - - Returns: - An XlaOp, representing a fresh token. - """ - return ops.CreateToken(self._builder) - - def AfterAll(self, tokens): - """Enqueues a after-all op onto the computation. - - `AfterAll` takes a variadic number of tokens and produces a single token. - - Args: - tokens: a list of `XlaOp` values representing predecessor tokens. - - Returns: - An `XlaOp`. - """ - return ops.AfterAll(self._builder, tokens) - - def Infeed(self, shape, token=None): - """Enqueues an infeed op onto the computation. - - Infeed operations dequeue data of the given shape from the device's infeed - queue for subsequent use in the computation. - - Args: - shape: a `Shape` describing the shape of the infed value. - token: an optional `XlaOp` representing a token after which the infeed - effect should be sequenced. - - Returns: - An XlaOp, representing a (value, token) pair. - """ - if token is None: - token = ops.CreateToken(self._builder) - return ops.InfeedWithToken(token, - shape.with_major_to_minor_layout_if_absent()) - - def Outfeed(self, operand, token=None): - """Enqueues an outfeed op onto the computation. - - Outfeed operations enqueue data, using the given operand, onto the XLA - outfeed queue for subsequent dequeue via the client API. - - Args: - operand: an `XlaOp` representing the data to outfeed. - token: an `XlaOp` representing a token after which the outfeed should be - sequenced. - - Returns: - An `XlaOp` representing a token. - """ - if token is None: - token = ops.CreateToken(self._builder) - return ops.OutfeedWithToken(operand, token, self._builder.GetShape(operand), - '') - - def Constant(self, value): - """Enqueues a constant op onto the computation. - - Args: - value: value for the constant, as a np.array with an explicit dtype set to - one of the supported types. 
- - Returns: - An XlaOp. - """ - return ops.ConstantLiteral(self._builder, value) - - def ConstantF32Scalar(self, value): - """Convenience method to enqueue a scalar F32 constant op. - - Args: - value: a floating-point number. - - Returns: - An XlaOp. - """ - return self.Constant(np.array(value, dtype=np.float32)) - - def ConstantF64Scalar(self, value): - """Convenience method to enqueue a scalar F32 constant op. - - Args: - value: a floating-point number. - - Returns: - An XlaOp. - """ - return self.Constant(np.array(value, dtype=np.float64)) - - def ConstantS32Scalar(self, value): - """Convenience method to enqueue a scalar S32 constant op. - - Args: - value: a floating-point number. - - Returns: - An XlaOp. - """ - return self.Constant(np.array(value, dtype=np.int32)) - - def ConstantS64Scalar(self, value): - """Convenience method to enqueue a scalar S64 constant op. - - Args: - value: a floating-point number. - - Returns: - An XlaOp. - """ - return self.Constant(np.array(value, dtype=np.int64)) - - def ConstantPredScalar(self, value): - """Convenience method to enqueue a scalar PRED constant op. - - Args: - value: a boolean value. - - Returns: - An XlaOp. - """ - return self.Constant(np.array(value, dtype=np.bool)) - - def ParameterWithShape(self, - shape, - name=None, - parameter_num=None, - replicated=None): - """Enqueues a Parameter op onto the computation, given a shape. - - Args: - shape: the parameter's shape as a Shape object. - name: optional string name for the parameter. - parameter_num: parameter number in the computation function. If None, the - next linear parameter number is used. The default value capability can - be used for auto-numbering. If you're using auto-numbering for some - parameters, use it for *all* parameters to avoid clashes. - replicated: whether to mark the parameter's leaves as replicated. May be a - bool, in which case it applies to all leaves, or an iterable of bools. - The default is None, which means no replication annotation. - - Returns: - An XlaOp. - """ - if name is None: - name = '' - if parameter_num is None: - parameter_num = next(self._parameter_numbering) - if replicated is None: - replicated = [] - elif isinstance(replicated, bool): - replicated = [replicated] * shape.leaf_count() - - return ops.Parameter(self._builder, parameter_num, - shape.with_major_to_minor_layout_if_absent(), - name.encode('utf8'), replicated) - - def ParameterFromNumpy(self, value, name=None, parameter_num=None): - """Enqueues a Parameter op onto the computation. - - Args: - value: a Numpy array, or a nested tuple thereof, from which the shape is - inferred. - name: as in ParameterWithShape. - parameter_num: as in ParameterWithShape. - - Returns: - An XlaOp. - """ - return self.ParameterWithShape( - shape_from_pyval(value), name=name, parameter_num=parameter_num) - - def Iota(self, dtype, size): - """Enqueues an iota constant onto the computation. - - Args: - dtype: expected numpy dtype of the output. - size: integer, the number of elements in the array. - - Returns: - An XlaOp representing the added iota constant. - """ - element_type = DTYPE_TO_XLA_ELEMENT_TYPE[str(np.dtype(dtype))] - return ops.Iota(self._builder, element_type, size) - - def BroadcastedIota(self, dtype, shape, dimension): - """Enqueues a broadcasted iota constant onto the computation. - - Args: - dtype: expected numpy dtype of the output. - shape: tuple of integers, the expected output shape (dimensions). - dimension: positive integer, dimension along which to increment values. 
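Illustrative sketch of what callers of the removed Constant*Scalar and Parameter* helpers use instead. The `ops.Parameter` argument list below simply mirrors the call that ParameterWithShape made (builder, parameter number, shape with default layout, UTF-8 encoded name, per-leaf replication list); the builder name and parameter name are arbitrary examples.

import numpy as np
from tensorflow.compiler.xla.python import xla_client

ops = xla_client.ops

c = xla_client.XlaBuilder("params_and_constants")
# ConstantF32Scalar(2.0) and friends were sugar for ConstantLiteral with an
# explicit NumPy dtype.
two = ops.ConstantLiteral(c, np.array(2.0, dtype=np.float32))
# ParameterFromNumpy/ParameterWithShape wrapped ops.Parameter.
shape = xla_client.shape_from_pyval(np.zeros((4,), np.float32))
x = ops.Parameter(c, 0, shape.with_major_to_minor_layout_if_absent(),
                  'x'.encode('utf8'), [])
ops.Mul(two, x)  # scalar * vector, as in the old ParametersTest cases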
- - Returns: - An XlaOp representing the added broadcasted iota constant. - """ - element_type = DTYPE_TO_XLA_ELEMENT_TYPE[str(np.dtype(dtype))] - xla_shape = _xla.Shape.array_shape(element_type, shape, None) - return ops.Iota(self._builder, xla_shape, dimension) - - def Concatenate(self, operands, dimension): - """Enqueues a concatenate operation onto the computation. - - Args: - operands: the operands to concatenate. - dimension: the dimension in which to perform the concatenation. - - Returns: - An XlaOp representing the added concatenate op. - """ - return ops.ConcatInDim(self._builder, list(operands), dimension) - - def ReplicaId(self): - """Enqueues a ReplicaId operation onto the computation. - - Returns: - A LocalOp representing the replica id. - """ - return _xla.ops.ReplicaId(self._builder) - - def Pad(self, operand, padding_value, padding_config): - """Enqueues a Pad operation onto the computation. - - Args: - operand: XlaOp representing the array to pad. - padding_value: XlaOp representing the scalar pad value. - padding_config: either a PaddingConfig or a list of integer triples - (edge_padding_low, edge_padding_high, interior_padding) representing the - configuration of the padding operation. - - Returns: - An XlaOp representing the added Pad op. - """ - if isinstance(padding_config, tuple) or isinstance(padding_config, list): - padding_config = GetPaddingConfigFromTriples(padding_config) - return ops.Pad(operand, padding_value, padding_config) - - def Reshape(self, operand, dimensions, new_sizes): - """Enqueues a reshape op onto the computation. - - Args: - operand: XlaOp representing the array to be reshaped. - dimensions: sequence of integers encoding the order in which dimensions - are collapsed or None, in which case dimensions are flattened in order. - new_sizes: sequence of integers encoding the new dimension sizes (shape). - - Returns: - An XlaOp representing the added Reshape op. - """ - if dimensions is None: - ndim = len(self.GetShape(operand).dimensions()) - dimensions = tuple(range(ndim)) - return ops.Reshape(operand, dimensions, new_sizes) - - def AllReduce(self, operand, computation, replica_groups=None): - """AllReduce op. - - Args: - operand: XlaOp representing the input array - computation: a Computation object - binary reduction function. - replica_groups: optional, list of lists of ints encoding a partition of - the set {0, 1, ..., num_replicas} into equally-sized replica groups - within which the all-to-all is performed. If not supplied or None (the - default), all replicas belong to the same group. - - Returns: - An XlaOp that represents the all-reduced result. - """ - replica_groups_protos = _get_replica_groups_protos(replica_groups) - return ops.AllReduce(operand, computation.computation, - replica_groups_protos, None, None) - - def AllToAll(self, - operand, - split_dimension, - concat_dimension, - replica_groups=None): - """AllToAll op. - - Args: - operand: XlaOp representing the input array - split_dimension: the dimension along which the operand is split - concat_dimension: the dimension along which the split blocks are - concatenated - replica_groups: optional, list of lists of ints encoding a partition of - the set {0, 1, ..., num_replicas} into equally-sized replica groups - within which the all-to-all is performed. If not supplied or None (the - default), all replicas belong to the same group. - - Returns: - An XlaOp that represents the all-to-all concatenation. 
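The collective wrappers removed in this hunk (AllReduce, AllToAll, CrossReplicaSum) bottom out in the corresponding `xla_client.ops` calls, with replica groups converted by `make_replica_groups`, the public rename of `_get_replica_groups_protos` further down in this diff. A minimal hedged sketch (builder name and group layout are arbitrary):

import numpy as np
from tensorflow.compiler.xla.python import xla_client

ops = xla_client.ops

c = xla_client.XlaBuilder("cross_replica_sum")
operand = ops.ConstantLiteral(c, np.arange(4, dtype=np.float32))
# Two replica groups of two replicas each; passing None means all replicas
# belong to a single group, as in the removed wrapper's default.
groups = xla_client.make_replica_groups([[0, 1], [2, 3]])
ops.CrossReplicaSum(operand, groups)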
- """ - replica_groups_protos = _get_replica_groups_protos(replica_groups) - if not replica_groups: - split_count = 1 - else: - split_count = len(replica_groups[0]) - if not all(split_count == len(g) for g in replica_groups): - raise ValueError('Replica groups must be equally sized') - return ops.AllToAll(operand, split_dimension, concat_dimension, split_count, - replica_groups_protos) - - def CrossReplicaSum(self, operand, replica_groups=None): - """CrossReplicaSum op. - - Args: - operand: the operand to sum across replica instances. - replica_groups: optional, list of lists of ints encoding a partition of - the set {0, 1, ..., num_replicas} into equally-sized replica groups - within which the cross-replica sum is performed. If not supplied or None - (the default), all replicas belong to the same group. - - Returns: - An XlaOp that represents on each replica the sum of its group's values. - """ - replica_groups_protos = _get_replica_groups_protos(replica_groups) - return ops.CrossReplicaSum(operand, replica_groups_protos) - - def Trans(self, operand): - """Specialized matrix transpose op.""" - return ops.Transpose(operand, [1, 0]) - - def Transpose(self, operand, permutation): - """Transpose op.""" - return ops.Transpose(operand, permutation) - - def SelectAndScatter(self, operand, select, window_dimensions, window_strides, - padding, source, init_value, scatter): - """Select and scatter op, used by the gradient of ReduceWindow. - - Args: - operand: XlaOp for array of dimension N and type T over which the windows - slide. - select: Computation of type (T, T) -> Pred to apply to the elements of - each window to indicate which element is selected. - window_dimensions: sequence of N integers for dimensions of the window. - window_strides: sequence of N integers for the strides of the window. - padding: PaddingType representing either 'SAME' or 'VALID ' padding. - source: XlaOp for array of type T with values to scatter. - init_value: XlaOp of scalar type T for initial out value. - scatter: Computation of type (T, T) -> T to apply to each scatter source - element with its destination element. - - Returns: - An XlaOp representing the added SelectAndScatter op. - """ - pads = _convert_padding_type_to_pad_values( - padding, - self.GetShape(operand).dimensions(), window_dimensions, window_strides) - return ops.SelectAndScatterWithGeneralPadding(operand, select.computation, - window_dimensions, - window_strides, pads, source, - init_value, - scatter.computation) - - def Slice(self, operand, start_indices, limit_indices, strides=None): - """Enqueues a slice operation onto the computation. - - Args: - operand: XlaOp for the N dimensional array to be sliced. - start_indices: iterable of N integers containing the starting indices of - the slice for each dimension. - limit_indices: iterable of N integers containing the ending indices - (exclusive) of the slice for each dimension. - strides: optional iterable of N integers containing the stride sizes for - each dimension. - - Returns: - An XlaOp representing the added Slice op. - """ - if strides is None: - start_indices = list(start_indices) - strides = [1] * len(start_indices) - return ops.Slice(operand, start_indices, limit_indices, strides) - - def DynamicSlice(self, operand, start_indices, slice_sizes): - """Enqueues a slice op with dynamic start indices onto the computation. - - Args: - operand: XlaOp for the N dimensional array to be sliced. - start_indices: XlaOp for the 1D array of N integers containing the - starting indices of the slice. 
- slice_sizes: iterable of N integers containing the slice sizes in each - dimension. - - Returns: - An XlaOp representing the added DynamicSlice op. - """ - slice_sizes = list(slice_sizes) - if isinstance(start_indices, _xla.XlaOp): - start_indices = [ - ops.Reshape(ops.Slice(start_indices, [i], [i + 1], [1]), []) - for i in range(len(slice_sizes)) - ] - return ops.DynamicSlice(operand, list(start_indices), slice_sizes) - - def DynamicUpdateSlice(self, operand, update, start_indices): - """Enqueues a dynamic update slice operation onto the computation. - - Args: - operand: XlaOp for the N dimensional array to be updated. - update: N dimensional array comprising the slice update. - start_indices: Rank-1 array of N integers comprising the starting indices - of the slice along each dimension. - - Returns: - An XlaOp representing the added DynamicUpdateSlice op. - """ - if isinstance(start_indices, _xla.XlaOp): - ndims = self._builder.GetShape(start_indices).dimensions()[0] - start_indices = [ - ops.Reshape(ops.Slice(start_indices, [i], [i + 1], [1]), []) - for i in range(ndims) - ] - return ops.DynamicUpdateSlice(operand, update, list(start_indices)) - - def Tuple(self, *elems): - """Enqueues a tuple operation onto the computation. - - Args: - elems: a sequence of tuple operands (each a XlaOp). - - Returns: - An XlaOp representing the added Tuple op. - """ - return ops.Tuple(self._builder, list(elems)) - - def Call(self, computation_to_apply, operands): - """Enqueues a call operation onto the computation. - - Args: - computation_to_apply: a Computation object. - operands: an iterable of XlaOp. The number and types of operands must - match the arity of computation_to_apply. - - Returns: - An XlaOp representing the added call op. - """ - return ops.Call(self._builder, computation_to_apply.computation, - list(operands)) - - # TODO(skyewm): remove CustomCallWithLayout after callers are updated to use - # CustomCall. - def CustomCallWithLayout(self, - call_target_name, - operands, - shape_with_layout, - operand_shapes_with_layout, - opaque=None): - """Enqueues a custom call operation onto the computation. - - Args: - call_target_name: the name of the function to call. - operands: an iterable of XlaOp. The number and types of operands must - match the arity of `operand_shapes_with_layout`. - shape_with_layout: the shape of the operator's output, with layout. - operand_shapes_with_layout: the shapes of `operands`, including the - expected layouts. - opaque: an opaque string passed to the backend. - - Returns: - An XlaOp representing the added custom call op. - """ - opaque = opaque or b'' - return ops.CustomCallWithLayout( - self._builder, call_target_name, list(operands), shape_with_layout, - list(operand_shapes_with_layout), opaque) - - def CustomCall(self, call_target_name, operands, shape, - operand_shapes_with_layout=None, opaque=None): - """Enqueues a custom call operation onto the computation. - - Args: - call_target_name: the name of the function to call. - operands: an iterable of XlaOp. The number and types of operands must - match the arity of `operand_shapes_with_layout`. - shape: the shape of the operator's output. Must have layout if - `operand_shapes_with_layout` is provided. - operand_shapes_with_layout: optional, the shapes of `operands` including - the expected layouts. - opaque: an opaque string passed to the backend. - - Returns: - An XlaOp representing the added custom call op. 
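Annotation: the CustomCall wrappers here bottom out in `ops.CustomCallWithLayout`, which callers now invoke directly; registration still goes through `xla_client.register_custom_call_target` (rebound to `_xla.register_custom_call_target` later in this diff). A sketch reusing the `test_subtract_f32` target from the existing custom-call test; the positional arguments mirror the removed wrapper: builder, target name, operands, result shape with layout, operand shapes with layout, opaque bytes.

import numpy as np
from tensorflow.compiler.xla.python import xla_client

ops = xla_client.ops

c = xla_client.XlaBuilder("custom_call")
f32_scalar = xla_client.Shape.array_shape(np.dtype(np.float32), (), ())
ops.CustomCallWithLayout(
    c, b"test_subtract_f32",
    [ops.ConstantLiteral(c, np.float32(1.25)),
     ops.ConstantLiteral(c, np.float32(0.5))],
    f32_scalar, [f32_scalar, f32_scalar], b"")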
- """ - opaque = opaque or b'' - if operand_shapes_with_layout is None: - return ops.CustomCall(self._builder, call_target_name, list(operands), - shape, opaque) - else: - return ops.CustomCallWithLayout( - self._builder, call_target_name, list(operands), shape, - list(operand_shapes_with_layout), opaque) - - def Map(self, operands, computation_to_apply, dimensions): - """Enqueues a map operation onto the computation. - - Args: - operands: an iterable of XlaOp. - computation_to_apply: a Computation object. - dimensions: dimensions over which to apply map the function. - - Returns: - An XlaOp representing the added Map op. - """ - return ops.Map(self._builder, list(operands), - computation_to_apply.computation, dimensions, []) - - def Reduce(self, operand, init_value, computation_to_apply, dimensions): - """Enqueues a reduction operation onto the computation. - - Args: - operand: reduction operand (XlaOp). - init_value: reduction initial value (XlaOp). - computation_to_apply: a Computation object - binary reduction function. - dimensions: sequence of dimensions (integers) to reduce on. - - Returns: - An XlaOp representing the added Reduce op. - """ - return ops.Reduce(self._builder, [operand], [init_value], - computation_to_apply.computation, dimensions) - - def ReduceWindow(self, operand, init_value, computation_to_apply, - window_dimensions, window_strides, padding): - """Enqueues a windowed reduction operation onto the computation. - - Args: - operand: reduction operand (XlaOp). - init_value: reduction initial value (XlaOp). - computation_to_apply: a binary reduction function (Computation). - window_dimensions: dimensions of window (sequence of integers). - window_strides: strides for window (sequence of integers). - padding: PaddingType representing either 'SAME' or 'VALID' padding. - - Returns: - An XlaOp representing the added ReduceWindow op. - """ - pads = _convert_padding_type_to_pad_values( - padding, - self.GetShape(operand).dimensions(), window_dimensions, window_strides) - return ops.ReduceWindowWithGeneralPadding(operand, init_value, - computation_to_apply.computation, - window_dimensions, window_strides, - (), (), pads) - - def ReduceWindowWithGeneralPadding(self, operand, init_value, - computation_to_apply, window_dimensions, - window_strides, base_dilations, - window_dilations, padding): - """Enqueues a windowed reduction operation onto the computation. - - Args: - operand: reduction operand (XlaOp). - init_value: reduction initial value (XlaOp). - computation_to_apply: a binary reduction function (Computation). - window_dimensions: dimensions of window (sequence of integers). - window_strides: strides for window (sequence of integers). - base_dilations: dilations for the base (sequence of integers). - window_dilations: dilations for window (sequence of integers). - padding: length-N array-like of pairs of integers of (low, high) padding. - - Returns: - An XlaOp representing the added ReduceWindow op. - """ - return ops.ReduceWindowWithGeneralPadding(operand, init_value, - computation_to_apply.computation, - window_dimensions, window_strides, - base_dilations, window_dilations, - padding) - - def RngNormal(self, mu, sigma, dims): - """Enqueues an RngNormal operation onto the computation. - - Args: - mu: An XlaOp to an F32 scalar specifying the mean. - sigma: An XlaOp to an F32 scalar specifying the standard deviation. - dims: A 1D array-like of nonnegative integers specifying the dimensions. - Returns: a XlaOp to the generated array of F32 values. 
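Note on the Reduce/ReduceWindow wrappers above: the underlying ops take the reduction computation itself, so with the Computation wrapper gone the XlaComputation returned by `build()` is passed directly (there is no `.computation` attribute to unwrap). A minimal sketch, assuming the same `ops.Parameter` argument order as the removed ParameterWithShape wrapper; names such as `scalar_add_computation` are illustrative only.

import numpy as np
from tensorflow.compiler.xla.python import xla_client

ops = xla_client.ops


def scalar_add_computation():
  b = xla_client.XlaBuilder("add_f32")
  shape = xla_client.shape_from_pyval(np.float32(0.))
  ops.Add(ops.Parameter(b, 0, shape.with_major_to_minor_layout_if_absent(),
                        b'lhs', []),
          ops.Parameter(b, 1, shape.with_major_to_minor_layout_if_absent(),
                        b'rhs', []))
  return b.build()  # an XlaComputation, passed to ops.Reduce as-is


c = xla_client.XlaBuilder("reduce_example")
operand = ops.ConstantLiteral(c, np.arange(6, dtype=np.float32).reshape(2, 3))
init = ops.ConstantLiteral(c, np.float32(0.))
# Reduce over dimension 1, exactly as the removed Reduce() wrapper forwarded it.
ops.Reduce(c, [operand], [init], scalar_add_computation(), [1])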
- """ - shape = _xla.Shape.array_shape(self.GetShape(mu).xla_element_type(), dims) - return ops.RngNormal(mu, sigma, shape) - - def RngUniform(self, a, b, dims): - """Enqueues an RngUniform operation onto the computation. - - Args: - a: a XlaOp to an F32, S32, or U32 scalar (consistent with the type of b) - specifying the low end of the interval [a, b) over which values are - generated. - b: a XlaOp to an F32, S32, or U32 scalar (consistent with the type of a) - specifying the high end of the interval [a, b) over which values are - generated. - dims: A 1D array-like of nonnegative integers specifying the dimensions. - Returns: a XlaOp to the generated array of values with the same numeric type - (F32, S32, or U32) as the arguments a and b. - """ - shape = _xla.Shape.array_shape(self.GetShape(a).xla_element_type(), dims) - return ops.RngUniform(a, b, shape) - - def While(self, cond, body, init): - """Enqueues a While operation onto the computation. - - Args: - cond: a Computation for the loop condition, which has type T -> PRED - body: a Computation for the loop body, which has type T -> T - init: a XlaOp for the initial parameter, which has type T - Returns: a XlaOp representing the While operation. - """ - return ops.While(cond.computation, body.computation, init) - - def Conditional(self, pred, true_operand, true_computation, false_operand, - false_computation): - """Enqueues a Conditional operation onto the computation. - - Args: - predicate: a XlaOp to test, which has scalar type PRED - true_operand: a XlaOp of type T_0 - true_computation: a Computation to apply to true_operand, type T_0 -> S - false_operand: a ComputationDatahandle of type T_1 - false_computation: a Computation to apply to false_operand, type T_1 -> S - Returns: a XlaOp representing the Conditional operation. - """ - return ops.Conditional(pred, true_operand, true_computation.computation, - false_operand, false_computation.computation) - - def IsConstant(self, operand): - """Checks whether the given operand is a compile-time constant. - - Args: - operand: a ComputationDataHandle to test. - Returns: bool indicating whether `operand` is a compile-time constant, - meaning its value does not depend on any parametersor, or on stateful - operators such as `RngNormal` or `Infeed`. - """ - return self._builder.IsConstant(operand) - - def BuildConstantSubGraph(self, operand): - """Builds a constant sub graph. - - Args: - operand: a XlaOp to test. - Returns: a Computation that is rooted on the given `operand` which is a - compile-time constant. - """ - return ops.BuildConstantSubGraph(operand) - - def DotGeneral(self, lhs, rhs, dimension_numbers, precision_config=None): - """Enqueues a general dot operation onto the computation. - - Args: - lhs: XlaOp for the left-hand-side array. - rhs: XlaOp for the right-hand-side array. - dimension_numbers: either a DotDimensionNumbers or a nested tuple - ((lhs_contract, rhs_contract), (lhs_batch, rhs_batch)) of lists of - integers representing the dimensions to treat as contracting dimensions - and batch dimensions on each input operand. - Returns: a XlaOp representing the DotGeneral operation. - """ - if isinstance(dimension_numbers, tuple): - dimension_numbers = GetDotDimensionsFromLists(dimension_numbers) - return ops.DotGeneral( - lhs, rhs, dimension_numbers, precision_config=precision_config) - - def Conv(self, - lhs, - rhs, - window_strides, - padding, - feature_group_count=1, - batch_group_count=1, - precision_config=None): - """Enqueues a Conv operation onto the computation. 
- - Args: - lhs: XlaOp for the rank N+2 array of inputs. - rhs: XlaOp for the rank N+2 array of kernel weights. - window_strides: length-N array-like of integer kernel strides. - padding: PaddingType representing either 'SAME' or 'VALID' padding. - feature_group_count: number of feature groups for grouped convolution. - batch_group_count: number of batch groups for grouped convolution. - Returns: a XlaOp representing the Conv operation. - """ - pads = _convert_padding_type_to_pad_values( - padding, - self.GetShape(lhs).dimensions()[2:], - self.GetShape(rhs).dimensions()[2:], window_strides) - return self.ConvGeneralDilated( - lhs, - rhs, - window_strides, - pads, [], [], - dimension_numbers=None, - feature_group_count=feature_group_count, - batch_group_count=batch_group_count, - precision_config=precision_config) - - def ConvWithGeneralPadding(self, - lhs, - rhs, - window_strides, - padding, - lhs_dilation, - rhs_dilation, - feature_group_count=1, - batch_group_count=1, - precision_config=None): - """Enqueues a ConvWithGeneralPadding operation onto the computation. - - Args: - lhs: XlaOp for the rank N+2 array of inputs. - rhs: XlaOp for the rank N+2 array of kernel weights. - window_strides: length-N array-like of kernel strides. - padding: length-N array-like of pairs of integers of (low, high) padding. - lhs_dilation: length-N array-like of dilation factors. - rhs_dilation: length-N array-like of dilation factors. - feature_group_count: number of feature groups for grouped convolution. - batch_group_count: number of batch groups for grouped convolution. - - Returns: - A ComputationdataHandle representing the added ConvWithGeneralPadding op. - """ - return self.ConvGeneralDilated( - lhs, - rhs, - list(window_strides), - list(padding), - list(lhs_dilation), - list(rhs_dilation), - dimension_numbers=None, - feature_group_count=feature_group_count, - batch_group_count=batch_group_count, - precision_config=precision_config) - - def _GetConvDimensionNumbers(self, num_spatial_dims): - """Create ConvolutionDimensionNumbers proto for convolutions.""" - nd = num_spatial_dims - dimension_numbers = ConvolutionDimensionNumbers() - dimension_numbers.input_batch_dimension = 0 - dimension_numbers.input_feature_dimension = 1 - dimension_numbers.output_batch_dimension = 0 - dimension_numbers.output_feature_dimension = 1 - dimension_numbers.kernel_output_feature_dimension = 0 - dimension_numbers.kernel_input_feature_dimension = 1 - dimension_numbers.input_spatial_dimensions.extend(range(2, 2 + nd)) - dimension_numbers.kernel_spatial_dimensions.extend(range(2, 2 + nd)) - dimension_numbers.output_spatial_dimensions.extend(range(2, 2 + nd)) - return dimension_numbers - - def ConvGeneralDilated(self, - lhs, - rhs, - window_strides, - padding, - lhs_dilation, - rhs_dilation, - dimension_numbers=None, - feature_group_count=1, - batch_group_count=1, - precision_config=None): - """Enqueues a ConvGeneralDilated operation onto the computation. - - Args: - lhs: XlaOp for the rank N+2 array of inputs. - rhs: XlaOp for the rank N+2 array of kernel weights. - window_strides: length-N array-like of integer kernel strides. - padding: length-N array-like of pairs of integers of (low, high) padding. - lhs_dilation: length-N array-like of integer dilation factors. - rhs_dilation: length-N array-like of integer dilation factors. - dimension_numbers: optional, either a ConvolutionDimensionNumbers object - or a tuple (lhs_spec, rhs_spec, out_spec). 
Each element is a string of - length N+2 identifying by position: (1) batch dimensions in lhs, rhs, - and the output with the character 'N', (2) feature dimensions in lhs - and the output with the character 'C', (3) input and output feature - dimensions in rhs with the characters 'I' and 'O' respectively, and - (4) spatial dimension correspondences between lhs, rhs, and the output - using any distinct characters. For example, to indicate dimension - numbers consistent with the Conv operation with two spatial - dimensions, one could use ('NCHW', 'OIHW', 'NCHW'). As another - example, to indicate dimension numbers consistent with the TensorFlow - Conv2D operation, one could use ('NHWC', 'HWIO', 'NHWC'). When using - the latter form of convolution dimension specification, window strides - are associated with spatial dimension character labels according to - the order in which the labels appear in the rhs_spec string, so that - window_strides[0] is matched with the dimension corresponding to the - first character appearing in rhs_spec that is not 'I' or 'O'. By - default, use the same dimension numbering as Conv and - ConvWithGeneralPadding. - feature_group_count: number of feature groups for grouped convolution. - batch_group_count: number of batch groups for grouped convolution. - Returns: a XlaOp representing the ConvGeneralDilated operation. - """ - if dimension_numbers is None: - dimension_numbers = self._GetConvDimensionNumbers(len(window_strides)) - elif isinstance(dimension_numbers, tuple): - lhs_spec, rhs_spec, out_spec = dimension_numbers - dimension_numbers = ConvolutionDimensionNumbers() - - dimension_numbers.input_batch_dimension = lhs_spec.index('N') - dimension_numbers.input_feature_dimension = lhs_spec.index('C') - dimension_numbers.output_batch_dimension = out_spec.index('N') - dimension_numbers.output_feature_dimension = out_spec.index('C') - dimension_numbers.kernel_output_feature_dimension = rhs_spec.index('O') - dimension_numbers.kernel_input_feature_dimension = rhs_spec.index('I') - - dimension_numbers.kernel_spatial_dimensions.extend( - i for i, c in enumerate(rhs_spec) if c not in {'I', 'O'}) - dimension_numbers.input_spatial_dimensions.extend( - sorted((i for i, c in enumerate(lhs_spec) if c not in {'N', 'C'}), - key=lambda i: rhs_spec.index(lhs_spec[i]))) - dimension_numbers.output_spatial_dimensions.extend( - sorted((i for i, c in enumerate(out_spec) if c not in {'N', 'C'}), - key=lambda i: rhs_spec.index(out_spec[i]))) - return ops.ConvGeneralDilated( - lhs, - rhs, - window_strides, - padding, - lhs_dilation, - rhs_dilation, - dimension_numbers, - feature_group_count, - batch_group_count, - precision_config=precision_config) - - def Sort(self, operands, dimension=-1, comparator=None): - """Enqueues a sort operation onto the computation. - - Args: - operands: either an XlaOp or a sequence of XlaOps to sort. All operands - must be arrays with the same dimensions. - dimension: the array dimension over which to sort. - comparator: a comparator XlaComputation. See the XLA operation semantics - for details. - - Returns: - Either an XlaOp or a tuple of XlaOps (if `operands` was an XlaOp or - a tuple of XlaOps, respectively.) - """ - operands = ( - list(operands) - if isinstance(operands, collections.abc.Sequence) else [operands]) - return ops.Sort(self._builder, operands, dimension, - comparator.computation if comparator else None) - - def SortKeyVal(self, keys, values, dimension=-1): - """Enqueues a key-value sort operation onto the computation. - - Deprecated. 
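Annotation: the string-spec handling performed by the Conv* wrappers above is preserved as `make_convolution_dimension_numbers` further down in this diff, so a TensorFlow-style NHWC/HWIO convolution can be written as the following sketch. The argument order mirrors the removed wrapper's call into `ops.ConvGeneralDilated`; the concrete shapes are arbitrary examples.

import numpy as np
from tensorflow.compiler.xla.python import xla_client

ops = xla_client.ops

c = xla_client.XlaBuilder("conv")
lhs = ops.ConstantLiteral(c, np.ones((1, 4, 4, 1), dtype=np.float32))  # NHWC
rhs = ops.ConstantLiteral(c, np.ones((2, 2, 1, 1), dtype=np.float32))  # HWIO
dnums = xla_client.make_convolution_dimension_numbers(
    ("NHWC", "HWIO", "NHWC"), num_spatial_dimensions=2)
ops.ConvGeneralDilated(
    lhs, rhs,
    [1, 1],            # window_strides
    [(0, 0), (0, 0)],  # explicit (low, high) padding per spatial dimension
    [1, 1], [1, 1],    # lhs_dilation, rhs_dilation
    dnums, 1, 1,       # dimension_numbers, feature/batch group counts
    precision_config=None)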
Use `Sort` instead. - """ - return ops.Sort(self._builder, [keys, values], dimension) - - def QR(self, a, full_matrices=True): - """Enqueues a QR decomposition onto the computation.""" - return self.Tuple(*ops.QR(a, full_matrices)) - - def TriangularSolve(self, - a, - b, - left_side=False, - lower=False, - transpose_a=False, - conjugate_a=False, - unit_diagonal=False): - """Enqueues a triangular-solve operation onto the computation.""" - if not transpose_a: - transpose = _xla.TriangularSolveOptions_Transpose.NO_TRANSPOSE - if conjugate_a: - a = self.Conj(a) - else: - transpose = ( - _xla.TriangularSolveOptions_Transpose.ADJOINT - if conjugate_a else _xla.TriangularSolveOptions_Transpose.TRANSPOSE) - return ops.TriangularSolve(a, b, left_side, lower, unit_diagonal, transpose) - - def Eigh(self, a, full_matrices=True): - """Enqueues a symmetric/Hermitian eigendecomposition.""" - return self.Tuple(*ops.Eigh(a, full_matrices)) - - def SVD(self, a): - """Enqueues a singular value decomposition.""" - return self.Tuple(*ops.SVD(a)) - - def Gather(self, - a, - start_indices, - dimension_numbers, - slice_sizes, - indices_are_sorted=False): - """Enqueues a Gather operation onto the computation.""" - return ops.Gather(a, start_indices, dimension_numbers, slice_sizes, - indices_are_sorted) - - def Scatter(self, - a, - scatter_indices, - updates, - update_computation, - dimension_numbers, - indices_are_sorted=False, - unique_indices=False): - """Enqueues a Scatter operation onto the computation.""" - return ops.Scatter(a, scatter_indices, updates, - update_computation.computation, dimension_numbers, - indices_are_sorted, unique_indices) - - def Fft(self, operand, fft_type, fft_lengths): - """Enqueues a FFT operation onto the computation.""" - return ops.Fft(operand, fft_type, fft_lengths) - - +XlaBuilder = _xla.XlaBuilder +XlaComputation = _xla.XlaComputation FftType = _xla.FftType -_UNARY_OPS = [ - 'Not', - 'PopulationCount', - 'Clz', - 'Abs', - 'Exp', - 'Expm1', - 'Floor', - 'Round', - 'Ceil', - 'Log', - 'Log1p', - 'Sign', - 'Cos', - 'Sin', - 'Tanh', - 'IsFinite', - 'Sqrt', - 'Rsqrt', - 'Square', - 'Reciprocal', - 'Neg', - 'Erf', - 'Erfc', - 'ErfInv', - 'Lgamma', - 'Digamma', - 'BesselI0e', - 'BesselI1e', - 'Acos', - 'Asin', - 'Atan', - 'Tan', - 'Acosh', - 'Asinh', - 'Atanh', - 'Cosh', - 'Sinh', - 'Real', - 'Imag', - 'Conj', -] - -_BINARY_OPS = [ - 'Eq', - 'Ne', - 'Ge', - 'Gt', - 'Lt', - 'Le', - 'Add', - 'Sub', - 'Mul', - 'Div', - 'Rem', - 'Max', - 'Min', - 'And', - 'Or', - 'Xor', - 'Pow', - 'ShiftLeft', - 'ShiftRightArithmetic', - 'ShiftRightLogical', - 'Atan2', - 'Igamma', - 'IgammaGradA', - 'Igammac', - 'Complex', - 'NextAfter', -] - -_OTHER_OPS = [ - 'BitcastConvertType', - 'Broadcast', - 'BroadcastInDim', - 'Cholesky', - 'Clamp', - 'Collapse', - 'CollectivePermute', - 'ConvertElementType', - 'Dot', - 'GetTupleElement', - 'ReducePrecision', - 'RegularizedIncompleteBeta', - 'Rev', - 'Select', - 'SliceInDim', - 'TopK', -] - - -def _forward_methods_to_local_builder(): - """Forward remaining ComputationBuilder methods to the C API. - - Set up methods, corresponding to XLA operations, - whose calls are forwarded in a boilerplate manner to the underlying - _xla.ops API. 
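Annotation: every name in the `_UNARY_OPS`/`_BINARY_OPS`/`_OTHER_OPS` lists deleted here was a thin forwarder that simply dropped the builder argument, so callers switch from builder methods to the same names on `xla_client.ops`. A short sketch:

import numpy as np
from tensorflow.compiler.xla.python import xla_client

ops = xla_client.ops

c = xla_client.XlaBuilder("elementwise")
x = ops.ConstantLiteral(c, np.array([1.0, 4.0, 9.0], dtype=np.float32))
# Formerly c.Sqrt(x), c.Add(x, y), c.Dot(x, y), ... via the forwarding shim.
y = ops.Sqrt(x)
ops.Add(x, y)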
- """ - - def forward_op(target_method): - - def forward(builder, *args, **kwargs): - del builder - return target_method(*args, **kwargs) - - return forward - - for method_name in itertools.chain(_UNARY_OPS, _BINARY_OPS, _OTHER_OPS): - forward = forward_op(getattr(ops, method_name)) - forward.__name__ = method_name - setattr(ComputationBuilder, method_name, forward) - - -_forward_methods_to_local_builder() - def register_custom_call_target(name, fn, platform='cpu'): """Registers a custom call target. @@ -1782,7 +411,7 @@ def register_custom_call_target(name, fn, platform='cpu'): fn: a PyCapsule object containing the function pointer. platform: the target platform. """ - _xla.RegisterCustomCallTarget(name, fn, xla_platform_names[platform]) + _xla.register_custom_call_target(name, fn, xla_platform_names[platform]) # Deprecated. Use register_custom_call_target instead. @@ -1807,15 +436,28 @@ class PaddingConfig(object): self.dimensions = [] -def GetPaddingConfigFromTriples(triples): - """Create PaddingConfig proto from list of triples of integers.""" - padding_config = PaddingConfig() - for lo, hi, interior in triples: - dimension = PaddingConfigDimension() - dimension.edge_padding_low = lo - dimension.edge_padding_high = hi - dimension.interior_padding = interior - padding_config.dimensions.append(dimension) +def make_padding_config( + padding_config: Union[PaddingConfig, Sequence[Tuple[int, int, int]]] +) -> PaddingConfig: + """Create PaddingConfig proto from list of triples of integers. + + Args: + padding_config: either a PaddingConfig or a list of integer triples + (edge_padding_low, edge_padding_high, interior_padding) representing the + configuration of the padding operation. + + Returns: + A `PaddingConfig` object. + """ + if isinstance(padding_config, tuple) or isinstance(padding_config, list): + triples = padding_config + padding_config = PaddingConfig() + for lo, hi, interior in triples: + dimension = PaddingConfigDimension() + dimension.edge_padding_low = lo + dimension.edge_padding_high = hi + dimension.interior_padding = interior + padding_config.dimensions.append(dimension) return padding_config @@ -1831,14 +473,32 @@ class DotDimensionNumbers(object): self.rhs_batch_dimensions = [] -def GetDotDimensionsFromLists(dimension_numbers): - (lhs_contract, rhs_contract), (lhs_batch, rhs_batch) = dimension_numbers - dot_dims_proto = DotDimensionNumbers() - dot_dims_proto.lhs_contracting_dimensions.extend(lhs_contract) - dot_dims_proto.rhs_contracting_dimensions.extend(rhs_contract) - dot_dims_proto.lhs_batch_dimensions.extend(lhs_batch) - dot_dims_proto.rhs_batch_dimensions.extend(rhs_batch) - return dot_dims_proto +def make_dot_dimension_numbers( + dimension_numbers: Union[DotDimensionNumbers, + Tuple[Tuple[List[int], List[int]], + Tuple[List[int], List[int]]]] +) -> DotDimensionNumbers: + """Builds a DotDimensionNumbers object from a specification. + + Args: + dimension_numbers: either a `DotDimensionNumbers` or a nested tuple + `((lhs_contract, rhs_contract), (lhs_batch, rhs_batch))` of lists of + integers representing the dimensions to treat as contracting dimensions + and batch dimensions on each input operand. + + Returns: + A `DotDimensionNumbers` object. 
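Usage sketch for the two conversion helpers introduced in this hunk, `make_padding_config` and `make_dot_dimension_numbers`; the `ops.Pad` and `ops.DotGeneral` calls mirror the removed builder methods that used to perform these conversions internally, and the operand shapes are arbitrary examples.

import numpy as np
from tensorflow.compiler.xla.python import xla_client

ops = xla_client.ops

c = xla_client.XlaBuilder("helpers")
x = ops.ConstantLiteral(c, np.ones((3, 4), dtype=np.float32))
y = ops.ConstantLiteral(c, np.ones((4, 2), dtype=np.float32))

# (edge_padding_low, edge_padding_high, interior_padding) per dimension.
config = xla_client.make_padding_config([(1, 1, 0), (0, 2, 1)])
ops.Pad(x, ops.ConstantLiteral(c, np.float32(0.0)), config)

# ((lhs_contract, rhs_contract), (lhs_batch, rhs_batch)): a plain matmul
# contracts lhs dim 1 with rhs dim 0 and has no batch dimensions.
dnums = xla_client.make_dot_dimension_numbers((([1], [0]), ([], [])))
ops.DotGeneral(x, y, dnums, precision_config=None)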
+ """ + if isinstance(dimension_numbers, (list, tuple)): + (lhs_contract, rhs_contract), (lhs_batch, rhs_batch) = dimension_numbers + dot_dims_proto = DotDimensionNumbers() + dot_dims_proto.lhs_contracting_dimensions.extend(lhs_contract) + dot_dims_proto.rhs_contracting_dimensions.extend(rhs_contract) + dot_dims_proto.lhs_batch_dimensions.extend(lhs_batch) + dot_dims_proto.rhs_batch_dimensions.extend(rhs_batch) + return dot_dims_proto + else: + return dimension_numbers class ConvolutionDimensionNumbers(object): @@ -1861,6 +521,70 @@ class ConvolutionDimensionNumbers(object): self.output_spatial_dimensions = [] +def make_convolution_dimension_numbers( + dimension_numbers: Union[None, ConvolutionDimensionNumbers, Tuple[str, str, + str]], + num_spatial_dimensions: int) -> ConvolutionDimensionNumbers: + """Builds a ConvolutionDimensionNumbers object from a specification. + + Args: + dimension_numbers: optional, either a ConvolutionDimensionNumbers object or + a tuple (lhs_spec, rhs_spec, out_spec). Each element is a string of + length N+2 identifying by position: (1) batch dimensions in lhs, rhs, and + the output with the character 'N', (2) feature dimensions in lhs and the + output with the character 'C', (3) input and output feature dimensions + in rhs with the characters 'I' and 'O' respectively, and (4) spatial + dimension correspondences between lhs, rhs, and the output using any + distinct characters. For example, to indicate dimension numbers + consistent with the Conv operation with two spatial dimensions, one + could use ('NCHW', 'OIHW', 'NCHW'). As another example, to indicate + dimension numbers consistent with the TensorFlow Conv2D operation, one + could use ('NHWC', 'HWIO', 'NHWC'). When using the latter form of + convolution dimension specification, window strides are associated with + spatial dimension character labels according to the order in which the + labels appear in the rhs_spec string, so that window_strides[0] is + matched with the dimension corresponding to the first character + appearing in rhs_spec that is not 'I' or 'O'. By default, use the same + dimension numbering as Conv and ConvWithGeneralPadding. + num_spatial_dimensions: the number of spatial dimensions. + + Returns: + A `ConvolutionDimensionNumbers` object. 
+ """ + if dimension_numbers is None: + nd = num_spatial_dimensions + dimension_numbers = ConvolutionDimensionNumbers() + dimension_numbers.input_batch_dimension = 0 + dimension_numbers.input_feature_dimension = 1 + dimension_numbers.output_batch_dimension = 0 + dimension_numbers.output_feature_dimension = 1 + dimension_numbers.kernel_output_feature_dimension = 0 + dimension_numbers.kernel_input_feature_dimension = 1 + dimension_numbers.input_spatial_dimensions.extend(range(2, 2 + nd)) + dimension_numbers.kernel_spatial_dimensions.extend(range(2, 2 + nd)) + dimension_numbers.output_spatial_dimensions.extend(range(2, 2 + nd)) + elif isinstance(dimension_numbers, tuple): + lhs_spec, rhs_spec, out_spec = dimension_numbers + dimension_numbers = ConvolutionDimensionNumbers() + + dimension_numbers.input_batch_dimension = lhs_spec.index('N') + dimension_numbers.input_feature_dimension = lhs_spec.index('C') + dimension_numbers.output_batch_dimension = out_spec.index('N') + dimension_numbers.output_feature_dimension = out_spec.index('C') + dimension_numbers.kernel_output_feature_dimension = rhs_spec.index('O') + dimension_numbers.kernel_input_feature_dimension = rhs_spec.index('I') + + dimension_numbers.kernel_spatial_dimensions.extend( + i for i, c in enumerate(rhs_spec) if c not in {'I', 'O'}) + dimension_numbers.input_spatial_dimensions.extend( + sorted((i for i, c in enumerate(lhs_spec) if c not in {'N', 'C'}), + key=lambda i: rhs_spec.index(lhs_spec[i]))) + dimension_numbers.output_spatial_dimensions.extend( + sorted((i for i, c in enumerate(out_spec) if c not in {'N', 'C'}), + key=lambda i: rhs_spec.index(out_spec[i]))) + return dimension_numbers + + class OpSharding(object): """Python representation of a xla.OpSharding protobuf.""" __slots__ = ('type', 'tile_assignment_dimensions', 'tile_assignment_devices', @@ -1923,7 +647,7 @@ def _make_replica_group_proto(replica_group): return replica_group_proto -def _get_replica_groups_protos(replica_groups): +def make_replica_groups(replica_groups): if replica_groups is None: replica_groups_protos = [] # special value for XLA API else: diff --git a/tensorflow/compiler/xla/python/xla_client_test.py b/tensorflow/compiler/xla/python/xla_client_test.py index 95b760965d8..fbdd9921a40 100644 --- a/tensorflow/compiler/xla/python/xla_client_test.py +++ b/tensorflow/compiler/xla/python/xla_client_test.py @@ -24,14 +24,19 @@ import itertools import threading import unittest +from absl import flags from absl.testing import absltest from absl.testing import parameterized import numpy as np -from tensorflow.compiler.xla.python import custom_call_for_test from tensorflow.compiler.xla.python import xla_client # pylint: disable=g-import-not-at-top +try: + from tensorflow.compiler.xla.python import custom_call_for_test +except ImportError: + custom_call_for_test = None + try: import portpicker except ImportError: @@ -39,2106 +44,2004 @@ except ImportError: # pylint: enable=g-import-not-at-top bfloat16 = xla_client.bfloat16 - - -class ComputationTest(absltest.TestCase): - """Base class for running an XLA Computation through the local client.""" - - def _NewComputation(self, name=None): - if name is None: - name = self.id() - return xla_client.ComputationBuilder(name) - - def _Execute(self, c, arguments): - compiled_c = c.Build().Compile() - return xla_client.execute_with_python_values(compiled_c, arguments) - - def _ExecuteAndAssertWith(self, assert_func, c, arguments, expected): - assert expected is not None - results = self._Execute(c, arguments) - 
self.assertLen(results, len(expected)) - for result, e in zip(results, expected): - # Numpy's comparison methods are a bit too lenient by treating inputs as - # "array-like", meaning that scalar 4 will be happily compared equal to - # [[4]]. We'd like to be more strict so assert shapes as well. - self.assertEqual(np.asanyarray(result).shape, np.asanyarray(e).shape) - assert_func(result, e) - - def _ExecuteAndCompareExact(self, c, arguments=(), expected=None): - self._ExecuteAndAssertWith(np.testing.assert_equal, c, arguments, expected) - - def _ExecuteAndCompareClose(self, - c, - arguments=(), - expected=None, - rtol=1e-7, - atol=0): - self._ExecuteAndAssertWith( - functools.partial(np.testing.assert_allclose, rtol=rtol, atol=atol), c, - arguments, expected) - - -def NumpyArrayF32(*args, **kwargs): - """Convenience wrapper to create Numpy arrays with a np.float32 dtype.""" - return np.array(*args, dtype=np.float32, **kwargs) - - -def NumpyArrayF64(*args, **kwargs): - """Convenience wrapper to create Numpy arrays with a np.float64 dtype.""" - return np.array(*args, dtype=np.float64, **kwargs) - - -def NumpyArrayS32(*args, **kwargs): - """Convenience wrapper to create Numpy arrays with a np.int32 dtype.""" - return np.array(*args, dtype=np.int32, **kwargs) - - -def NumpyArrayS64(*args, **kwargs): - """Convenience wrapper to create Numpy arrays with a np.int64 dtype.""" - return np.array(*args, dtype=np.int64, **kwargs) - - -def NumpyArrayBool(*args, **kwargs): - """Convenience wrapper to create Numpy arrays with a np.bool dtype.""" - return np.array(*args, dtype=np.bool, **kwargs) - - -class ComputationPrinting(absltest.TestCase): - - def ExampleComputation(self): - builder = xla_client.ComputationBuilder("acomputation") - p0 = builder.ParameterFromNumpy(np.float32(0)) - p1 = builder.ParameterFromNumpy(np.zeros((4,), np.float32)) - x = builder.Mul(p0, p1) - builder.Add(x, x) - return builder.Build() - - def testComputationToHloText(self): - computation = self.ExampleComputation() - hlo_text = computation.GetHloText() - self.assertTrue(hlo_text.startswith("HloModule acomputation")) - - def testComputationToHloGraph(self): - computation = self.ExampleComputation() - hlo_dot_graph = computation.GetHloDotGraph() - self.assertTrue(hlo_dot_graph.startswith("digraph ")) - - def testHloModuleToHloText(self): - computation = self.ExampleComputation() - hlo_text = computation.computation.get_hlo_module().to_string() - self.assertTrue(hlo_text.startswith("HloModule acomputation")) - - def testHloModuleToHloGraph(self): - computation = self.ExampleComputation() - hlo_dot_graph = xla_client._xla.hlo_module_to_dot_graph( - computation.computation.get_hlo_module()) - self.assertTrue(hlo_dot_graph.startswith("digraph ")) - - def testCompiledHloModuleToHloText(self): - computation = self.ExampleComputation() - executable = computation.Compile() - hlo_modules = executable.get_hlo_modules() - self.assertLen(hlo_modules, 1) - hlo_text = hlo_modules[0].to_string() - self.assertTrue(hlo_text.startswith("HloModule acomputation")) - self.assertIn("fusion", hlo_text) - - -class ComputationHashTest(absltest.TestCase): - - def testHash(self): - builder0 = xla_client.ComputationBuilder("computation0") - p0 = builder0.ParameterFromNumpy(np.float32(0)) - p1 = builder0.ParameterFromNumpy(np.zeros((4,), np.float32)) - builder0.Mul(p0, p1) - computation0 = builder0.Build() - - builder1 = xla_client.ComputationBuilder("computation1") - p0 = builder1.ParameterFromNumpy(np.float32(0)) - p1 = 
builder1.ParameterFromNumpy(np.zeros((4,), np.float32)) - builder1.Mul(p0, p1) - computation1 = builder1.Build() - - self.assertEqual(computation0.Hash(), computation1.Hash()) - - -class ComputationsWithConstantsTest(ComputationTest): - """Tests focusing on Constant ops.""" - - def testConstantScalarSumS8(self): - c = self._NewComputation() - c.Add(c.Constant(np.int8(1)), c.Constant(np.int8(2))) - self._ExecuteAndCompareExact(c, expected=[np.int8(3)]) - - def testConstantScalarSumBF16(self): - c = self._NewComputation() - c.Add(c.Constant(bfloat16(1.11)), c.Constant(bfloat16(3.14))) - self._ExecuteAndCompareClose(c, expected=[bfloat16(4.25)]) - - def testConstantScalarSumF32(self): - c = self._NewComputation() - c.Add(c.ConstantF32Scalar(1.11), c.ConstantF32Scalar(3.14)) - self._ExecuteAndCompareClose(c, expected=[4.25]) - - def testConstantScalarSumF64(self): - c = self._NewComputation() - c.Add(c.ConstantF64Scalar(1.11), c.ConstantF64Scalar(3.14)) - self._ExecuteAndCompareClose(c, expected=[4.25]) - - def testConstantScalarSumS32(self): - c = self._NewComputation() - c.Add(c.ConstantS32Scalar(1), c.ConstantS32Scalar(2)) - self._ExecuteAndCompareClose(c, expected=[3]) - - def testConstantScalarSumS64(self): - c = self._NewComputation() - c.Add(c.ConstantS64Scalar(1), c.ConstantS64Scalar(2)) - self._ExecuteAndCompareClose(c, expected=[3]) - - def testConstantVectorMulF16(self): - c = self._NewComputation() - c.Mul( - c.Constant(np.array([2.5, 3.3, -1.2, 0.7], np.float16)), - c.Constant(np.array([-1.2, 2, -2, -3], np.float16))) - self._ExecuteAndCompareClose( - c, expected=[np.array([-3, 6.6, 2.4, -2.1], np.float16)], rtol=2e-3) - - def testConstantVectorMulF32(self): - c = self._NewComputation() - c.Mul( - c.Constant(NumpyArrayF32([2.5, 3.3, -1.2, 0.7])), - c.Constant(NumpyArrayF32([-1.2, 2, -2, -3]))) - self._ExecuteAndCompareClose(c, expected=[[-3, 6.6, 2.4, -2.1]]) - - def testConstantVectorMulF64(self): - c = self._NewComputation() - c.Mul( - c.Constant(NumpyArrayF64([2.5, 3.3, -1.2, 0.7])), - c.Constant(NumpyArrayF64([-1.2, 2, -2, -3]))) - self._ExecuteAndCompareClose(c, expected=[[-3, 6.6, 2.4, -2.1]]) - - def testConstantVectorScalarDivF32(self): - c = self._NewComputation() - c.Div( - c.Constant(NumpyArrayF32([1.5, 2.5, 3.0, -10.8])), - c.ConstantF32Scalar(2.0)) - self._ExecuteAndCompareClose(c, expected=[[0.75, 1.25, 1.5, -5.4]]) - - def testConstantVectorScalarDivF64(self): - c = self._NewComputation() - c.Div( - c.Constant(NumpyArrayF64([1.5, 2.5, 3.0, -10.8])), - c.ConstantF64Scalar(2.0)) - self._ExecuteAndCompareClose(c, expected=[[0.75, 1.25, 1.5, -5.4]]) - - def testConstantVectorScalarPowF32(self): - c = self._NewComputation() - c.Pow(c.Constant(NumpyArrayF32([1.5, 2.5, 3.0])), c.ConstantF32Scalar(2.)) - self._ExecuteAndCompareClose(c, expected=[[2.25, 6.25, 9.]]) - - def testConstantVectorScalarPowF64(self): - c = self._NewComputation() - c.Pow(c.Constant(NumpyArrayF64([1.5, 2.5, 3.0])), c.ConstantF64Scalar(2.)) - self._ExecuteAndCompareClose(c, expected=[[2.25, 6.25, 9.]]) - - def testIota(self): - c = self._NewComputation() - c.Iota(np.float32, 10) - self._ExecuteAndCompareExact(c, expected=[np.arange(10, dtype=np.float32)]) - - def testBroadcastedIota(self): - c = self._NewComputation() - c.BroadcastedIota(np.int64, (2, 3), 1) - expected = np.array([[0, 1, 2], [0, 1, 2]], dtype=np.int64) - self._ExecuteAndCompareExact(c, expected=[expected]) - - def testBooleanAnd(self): - c = self._NewComputation() - c.And( - c.Constant(NumpyArrayBool([True, False, True, False])), - 
c.Constant(NumpyArrayBool([True, True, False, False]))) - self._ExecuteAndCompareExact(c, expected=[[True, False, False, False]]) - - def testBooleanOr(self): - c = self._NewComputation() - c.Or( - c.Constant(NumpyArrayBool([True, False, True, False])), - c.Constant(NumpyArrayBool([True, True, False, False]))) - self._ExecuteAndCompareExact(c, expected=[[True, True, True, False]]) - - def testBooleanXor(self): - c = self._NewComputation() - c.Xor( - c.Constant(NumpyArrayBool([True, False, True, False])), - c.Constant(NumpyArrayBool([True, True, False, False]))) - self._ExecuteAndCompareExact(c, expected=[[False, True, True, False]]) - - def testSum2DF32(self): - c = self._NewComputation() - c.Add( - c.Constant(NumpyArrayF32([[1, 2, 3], [4, 5, 6]])), - c.Constant(NumpyArrayF32([[1, -1, 1], [-1, 1, -1]]))) - self._ExecuteAndCompareClose(c, expected=[[[2, 1, 4], [3, 6, 5]]]) - - def testShiftLeft(self): - c = self._NewComputation() - c.ShiftLeft(c.Constant(NumpyArrayS32([3])), c.Constant(NumpyArrayS32([2]))) - self._ExecuteAndCompareClose(c, expected=[[12]]) - - def testShiftRightArithmetic(self): - c = self._NewComputation() - c.ShiftRightArithmetic( - c.Constant(NumpyArrayS32([-2])), c.Constant(NumpyArrayS32([1]))) - self._ExecuteAndCompareClose(c, expected=[[-1]]) - - def testShiftRightLogical(self): - c = self._NewComputation() - c.ShiftRightLogical( - c.Constant(NumpyArrayS32([-1])), c.Constant(NumpyArrayS32([1]))) - self._ExecuteAndCompareClose(c, expected=[[2**31 - 1]]) - - def testSum2DF64(self): - c = self._NewComputation() - c.Add( - c.Constant(NumpyArrayF64([[1, 2, 3], [4, 5, 6]])), - c.Constant(NumpyArrayF64([[1, -1, 1], [-1, 1, -1]]))) - self._ExecuteAndCompareClose(c, expected=[[[2, 1, 4], [3, 6, 5]]]) - - def testSum2DWith1DBroadcastDim0F32(self): - # sum of a 2D array with a 1D array where the latter is replicated across - # dimension 0 to match the former's shape. - c = self._NewComputation() - c.Add( - c.Constant(NumpyArrayF32([[1, 2, 3], [4, 5, 6], [7, 8, 9]])), - c.Constant(NumpyArrayF32([10, 20, 30])), - broadcast_dimensions=(0,)) - self._ExecuteAndCompareClose( - c, expected=[[[11, 12, 13], [24, 25, 26], [37, 38, 39]]]) - - def testSum2DWith1DBroadcastDim0F64(self): - # sum of a 2D array with a 1D array where the latter is replicated across - # dimension 0 to match the former's shape. - c = self._NewComputation() - c.Add( - c.Constant(NumpyArrayF64([[1, 2, 3], [4, 5, 6], [7, 8, 9]])), - c.Constant(NumpyArrayF64([10, 20, 30])), - broadcast_dimensions=(0,)) - self._ExecuteAndCompareClose( - c, expected=[[[11, 12, 13], [24, 25, 26], [37, 38, 39]]]) - - def testSum2DWith1DBroadcastDim1F32(self): - # sum of a 2D array with a 1D array where the latter is replicated across - # dimension 1 to match the former's shape. - c = self._NewComputation() - c.Add( - c.Constant(NumpyArrayF32([[1, 2, 3], [4, 5, 6], [7, 8, 9]])), - c.Constant(NumpyArrayF32([10, 20, 30])), - broadcast_dimensions=(1,)) - self._ExecuteAndCompareClose( - c, expected=[[[11, 22, 33], [14, 25, 36], [17, 28, 39]]]) - - def testSum2DWith1DBroadcastDim1F64(self): - # sum of a 2D array with a 1D array where the latter is replicated across - # dimension 1 to match the former's shape. 
- c = self._NewComputation() - c.Add( - c.Constant(NumpyArrayF64([[1, 2, 3], [4, 5, 6], [7, 8, 9]])), - c.Constant(NumpyArrayF64([10, 20, 30])), - broadcast_dimensions=(1,)) - self._ExecuteAndCompareClose( - c, expected=[[[11, 22, 33], [14, 25, 36], [17, 28, 39]]]) - - def testConstantAxpyF32(self): - c = self._NewComputation() - c.Add( - c.Mul( - c.ConstantF32Scalar(2), - c.Constant(NumpyArrayF32([2.2, 3.3, 4.4, 5.5]))), - c.Constant(NumpyArrayF32([100, -100, 200, -200]))) - self._ExecuteAndCompareClose(c, expected=[[104.4, -93.4, 208.8, -189]]) - - def testConstantAxpyF64(self): - c = self._NewComputation() - c.Add( - c.Mul( - c.ConstantF64Scalar(2), - c.Constant(NumpyArrayF64([2.2, 3.3, 4.4, 5.5]))), - c.Constant(NumpyArrayF64([100, -100, 200, -200]))) - self._ExecuteAndCompareClose(c, expected=[[104.4, -93.4, 208.8, -189]]) - - def testCustomCall(self): - c = self._NewComputation() - for name, fn in custom_call_for_test.cpu_custom_call_targets.items(): - xla_client.register_custom_call_target(name, fn, platform="cpu") - c.CustomCall( - b"test_subtract_f32", - operands=(c.ConstantF32Scalar(1.25), c.ConstantF32Scalar(0.5)), - shape=xla_client.Shape.array_shape(np.dtype(np.float32), (), ()), - operand_shapes_with_layout=( - xla_client.Shape.array_shape(np.dtype(np.float32), (), ()), - xla_client.Shape.array_shape(np.dtype(np.float32), (), ()), - )) - self._ExecuteAndCompareClose(c, expected=[0.75]) - - -class ComputationFromProtoTest(absltest.TestCase): - """Test computation execution from HLO proto.""" - - def testExecuteFromProto(self): - # Build the HLO proto - b = xla_client.ComputationBuilder("computation") - b.Add(b.Constant(np.int8(1)), b.Constant(np.int8(2))) - serialized_proto = b.Build().GetSerializedProto() - - # Load and execute the proto - c = xla_client.Computation(xla_client._xla.XlaComputation(serialized_proto)) - ans, = xla_client.execute_with_python_values(c.Compile()) - np.testing.assert_equal(ans, np.int8(3)) - - -class ParametersTest(ComputationTest): - """Tests focusing on Parameter ops and argument-passing.""" - - def setUp(self): - self.f32_scalar_2 = NumpyArrayF32(2.0) - self.f32_4vector = NumpyArrayF32([-2.3, 3.3, -4.3, 5.3]) - self.f64_scalar_2 = NumpyArrayF64(2.0) - self.f64_4vector = NumpyArrayF64([-2.3, 3.3, -4.3, 5.3]) - self.s32_scalar_3 = NumpyArrayS32(3) - self.s32_4vector = NumpyArrayS32([10, 15, -2, 7]) - self.s64_scalar_3 = NumpyArrayS64(3) - self.s64_4vector = NumpyArrayS64([10, 15, -2, 7]) - - def testScalarTimesVectorAutonumberF32(self): - c = self._NewComputation() - p0 = c.ParameterFromNumpy(self.f32_scalar_2) - p1 = c.ParameterFromNumpy(self.f32_4vector) - c.Mul(p0, p1) - self._ExecuteAndCompareClose( - c, - arguments=[self.f32_scalar_2, self.f32_4vector], - expected=[[-4.6, 6.6, -8.6, 10.6]]) - - def testScalarTimesVectorAutonumberF64(self): - c = self._NewComputation() - p0 = c.ParameterFromNumpy(self.f64_scalar_2) - p1 = c.ParameterFromNumpy(self.f64_4vector) - c.Mul(p0, p1) - self._ExecuteAndCompareClose( - c, - arguments=[self.f64_scalar_2, self.f64_4vector], - expected=[[-4.6, 6.6, -8.6, 10.6]]) - - def testScalarTimesVectorS32(self): - c = self._NewComputation() - p0 = c.ParameterFromNumpy(self.s32_scalar_3) - p1 = c.ParameterFromNumpy(self.s32_4vector) - c.Mul(p0, p1) - self._ExecuteAndCompareExact( - c, - arguments=[self.s32_scalar_3, self.s32_4vector], - expected=[[30, 45, -6, 21]]) - - def testScalarTimesVectorS64(self): - c = self._NewComputation() - p0 = c.ParameterFromNumpy(self.s64_scalar_3) - p1 = 
c.ParameterFromNumpy(self.s64_4vector) - c.Mul(p0, p1) - self._ExecuteAndCompareExact( - c, - arguments=[self.s64_scalar_3, self.s64_4vector], - expected=[[30, 45, -6, 21]]) - - def testScalarMinusVectorExplicitNumberingF32(self): - # Use explicit numbering and pass parameter_num first. Sub is used since - # it's not commutative and can help catch parameter reversal within the - # computation. - c = self._NewComputation() - p1 = c.ParameterFromNumpy(self.f32_4vector, parameter_num=1) - p0 = c.ParameterFromNumpy(self.f32_scalar_2, parameter_num=0) - c.Sub(p1, p0) - self._ExecuteAndCompareClose( - c, - arguments=[self.f32_scalar_2, self.f32_4vector], - expected=[[-4.3, 1.3, -6.3, 3.3]]) - - def testScalarMinusVectorExplicitNumberingF64(self): - # Use explicit numbering and pass parameter_num first. Sub is used since - # it's not commutative and can help catch parameter reversal within the - # computation. - c = self._NewComputation() - p1 = c.ParameterFromNumpy(self.f64_4vector, parameter_num=1) - p0 = c.ParameterFromNumpy(self.f64_scalar_2, parameter_num=0) - c.Sub(p1, p0) - self._ExecuteAndCompareClose( - c, - arguments=[self.f64_scalar_2, self.f64_4vector], - expected=[[-4.3, 1.3, -6.3, 3.3]]) - - -class BufferTest(ComputationTest): - """Tests focusing on execution with Buffers.""" - - def testConstantSum(self): - c = self._NewComputation() - c.Add(c.ConstantF32Scalar(1.11), c.ConstantF32Scalar(3.14)) - self._ExecuteAndCompareClose(c, expected=[4.25]) - - def testOneParameterSum(self): - c = self._NewComputation() - c.Add(c.ParameterFromNumpy(NumpyArrayF32(0.)), c.ConstantF32Scalar(3.14)) - self._ExecuteAndCompareClose( - c, arguments=[NumpyArrayF32(1.11)], expected=[4.25]) - - def testTwoParameterSum(self): - c = self._NewComputation() - c.Add( - c.ParameterFromNumpy(NumpyArrayF32(0.)), - c.ParameterFromNumpy(NumpyArrayF32(0.))) - self._ExecuteAndCompareClose( - c, - arguments=[NumpyArrayF32(1.11), - NumpyArrayF32(3.14)], - expected=[4.25]) - - def testCannotCallWithDeletedBuffers(self): - c = self._NewComputation() - c.Add(c.ParameterFromNumpy(NumpyArrayF32(0.)), c.ConstantF32Scalar(3.14)) - arg = NumpyArrayF32(1.11) - compiled_c = c.Build().Compile() - arg_buffer = xla_client.Buffer.from_pyval(arg) - arg_buffer.delete() - with self.assertRaises(RuntimeError): - compiled_c.Execute([arg_buffer]) - - def testShape(self): - pyval = np.array([[1., 2.]], np.float32) - local_buffer = xla_client.Buffer.from_pyval(pyval) - xla_shape = local_buffer.shape() - self.assertEqual(xla_shape.dimensions(), (1, 2)) - self.assertEqual(np.dtype(xla_shape.element_type()), np.dtype(np.float32)) - - def testBlockHostUntilReadyWorks(self): - arg = np.array([[1., 2.]], np.float32) - arg_buffer = xla_client.Buffer.from_pyval(arg) - arg_buffer.block_host_until_ready() - # This test merely checks that nothing goes awry when we call - # block_host_until_ready(); it's difficult to test anything else. - - def testCopyToHost(self): - arg0 = np.array([[1., 2.]], np.float32) - arg1 = np.array([[3., 4.]], np.float32) - arg0_buffer = xla_client.Buffer.from_pyval(arg0) - arg1_buffer = xla_client.Buffer.from_pyval(arg1) - # Prefetch two buffers using copy_to_host_async, and then retrieve their - # values using to_py. - arg0_buffer.copy_to_host_async() - arg0_buffer.copy_to_host_async() # Duplicate calls don't do anything. - arg1_buffer.copy_to_host_async() - np.testing.assert_equal(arg0, arg0_buffer.to_py()) - np.testing.assert_equal(arg1, arg1_buffer.to_py()) - # copy_to_host_async does nothing after to_py is called. 
- arg0_buffer.copy_to_host_async() - np.testing.assert_equal(arg0, arg0_buffer.to_py()) - - def testDevice(self): - x = np.arange(8) - for device in xla_client.get_local_backend().local_devices(): - buf = xla_client.Buffer.from_pyval(x, device=device) - self.assertEqual(buf.device(), device) - np.testing.assert_equal(x, buf.to_py()) - - -class SingleOpTest(ComputationTest): - """Tests for single ops. - - The goal here is smoke testing - to exercise the most basic functionality of - single XLA ops. As minimal as possible number of additional ops are added - around the op being tested. - """ - - def testConcatenateF32(self): - c = self._NewComputation() - args = ( - c.Constant(NumpyArrayF32([1.0, 2.0, 3.0])), - c.Constant(NumpyArrayF32([4.0, 5.0, 6.0])), - ) - c.Concatenate(args, dimension=0) - self._ExecuteAndCompareClose(c, expected=[[1.0, 2.0, 3.0, 4.0, 5.0, 6.0]]) - - def testConcatenateF64(self): - c = self._NewComputation() - args = ( - c.Constant(NumpyArrayF64([1.0, 2.0, 3.0])), - c.Constant(NumpyArrayF64([4.0, 5.0, 6.0])), - ) - c.Concatenate(args, dimension=0) - self._ExecuteAndCompareClose(c, expected=[[1.0, 2.0, 3.0, 4.0, 5.0, 6.0]]) - - def testConvertElementType(self): - xla_types = { - np.bool: xla_client.PrimitiveType.PRED, - np.int32: xla_client.PrimitiveType.S32, - np.int64: xla_client.PrimitiveType.S64, - np.float32: xla_client.PrimitiveType.F32, - np.float64: xla_client.PrimitiveType.F64, - } - - def _ConvertAndTest(template, src_dtype, dst_dtype): +ops = xla_client.ops + +FLAGS = flags.FLAGS + +# We choose to ignore pylint's complaints about complex comprehensions, which we +# use widely for parameterizing tests. +# pylint: disable=g-complex-comprehension + + +def TestFactory(xla_backend, cloud_tpu=False): + tests = [] + + if not cloud_tpu: + int_dtypes = [np.int32, np.int64, np.uint32, np.uint64] + # TODO(phawkins): test np.float16, where supported. + float_dtypes = [bfloat16, np.float32, np.float64] + complex_dtypes = [np.complex64, np.complex128] + standard_dtypes = int_dtypes + float_dtypes + complex_dtypes + [np.bool_] + else: + int_dtypes = [np.int32, np.uint32] + float_dtypes = [np.float32] + complex_dtypes = [np.complex64] + standard_dtypes = int_dtypes + float_dtypes + complex_dtypes + [np.bool_] + dlpack_dtypes = int_dtypes + float_dtypes + + class ComputationTest(parameterized.TestCase): + """Base class for running an XLA Computation through the local client.""" + + def setUp(self): + super(ComputationTest, self).setUp() + self.backend = xla_backend() + + def _NewComputation(self, name=None): + if name is None: + name = self.id() + return xla_client.XlaBuilder(name) + + def _Execute(self, c, arguments): + compiled_c = self.backend.compile(c.build()) + return xla_client.execute_with_python_values( + compiled_c, arguments, backend=self.backend) + + def _ExecuteAndAssertWith(self, assert_func, c, arguments, expected): + assert expected is not None + results = self._Execute(c, arguments) + self.assertLen(results, len(expected)) + for result, e in zip(results, expected): + # Numpy's comparison methods are a bit too lenient by treating inputs as + # "array-like", meaning that scalar 4 will be happily compared equal to + # [[4]]. We'd like to be more strict so assert shapes as well. 
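        # (Editorial aside, not part of the original patch: the leniency described
        # above is easy to reproduce -- np.testing.assert_allclose(4, [[4]]) passes
        # because both arguments are broadcast to a common shape before comparison.
        # The explicit shape check on the next line is what catches rank and shape
        # mismatches before delegating to assert_func.)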
+ self.assertEqual(np.asanyarray(result).shape, np.asanyarray(e).shape) + assert_func(result, e) + + def _ExecuteAndCompareExact(self, c, arguments=(), expected=None): + self._ExecuteAndAssertWith(np.testing.assert_equal, c, arguments, + expected) + + def _ExecuteAndCompareClose(self, + c, + arguments=(), + expected=None, + rtol=1e-7, + atol=0): + self._ExecuteAndAssertWith( + functools.partial(np.testing.assert_allclose, rtol=rtol, atol=atol), + c, arguments, expected) + + def NumpyArrayF32(*args, **kwargs): + """Convenience wrapper to create Numpy arrays with a np.float32 dtype.""" + return np.array(*args, dtype=np.float32, **kwargs) + + def NumpyArrayS32(*args, **kwargs): + """Convenience wrapper to create Numpy arrays with a np.int32 dtype.""" + return np.array(*args, dtype=np.int32, **kwargs) + + def NumpyArrayBool(*args, **kwargs): + """Convenience wrapper to create Numpy arrays with a np.bool dtype.""" + return np.array(*args, dtype=np.bool, **kwargs) + + class ComputationPrinting(absltest.TestCase): + + def setUp(self): + super(ComputationPrinting, self).setUp() + self.backend = xla_backend() + + def ExampleComputation(self): + builder = xla_client.XlaBuilder("acomputation") + p0 = ops.Parameter(builder, 0, xla_client.shape_from_pyval(np.float32(0))) + p1 = ops.Parameter( + builder, 1, xla_client.shape_from_pyval(np.zeros((4,), np.float32))) + x = ops.Mul(p0, p1) + ops.Add(x, x) + return builder.build() + + def testComputationToHloText(self): + computation = self.ExampleComputation() + hlo_text = computation.as_hlo_text() + self.assertTrue(hlo_text.startswith("HloModule acomputation")) + + def testComputationToHloGraph(self): + computation = self.ExampleComputation() + hlo_dot_graph = computation.as_hlo_dot_graph() + self.assertTrue(hlo_dot_graph.startswith("digraph ")) + + def testHloModuleToHloText(self): + computation = self.ExampleComputation() + hlo_text = computation.as_hlo_module().to_string() + self.assertTrue(hlo_text.startswith("HloModule acomputation")) + + def testHloModuleToHloGraph(self): + computation = self.ExampleComputation() + hlo_dot_graph = xla_client._xla.hlo_module_to_dot_graph( + computation.as_hlo_module()) + self.assertTrue(hlo_dot_graph.startswith("digraph ")) + + @unittest.skipIf(cloud_tpu, "not implemented") + def testCompiledHloModuleToHloText(self): + computation = self.ExampleComputation() + executable = self.backend.compile(computation) + hlo_modules = executable.hlo_modules() + self.assertLen(hlo_modules, 1) + hlo_text = hlo_modules[0].to_string() + self.assertTrue(hlo_text.startswith("HloModule acomputation")) + self.assertIn("fusion", hlo_text) + + tests.append(ComputationPrinting) + + class ComputationHashTest(absltest.TestCase): + + def testHash(self): + builder0 = xla_client.XlaBuilder("computation0") + p0 = ops.Parameter(builder0, 0, + xla_client.shape_from_pyval(np.float32(0))) + p1 = ops.Parameter( + builder0, 1, xla_client.shape_from_pyval(np.zeros((4,), np.float32))) + ops.Mul(p0, p1) + computation0 = builder0.build() + + builder1 = xla_client.XlaBuilder("computation1") + p0 = ops.Parameter(builder1, 0, + xla_client.shape_from_pyval(np.float32(0))) + p1 = ops.Parameter( + builder1, 1, xla_client.shape_from_pyval(np.zeros((4,), np.float32))) + ops.Mul(p0, p1) + computation1 = builder1.build() + + self.assertEqual(computation0.hash(), computation1.hash()) + + tests.append(ComputationHashTest) + + class ComputationsWithConstantsTest(ComputationTest): + """Tests focusing on Constant ops.""" + + @parameterized.named_parameters({ + 
"testcase_name": "_{}".format(dtype.__name__), + "dtype": dtype, + } for dtype in int_dtypes + float_dtypes) + def testConstantScalarSum(self, dtype): + if dtype == np.int8 and self.backend.platform == "tpu": + self.skipTest("TPU doesn't support int8") c = self._NewComputation() - x = c.Constant(np.array(template, dtype=src_dtype)) - c.ConvertElementType(x, xla_types[dst_dtype]) + ops.Add(ops.Constant(c, dtype(1.11)), ops.Constant(c, dtype(3.14))) + self._ExecuteAndCompareClose(c, expected=[dtype(1.11) + dtype(3.14)]) - result = xla_client.execute_with_python_values(c.Build().Compile()) + @parameterized.named_parameters({ + "testcase_name": "_{}".format(dtype.__name__), + "dtype": dtype, + } for dtype in float_dtypes) + def testConstantVectorMul(self, dtype): + c = self._NewComputation() + ops.Mul( + ops.Constant(c, np.array([2.5, 3.3, -1.2, 0.7], dtype)), + ops.Constant(c, np.array([-1.2, 2, -2, -3], dtype))) + self._ExecuteAndCompareClose( + c, expected=[[-3, 6.6, 2.4, -2.1]], rtol=3e-3) + + @parameterized.named_parameters({ + "testcase_name": "_{}".format(dtype.__name__), + "dtype": dtype, + } for dtype in float_dtypes) + def testConstantVectorScalarDiv(self, dtype): + c = self._NewComputation() + ops.Div( + ops.Constant(c, np.array([1.5, 2.5, 3.0, -10.8], dtype=dtype)), + ops.Constant(c, dtype(2.0))) + self._ExecuteAndCompareClose( + c, expected=[[0.75, 1.25, 1.5, -5.4]], rtol=2e-3) + + @parameterized.named_parameters({ + "testcase_name": "_{}".format(dtype.__name__), + "dtype": dtype, + } for dtype in float_dtypes) + def testConstantVectorScalarPow(self, dtype): + c = self._NewComputation() + ops.Pow( + ops.Constant(c, np.array([1.5, 2.5, 3.0], dtype=dtype)), + ops.Constant(c, dtype(2.))) + self._ExecuteAndCompareClose(c, expected=[[2.25, 6.25, 9.]]) + + def testIota(self): + c = self._NewComputation() + ops.Iota(c, xla_client.PrimitiveType.F32, 10) + self._ExecuteAndCompareExact( + c, expected=[np.arange(10, dtype=np.float32)]) + + @parameterized.named_parameters({ + "testcase_name": "_{}".format(dtype.__name__), + "dtype": dtype, + } for dtype in int_dtypes) + def testBroadcastedIota(self, dtype): + c = self._NewComputation() + shape = xla_client.Shape.array_shape( + xla_client.dtype_to_etype(dtype), (2, 3)) + ops.Iota(c, shape, 1) + expected = np.array([[0, 1, 2], [0, 1, 2]], dtype=dtype) + self._ExecuteAndCompareExact(c, expected=[expected]) + + def testBooleanAnd(self): + c = self._NewComputation() + ops.And( + ops.Constant(c, NumpyArrayBool([True, False, True, False])), + ops.Constant(c, NumpyArrayBool([True, True, False, False]))) + self._ExecuteAndCompareExact(c, expected=[[True, False, False, False]]) + + def testBooleanOr(self): + c = self._NewComputation() + ops.Or( + ops.Constant(c, NumpyArrayBool([True, False, True, False])), + ops.Constant(c, NumpyArrayBool([True, True, False, False]))) + self._ExecuteAndCompareExact(c, expected=[[True, True, True, False]]) + + def testBooleanXor(self): + c = self._NewComputation() + ops.Xor( + ops.Constant(c, NumpyArrayBool([True, False, True, False])), + ops.Constant(c, NumpyArrayBool([True, True, False, False]))) + self._ExecuteAndCompareExact(c, expected=[[False, True, True, False]]) + + @parameterized.named_parameters({ + "testcase_name": "_{}".format(dtype.__name__), + "dtype": dtype, + } for dtype in float_dtypes) + def testSum2D(self, dtype): + c = self._NewComputation() + ops.Add( + ops.Constant(c, np.array([[1, 2, 3], [4, 5, 6]], dtype=dtype)), + ops.Constant(c, np.array([[1, -1, 1], [-1, 1, -1]], dtype=dtype))) + 
self._ExecuteAndCompareClose(c, expected=[[[2, 1, 4], [3, 6, 5]]]) + + def testShiftLeft(self): + c = self._NewComputation() + ops.ShiftLeft( + ops.Constant(c, NumpyArrayS32([3])), + ops.Constant(c, NumpyArrayS32([2]))) + self._ExecuteAndCompareClose(c, expected=[[12]]) + + def testShiftRightArithmetic(self): + c = self._NewComputation() + ops.ShiftRightArithmetic( + ops.Constant(c, NumpyArrayS32([-2])), + ops.Constant(c, NumpyArrayS32([1]))) + self._ExecuteAndCompareClose(c, expected=[[-1]]) + + def testShiftRightLogical(self): + c = self._NewComputation() + ops.ShiftRightLogical( + ops.Constant(c, NumpyArrayS32([-1])), + ops.Constant(c, NumpyArrayS32([1]))) + self._ExecuteAndCompareClose(c, expected=[[2**31 - 1]]) + + @parameterized.named_parameters({ + "testcase_name": "_{}".format(dtype.__name__), + "dtype": dtype, + } for dtype in float_dtypes) + def testSum2DWith1DBroadcastDim0(self, dtype): + # sum of a 2D array with a 1D array where the latter is replicated across + # dimension 0 to match the former's shape. + c = self._NewComputation() + ops.Add( + ops.Constant(c, + np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], + dtype=dtype)), + ops.Constant(c, np.array([10, 20, 30], dtype=dtype)), + broadcast_dimensions=(0,)) + self._ExecuteAndCompareClose( + c, expected=[[[11, 12, 13], [24, 25, 26], [37, 38, 39]]]) + + @parameterized.named_parameters({ + "testcase_name": "_{}".format(dtype.__name__), + "dtype": dtype, + } for dtype in float_dtypes) + def testSum2DWith1DBroadcastDim1(self, dtype): + # sum of a 2D array with a 1D array where the latter is replicated across + # dimension 1 to match the former's shape. + c = self._NewComputation() + ops.Add( + ops.Constant(c, + np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], + dtype=dtype)), + ops.Constant(c, np.array([10, 20, 30], dtype=dtype)), + broadcast_dimensions=(1,)) + self._ExecuteAndCompareClose( + c, expected=[[[11, 22, 33], [14, 25, 36], [17, 28, 39]]]) + + @parameterized.named_parameters({ + "testcase_name": "_{}".format(dtype.__name__), + "dtype": dtype, + } for dtype in float_dtypes) + def testConstantAxpy(self, dtype): + c = self._NewComputation() + ops.Add( + ops.Mul( + ops.Constant(c, dtype(2)), + ops.Constant(c, np.array([2.2, 3.3, 4.4, 5.5], dtype=dtype))), + ops.Constant(c, np.array([100, -100, 200, -200], dtype))) + self._ExecuteAndCompareClose( + c, expected=[[104.4, -93.4, 208.8, -189]], rtol=2e-3) + + def testCustomCall(self): + if self.backend.platform != "cpu": + self.skipTest("Test requires cpu platform") + c = self._NewComputation() + for name, fn in custom_call_for_test.cpu_custom_call_targets.items(): + xla_client.register_custom_call_target(name, fn, platform="cpu") + ops.CustomCallWithLayout( + c, + b"test_subtract_f32", + operands=[ + ops.Constant(c, np.float32(1.25)), + ops.Constant(c, np.float32(0.5)) + ], + shape_with_layout=xla_client.Shape.array_shape( + np.dtype(np.float32), (), ()), + operand_shapes_with_layout=[ + xla_client.Shape.array_shape(np.dtype(np.float32), (), ()), + xla_client.Shape.array_shape(np.dtype(np.float32), (), ()), + ]) + self._ExecuteAndCompareClose(c, expected=[0.75]) + + tests.append(ComputationsWithConstantsTest) + + class ComputationFromProtoTest(absltest.TestCase): + """Test computation execution from HLO proto.""" + + def setUp(self): + super(ComputationFromProtoTest, self).setUp() + self.backend = xla_backend() + + def testExecuteFromProto(self): + # Build the HLO proto + b = xla_client.XlaBuilder("computation") + ops.Add(ops.Constant(b, np.int32(1)), ops.Constant(b, np.int32(2))) + 
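      # (Editorial aside, not part of the original patch: build() returns an
      # XlaComputation backed by an HloModuleProto; as_serialized_hlo_module_proto()
      # below serializes that proto to bytes, and xla_client.XlaComputation(...)
      # reconstitutes a computation that compiles and executes like a freshly
      # built one.)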
serialized_proto = b.build().as_serialized_hlo_module_proto() + + # Load and execute the proto + c = xla_client.XlaComputation(serialized_proto) + ans, = xla_client.execute_with_python_values( + self.backend.compile(c), (), backend=self.backend) + np.testing.assert_equal(ans, np.int32(3)) + + tests.append(ComputationFromProtoTest) + + class ParametersTest(ComputationTest): + """Tests focusing on Parameter ops and argument-passing.""" + + @parameterized.named_parameters({ + "testcase_name": "_{}".format(dtype.__name__), + "dtype": dtype, + } for dtype in int_dtypes) + def testScalarTimesVector(self, dtype): + c = self._NewComputation() + arg0 = np.array(3, dtype=dtype) + arg1 = np.array([10, 15, -2, 7], dtype=dtype) + p0 = ops.Parameter(c, 0, xla_client.shape_from_pyval(arg0)) + p1 = ops.Parameter(c, 1, xla_client.shape_from_pyval(arg1)) + ops.Mul(p0, p1) + self._ExecuteAndCompareExact( + c, arguments=[arg0, arg1], expected=[arg0 * arg1]) + + # TODO(phawkins): test comparison harness doesn't support bfloat16 + @parameterized.named_parameters({ + "testcase_name": "_{}".format(dtype.__name__), + "dtype": dtype, + } for dtype in float_dtypes if dtype != bfloat16) + def testScalarMinusVectorExplicitNumbering(self, dtype): + # Use explicit numbering and pass parameter_num first. Sub is used since + # it's not commutative and can help catch parameter reversal within the + # computation. + c = self._NewComputation() + arg0 = np.array(2.0, dtype=dtype) + arg1 = np.array([-2.3, 3.3, -4.3, 5.3], dtype=dtype) + p1 = ops.Parameter(c, 1, xla_client.shape_from_pyval(arg1)) + p0 = ops.Parameter(c, 0, xla_client.shape_from_pyval(arg0)) + ops.Sub(p1, p0) + self._ExecuteAndCompareClose( + c, arguments=[arg0, arg1], expected=[arg1 - arg0]) + + tests.append(ParametersTest) + + class BufferTest(ComputationTest): + """Tests focusing on execution with Buffers.""" + + def testConstantSum(self): + c = self._NewComputation() + ops.Add( + ops.Constant(c, np.float32(1.11)), ops.Constant(c, np.float32(3.14))) + self._ExecuteAndCompareClose(c, expected=[4.25]) + + def testOneParameterSum(self): + c = self._NewComputation() + ops.Add( + ops.Parameter(c, 0, xla_client.shape_from_pyval(NumpyArrayF32(0.))), + ops.Constant(c, np.float32(3.14))) + self._ExecuteAndCompareClose( + c, arguments=[NumpyArrayF32(1.11)], expected=[4.25]) + + def testTwoParameterSum(self): + c = self._NewComputation() + ops.Add( + ops.Parameter(c, 0, xla_client.shape_from_pyval(NumpyArrayF32(0.))), + ops.Parameter(c, 1, xla_client.shape_from_pyval(NumpyArrayF32(0.)))) + self._ExecuteAndCompareClose( + c, + arguments=[NumpyArrayF32(1.11), + NumpyArrayF32(3.14)], + expected=[4.25]) + + @unittest.skipIf(cloud_tpu, "not implemented") + def testCannotCallWithDeletedBuffers(self): + c = self._NewComputation() + ops.Add( + ops.Parameter(c, 0, xla_client.shape_from_pyval(NumpyArrayF32(0.))), + ops.Constant(c, np.float32(3.14))) + arg = NumpyArrayF32(1.11) + compiled_c = self.backend.compile(c.build()) + arg_buffer = self.backend.buffer_from_pyval(arg) + arg_buffer.delete() + with self.assertRaises(RuntimeError): + compiled_c.execute([arg_buffer]) + + def testShape(self): + pyval = np.array([[1., 2.]], np.float32) + local_buffer = self.backend.buffer_from_pyval(pyval) + xla_shape = local_buffer.shape() + self.assertEqual(xla_shape.dimensions(), (1, 2)) + self.assertEqual(np.dtype(xla_shape.element_type()), np.dtype(np.float32)) + + def testBlockHostUntilReadyWorks(self): + arg = np.array([[1., 2.]], np.float32) + arg_buffer = 
self.backend.buffer_from_pyval(arg) + arg_buffer.block_host_until_ready() + # This test merely checks that nothing goes awry when we call + # block_host_until_ready(); it's difficult to test anything else. + + def testCopyToHost(self): + arg0 = np.array([[1., 2.]], np.float32) + arg1 = np.array([[3., 4.]], np.float32) + arg0_buffer = self.backend.buffer_from_pyval(arg0) + arg1_buffer = self.backend.buffer_from_pyval(arg1) + # Prefetch two buffers using copy_to_host_async, and then retrieve their + # values using to_py. + arg0_buffer.copy_to_host_async() + arg0_buffer.copy_to_host_async() # Duplicate calls don't do anything. + arg1_buffer.copy_to_host_async() + np.testing.assert_equal(arg0, arg0_buffer.to_py()) + np.testing.assert_equal(arg1, arg1_buffer.to_py()) + # copy_to_host_async does nothing after to_py is called. + arg0_buffer.copy_to_host_async() + np.testing.assert_equal(arg0, arg0_buffer.to_py()) + + def testDevice(self): + x = np.arange(8, dtype=np.int32) + for device in self.backend.local_devices(): + buf = self.backend.buffer_from_pyval(x, device=device) + self.assertEqual(buf.device(), device) + np.testing.assert_equal(x, buf.to_py()) + + tests.append(BufferTest) + + class SingleOpTest(ComputationTest): + """Tests for single ops. + + The goal here is smoke testing - to exercise the most basic functionality of + single XLA ops. As minimal as possible number of additional ops are added + around the op being tested. + """ + + @parameterized.named_parameters({ + "testcase_name": "_{}".format(dtype.__name__), + "dtype": dtype, + } for dtype in float_dtypes) + def testConcatenate(self, dtype): + c = self._NewComputation() + args = ( + ops.Constant(c, np.array([1.0, 2.0, 3.0], dtype=dtype)), + ops.Constant(c, np.array([4.0, 5.0, 6.0], dtype=dtype)), + ) + ops.ConcatInDim(c, args, dimension=0) + self._ExecuteAndCompareExact( + c, expected=[np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], dtype=dtype)]) + + @parameterized.named_parameters({ + "testcase_name": "_{}_{}".format(src_dtype.__name__, + dst_dtype.__name__), + "src_dtype": src_dtype, + "dst_dtype": dst_dtype, + } for src_dtype, dst_dtype in itertools.permutations( + [np.bool, np.int32, np.int64, np.float32, np.float64], 2)) + def testConvertElementType(self, src_dtype, dst_dtype): + if ((src_dtype in [np.int64, np.float64] or + dst_dtype in [np.int64, np.float64]) and + self.backend.platform == "tpu"): + self.skipTest("TPU doesn't support float64") + c = self._NewComputation() + x = np.array([0, 1, 0, 0, 1], dtype=src_dtype) + ops.ConvertElementType( + ops.Constant(c, x), xla_client.dtype_to_etype(dst_dtype)) + + result = xla_client.execute_with_python_values( + self.backend.compile(c.build()), (), backend=self.backend) self.assertLen(result, 1) - expected = np.array(template, dtype=dst_dtype) + expected = np.array(x, dtype=dst_dtype) self.assertEqual(result[0].shape, expected.shape) self.assertEqual(result[0].dtype, expected.dtype) np.testing.assert_equal(result[0], expected) - x = [0, 1, 0, 0, 1] - for src_dtype, dst_dtype in itertools.product(xla_types, xla_types): - _ConvertAndTest(x, src_dtype, dst_dtype) - - def testBitcastConvertType(self): - xla_x32_types = { - np.int32: xla_client.PrimitiveType.S32, - np.float32: xla_client.PrimitiveType.F32, - } - - xla_x64_types = { - np.int64: xla_client.PrimitiveType.S64, - np.float64: xla_client.PrimitiveType.F64, - } - - def _ConvertAndTest(template, src_dtype, dst_dtype, dst_etype): + @parameterized.named_parameters( + { + "testcase_name": "_{}_{}".format(src_dtype.__name__, + 
dst_dtype.__name__), + "src_dtype": src_dtype, + "dst_dtype": dst_dtype, + } + for dtypes in [[np.int32, np.float32], [np.int64, np.float64]] + for src_dtype, dst_dtype in itertools.permutations(dtypes, 2)) + def testBitcastConvertType(self, src_dtype, dst_dtype): + if (np.float64 in (src_dtype, dst_dtype) and + self.backend.platform == "tpu"): + self.skipTest("TPU doesn't support float64") c = self._NewComputation() - x = c.Constant(np.array(template, dtype=src_dtype)) - c.BitcastConvertType(x, dst_etype) + x = np.array([0, 1, 0, 0, 1], dtype=src_dtype) + ops.BitcastConvertType( + ops.Constant(c, x), xla_client.dtype_to_etype(dst_dtype)) - result = xla_client.execute_with_python_values(c.Build().Compile()) + result = xla_client.execute_with_python_values( + self.backend.compile(c.build()), (), backend=self.backend) self.assertLen(result, 1) - expected = np.array(template, src_dtype).view(dst_dtype) + expected = x.view(dst_dtype) self.assertEqual(result[0].shape, expected.shape) self.assertEqual(result[0].dtype, expected.dtype) np.testing.assert_equal(result[0], expected) - x = [0, 1, 0, 0, 1] - for xla_types in [xla_x32_types, xla_x64_types]: - for src_dtype, dst_dtype in itertools.product(xla_types, xla_types): - _ConvertAndTest(x, src_dtype, dst_dtype, xla_types[dst_dtype]) + # TODO(b/123523486) implement AllToAll on CPU + def DISABLED_testAllToAllOneReplica(self): + samples = [ + NumpyArrayF32([97.0]), + NumpyArrayF32([64.0, 117.0]), + NumpyArrayF32([[2.0, 3.0], [4.0, 5.0]]), + ] + for lhs in samples[:1]: + c = self._NewComputation() + ops.AllToAll(ops.Constant(c, lhs), 0, 0) + self._ExecuteAndCompareExact(c, expected=[lhs]) - # TODO(b/123523486) implement AllToAll on CPU - def DISABLED_testAllToAllOneReplica(self): - samples = [ - NumpyArrayF32([97.0]), - NumpyArrayF32([64.0, 117.0]), - NumpyArrayF32([[2.0, 3.0], [4.0, 5.0]]), - ] - for lhs in samples[:1]: + def testCrossReplicaSumOneReplica(self): + samples = [ + NumpyArrayF32(42.0), + NumpyArrayF32([97.0]), + NumpyArrayF32([64.0, 117.0]), + NumpyArrayF32([[2.0, 3.0], [4.0, 5.0]]), + ] + for lhs in samples: + c = self._NewComputation() + ops.CrossReplicaSum(ops.Constant(c, lhs)) + self._ExecuteAndCompareExact(c, expected=[lhs]) + + def testReplicaId(self): c = self._NewComputation() - c.AllToAll(c.Constant(lhs), 0, 0) - self._ExecuteAndCompareExact(c, expected=[lhs]) + _ = ops.ReplicaId(c) + self._ExecuteAndCompareExact(c, expected=[0]) - def testCrossReplicaSumOneReplica(self): - samples = [ - NumpyArrayF32(42.0), - NumpyArrayF32([97.0]), - NumpyArrayF32([64.0, 117.0]), - NumpyArrayF32([[2.0, 3.0], [4.0, 5.0]]), - ] - for lhs in samples: + def testCrossReplicaSumOneReplicaWithSingletonGroup(self): + samples = [ + NumpyArrayF32(42.0), + NumpyArrayF32([97.0]), + NumpyArrayF32([64.0, 117.0]), + NumpyArrayF32([[2.0, 3.0], [4.0, 5.0]]), + ] + for lhs in samples: + c = self._NewComputation() + ops.CrossReplicaSum( + ops.Constant(c, lhs), xla_client.make_replica_groups([[0]])) + self._ExecuteAndCompareExact(c, expected=[lhs]) + + # TODO(phawkins): np.dot implementation doesn't support bfloat16 + @parameterized.named_parameters({ + "testcase_name": "_{}".format(dtype.__name__), + "dtype": dtype, + } for dtype in float_dtypes if dtype != bfloat16) + def testDotMatrixVector(self, dtype): c = self._NewComputation() - c.CrossReplicaSum(c.Constant(lhs)) - self._ExecuteAndCompareExact(c, expected=[lhs]) + lhs = np.array([[2.0, 3.0], [4.0, 5.0]], dtype=dtype) + rhs = np.array([[10.0], [20.0]], dtype=dtype) + ops.Dot(ops.Constant(c, lhs), 
ops.Constant(c, rhs)) + self._ExecuteAndCompareClose(c, expected=[np.dot(lhs, rhs)]) - def testReplicaId(self): - c = self._NewComputation() - _ = c.ReplicaId() - self._ExecuteAndCompareExact(c, expected=[0]) - - def testCrossReplicaSumOneReplicaWithSingletonGroup(self): - samples = [ - NumpyArrayF32(42.0), - NumpyArrayF32([97.0]), - NumpyArrayF32([64.0, 117.0]), - NumpyArrayF32([[2.0, 3.0], [4.0, 5.0]]), - ] - for lhs in samples: + # TODO(phawkins): np.dot implementation doesn't support bfloat16 + @parameterized.named_parameters({ + "testcase_name": "_{}".format(dtype.__name__), + "dtype": dtype, + } for dtype in float_dtypes if dtype != bfloat16) + def testDotMatrixMatrix(self, dtype): c = self._NewComputation() - c.CrossReplicaSum(c.Constant(lhs), [[0]]) - self._ExecuteAndCompareExact(c, expected=[lhs]) + lhs = np.array([[2.0, 3.0], [4.0, 5.0]], dtype=dtype) + rhs = np.array([[10.0, 20.0], [100.0, 200.0]], dtype=dtype) + ops.Dot(ops.Constant(c, lhs), ops.Constant(c, rhs)) + self._ExecuteAndCompareClose(c, expected=[np.dot(lhs, rhs)]) - def testDotMatrixVectorF32(self): - c = self._NewComputation() - lhs = NumpyArrayF32([[2.0, 3.0], [4.0, 5.0]]) - rhs = NumpyArrayF32([[10.0], [20.0]]) - c.Dot(c.Constant(lhs), c.Constant(rhs)) - self._ExecuteAndCompareClose(c, expected=[np.dot(lhs, rhs)]) - - def testDotMatrixVectorF64(self): - c = self._NewComputation() - lhs = NumpyArrayF64([[2.0, 3.0], [4.0, 5.0]]) - rhs = NumpyArrayF64([[10.0], [20.0]]) - c.Dot(c.Constant(lhs), c.Constant(rhs)) - self._ExecuteAndCompareClose(c, expected=[np.dot(lhs, rhs)]) - - def testDotMatrixMatrixF32(self): - c = self._NewComputation() - lhs = NumpyArrayF32([[2.0, 3.0], [4.0, 5.0]]) - rhs = NumpyArrayF32([[10.0, 20.0], [100.0, 200.0]]) - c.Dot(c.Constant(lhs), c.Constant(rhs)) - self._ExecuteAndCompareClose(c, expected=[np.dot(lhs, rhs)]) - - def testDotMatrixMatrixF64(self): - c = self._NewComputation() - lhs = NumpyArrayF64([[2.0, 3.0], [4.0, 5.0]]) - rhs = NumpyArrayF64([[10.0, 20.0], [100.0, 200.0]]) - c.Dot(c.Constant(lhs), c.Constant(rhs)) - self._ExecuteAndCompareClose(c, expected=[np.dot(lhs, rhs)]) - - def testDotGeneral(self): - c = self._NewComputation() - rng = np.random.RandomState(0) - lhs = NumpyArrayF32(rng.randn(10, 3, 4)) - rhs = NumpyArrayF32(rng.randn(10, 4, 5)) - dimension_numbers = (([2], [1]), ([0], [0])) - c.DotGeneral(c.Constant(lhs), c.Constant(rhs), dimension_numbers) - self._ExecuteAndCompareClose(c, expected=[np.matmul(lhs, rhs)], rtol=1e-6) - - def testDotGeneralWithDotDimensionNumbersProto(self): - c = self._NewComputation() - rng = np.random.RandomState(0) - lhs = NumpyArrayF32(rng.randn(10, 3, 4)) - rhs = NumpyArrayF32(rng.randn(10, 4, 5)) - - dimension_numbers = xla_client.DotDimensionNumbers() - dimension_numbers.lhs_contracting_dimensions.append(2) - dimension_numbers.rhs_contracting_dimensions.append(1) - dimension_numbers.lhs_batch_dimensions.append(0) - dimension_numbers.rhs_batch_dimensions.append(0) - - c.DotGeneral(c.Constant(lhs), c.Constant(rhs), dimension_numbers) - self._ExecuteAndCompareClose(c, expected=[np.matmul(lhs, rhs)], rtol=1e-6) - - def testDotGeneralWithPrecisionConfig(self): - c = self._NewComputation() - rng = np.random.RandomState(0) - lhs = NumpyArrayF32(rng.randn(10, 3, 4)) - rhs = NumpyArrayF32(rng.randn(10, 4, 5)) - dimension_numbers = (([2], [1]), ([0], [0])) - config = xla_client.PrecisionConfig() - config.operand_precision.append(config.Precision.HIGH) - config.operand_precision.append(config.Precision.HIGHEST) - c.DotGeneral( - c.Constant(lhs), - 
c.Constant(rhs), - dimension_numbers, - precision_config=config) - self._ExecuteAndCompareClose(c, expected=[np.matmul(lhs, rhs)], rtol=1e-6) - - def testConvF32Same(self): - c = self._NewComputation() - a = lambda *dims: np.arange(np.prod(dims)).reshape(dims).astype("float32") - lhs = a(1, 2, 3, 4) - rhs = a(1, 2, 1, 2) * 10 - c.Conv( - c.Constant(lhs), c.Constant(rhs), [1, 1], xla_client.PaddingType.SAME) - result = np.array([[[ - [640., 700., 760., 300.], - [880., 940., 1000., 380.], - [1120., 1180., 1240., 460.], - ]]]) - self._ExecuteAndCompareClose(c, expected=[result]) - - def testConvF32Valid(self): - c = self._NewComputation() - a = lambda *dims: np.arange(np.prod(dims)).reshape(dims).astype("float32") - lhs = a(1, 2, 3, 4) - rhs = a(1, 2, 1, 2) * 10 - c.Conv( - c.Constant(lhs), c.Constant(rhs), [2, 1], xla_client.PaddingType.VALID) - result = np.array([[[ - [640., 700., 760.], - [1120., 1180., 1240.], - ]]]) - self._ExecuteAndCompareClose(c, expected=[result]) - - def testConvWithGeneralPaddingF32(self): - c = self._NewComputation() - a = lambda *dims: np.arange(np.prod(dims)).reshape(dims).astype("float32") - lhs = a(1, 1, 2, 3) - rhs = a(1, 1, 1, 2) * 10 - strides = [1, 1] - pads = [(1, 0), (0, 1)] - lhs_dilation = (2, 1) - rhs_dilation = (1, 1) - c.ConvWithGeneralPadding( - c.Constant(lhs), c.Constant(rhs), strides, pads, lhs_dilation, - rhs_dilation) - result = np.array([[[ - [0., 0., 0.], - [10., 20., 0.], - [0., 0., 0.], - [40., 50., 0.], - ]]]) - self._ExecuteAndCompareClose(c, expected=[result]) - - def testConvGeneralDilatedF32(self): - c = self._NewComputation() - a = lambda *dims: np.arange(np.prod(dims)).reshape(dims).astype("float32") - lhs = a(1, 1, 2, 3) - rhs = a(1, 1, 1, 2) * 10 - strides = [1, 1] - pads = [(1, 0), (0, 1)] - lhs_dilation = (2, 1) - rhs_dilation = (1, 1) - dimension_numbers = ("NCHW", "OIHW", "NCHW") - c.ConvGeneralDilated( - c.Constant(lhs), c.Constant(rhs), strides, pads, lhs_dilation, - rhs_dilation, dimension_numbers) - result = np.array([[[ - [0., 0., 0.], - [10., 20., 0.], - [0., 0., 0.], - [40., 50., 0.], - ]]]) - self._ExecuteAndCompareClose(c, expected=[result]) - - def testConvGeneralDilatedF32WithPrecisionConfig(self): - c = self._NewComputation() - a = lambda *dims: np.arange(np.prod(dims)).reshape(dims).astype("float32") - lhs = a(1, 1, 2, 3) - rhs = a(1, 1, 1, 2) * 10 - strides = [1, 1] - pads = [(1, 0), (0, 1)] - lhs_dilation = (2, 1) - rhs_dilation = (1, 1) - dimension_numbers = ("NCHW", "OIHW", "NCHW") - config = xla_client.PrecisionConfig() - config.operand_precision.append(config.Precision.HIGHEST) - config.operand_precision.append(config.Precision.DEFAULT) - c.ConvGeneralDilated( - c.Constant(lhs), - c.Constant(rhs), - strides, - pads, - lhs_dilation, - rhs_dilation, - dimension_numbers, - precision_config=config) - result = np.array([[[ - [0., 0., 0.], - [10., 20., 0.], - [0., 0., 0.], - [40., 50., 0.], - ]]]) - self._ExecuteAndCompareClose(c, expected=[result]) - - def testConvGeneralDilatedPermutedF32(self): - c = self._NewComputation() - a = lambda *dims: np.arange(np.prod(dims)).reshape(dims).astype("float32") - lhs = a(1, 1, 2, 3) - rhs = a(1, 1, 1, 2) * 10 - strides = [1, 1] - pads = [(1, 0), (0, 1)] - lhs_dilation = (2, 1) - rhs_dilation = (1, 1) - - dimension_numbers = ("NHWC", "OIHW", "CWNH") - c.ConvGeneralDilated( - c.Constant(np.transpose(lhs, (0, 2, 3, 1))), c.Constant(rhs), strides, - pads, lhs_dilation, rhs_dilation, dimension_numbers) - result = np.array([[[[0., 0., 0.], [10., 20., 0.], [0., 0., 0.], - [40., 
50., 0.]]]]) - self._ExecuteAndCompareClose( - c, expected=[np.transpose(result, (1, 3, 0, 2))]) - - def testConvGeneralDilatedGroupedConvolutionF32(self): - c = self._NewComputation() - a = lambda *dims: np.arange(np.prod(dims)).reshape(dims).astype("float32") - lhs = a(1, 2, 2, 3) - rhs = a(2, 1, 1, 2) * 10 - strides = [1, 1] - pads = [(1, 0), (0, 1)] - lhs_dilation = (2, 1) - rhs_dilation = (1, 1) - dimension_numbers = ("NCHW", "OIHW", "NCHW") - feature_group_count = 2 - c.ConvGeneralDilated( - c.Constant(lhs), c.Constant(rhs), strides, pads, lhs_dilation, - rhs_dilation, dimension_numbers, feature_group_count) - result = np.array([[[ - [0., 0., 0.], - [10., 20., 0.], - [0., 0., 0.], - [40., 50., 0.], - ], [ - [0., 0., 0.], - [330., 380., 160.], - [0., 0., 0.], - [480., 530., 220.], - ]]]) - self._ExecuteAndCompareClose(c, expected=[result]) - - def testBooleanNot(self): - c = self._NewComputation() - arr = NumpyArrayBool([True, False, True]) - c.Not(c.Constant(arr)) - self._ExecuteAndCompareClose(c, expected=[~arr]) - - def testPopulationCount(self): - c = self._NewComputation() - arr = NumpyArrayS32([3, 0, 1]) - c.PopulationCount(c.Constant(arr)) - self._ExecuteAndCompareClose(c, expected=[np.array([2, 0, 1])]) - - def testCountLeadingZeros(self): - c = self._NewComputation() - arr = NumpyArrayS32([0x7FFF, 0x12345678]) - c.Clz(c.Constant(arr)) - self._ExecuteAndCompareClose(c, expected=[[17, 3]]) - - def testExp(self): - c = self._NewComputation() - arr = NumpyArrayF32([3.3, 12.1]) - c.Exp(c.Constant(arr)) - self._ExecuteAndCompareClose(c, expected=[np.exp(arr)]) - - def testExpm1(self): - c = self._NewComputation() - arr = NumpyArrayF32([3.3, 12.1]) - c.Expm1(c.Constant(arr)) - self._ExecuteAndCompareClose(c, expected=[np.expm1(arr)]) - - def testRound(self): - c = self._NewComputation() - arr = NumpyArrayF32([3.3, 12.1]) - c.Round(c.Constant(arr)) - self._ExecuteAndCompareClose(c, expected=[np.round(arr)]) - - def testLog(self): - c = self._NewComputation() - arr = NumpyArrayF32([3.3, 12.1]) - c.Log(c.Constant(arr)) - self._ExecuteAndCompareClose(c, expected=[np.log(arr)]) - - def testLog1p(self): - c = self._NewComputation() - arr = NumpyArrayF32([3.3, 12.1]) - c.Log1p(c.Constant(arr)) - self._ExecuteAndCompareClose(c, expected=[np.log1p(arr)]) - - def testNeg(self): - c = self._NewComputation() - arr = NumpyArrayF32([3.3, 12.1]) - c.Neg(c.Constant(arr)) - self._ExecuteAndCompareClose(c, expected=[-arr]) - - def testFloor(self): - c = self._NewComputation() - arr = NumpyArrayF32([3.3, 12.1]) - c.Floor(c.Constant(arr)) - self._ExecuteAndCompareClose(c, expected=[np.floor(arr)]) - - def testCeil(self): - c = self._NewComputation() - arr = NumpyArrayF32([3.3, 12.1]) - c.Ceil(c.Constant(arr)) - self._ExecuteAndCompareClose(c, expected=[np.ceil(arr)]) - - def testAbs(self): - c = self._NewComputation() - arr = NumpyArrayF32([3.3, -12.1, 2.4, -1.]) - c.Abs(c.Constant(arr)) - self._ExecuteAndCompareClose(c, expected=[np.abs(arr)]) - - def testTanh(self): - c = self._NewComputation() - arr = NumpyArrayF32([3.3, 12.1]) - c.Tanh(c.Constant(arr)) - self._ExecuteAndCompareClose(c, expected=[np.tanh(arr)]) - - def testTrans(self): - - def _TransposeAndTest(array): + def testDotGeneral(self): c = self._NewComputation() - c.Trans(c.Constant(array)) - self._ExecuteAndCompareClose(c, expected=[array.T]) + rng = np.random.RandomState(0) + lhs = NumpyArrayF32(rng.randn(10, 3, 4)) + rhs = NumpyArrayF32(rng.randn(10, 4, 5)) + dimension_numbers = xla_client.make_dot_dimension_numbers( + (([2], [1]), 
([0], [0]))) + ops.DotGeneral( + ops.Constant(c, lhs), ops.Constant(c, rhs), dimension_numbers) + self._ExecuteAndCompareClose(c, expected=[np.matmul(lhs, rhs)], rtol=1e-6) - # Test square and non-square matrices in both default (C) and F orders. - for array_fun in [NumpyArrayF32, NumpyArrayF64]: - _TransposeAndTest(array_fun([[1, 2, 3], [4, 5, 6]])) - _TransposeAndTest(array_fun([[1, 2, 3], [4, 5, 6]], order="F")) - _TransposeAndTest(array_fun([[1, 2], [4, 5]])) - _TransposeAndTest(array_fun([[1, 2], [4, 5]], order="F")) - - def testTranspose(self): - - def _TransposeAndTest(array, permutation): + def testDotGeneralWithDotDimensionNumbersProto(self): c = self._NewComputation() - c.Transpose(c.Constant(array), permutation) - expected = np.transpose(array, permutation) + rng = np.random.RandomState(0) + lhs = NumpyArrayF32(rng.randn(10, 3, 4)) + rhs = NumpyArrayF32(rng.randn(10, 4, 5)) + + dimension_numbers = xla_client.DotDimensionNumbers() + dimension_numbers.lhs_contracting_dimensions.append(2) + dimension_numbers.rhs_contracting_dimensions.append(1) + dimension_numbers.lhs_batch_dimensions.append(0) + dimension_numbers.rhs_batch_dimensions.append(0) + + ops.DotGeneral( + ops.Constant(c, lhs), ops.Constant(c, rhs), dimension_numbers) + self._ExecuteAndCompareClose(c, expected=[np.matmul(lhs, rhs)], rtol=1e-6) + + def testDotGeneralWithPrecisionConfig(self): + c = self._NewComputation() + rng = np.random.RandomState(0) + lhs = NumpyArrayF32(rng.randn(10, 3, 4)) + rhs = NumpyArrayF32(rng.randn(10, 4, 5)) + dimension_numbers = xla_client.make_dot_dimension_numbers( + (([2], [1]), ([0], [0]))) + config = xla_client.PrecisionConfig() + config.operand_precision.append(config.Precision.HIGH) + config.operand_precision.append(config.Precision.HIGHEST) + ops.DotGeneral( + ops.Constant(c, lhs), + ops.Constant(c, rhs), + dimension_numbers, + precision_config=config) + self._ExecuteAndCompareClose(c, expected=[np.matmul(lhs, rhs)], rtol=1e-6) + + def testConvGeneralDilatedF32(self): + c = self._NewComputation() + a = lambda *dims: np.arange(np.prod(dims)).reshape(dims).astype("float32") + lhs = a(1, 1, 2, 3) + rhs = a(1, 1, 1, 2) * 10 + strides = [1, 1] + pads = [(1, 0), (0, 1)] + lhs_dilation = (2, 1) + rhs_dilation = (1, 1) + dimension_numbers = xla_client.make_convolution_dimension_numbers( + ("NCHW", "OIHW", "NCHW"), 2) + ops.ConvGeneralDilated( + ops.Constant(c, lhs), ops.Constant(c, rhs), strides, pads, + lhs_dilation, rhs_dilation, dimension_numbers) + result = np.array([[[ + [0., 0., 0.], + [10., 20., 0.], + [0., 0., 0.], + [40., 50., 0.], + ]]]) + self._ExecuteAndCompareClose(c, expected=[result]) + + def testConvGeneralDilatedF32WithPrecisionConfig(self): + c = self._NewComputation() + a = lambda *dims: np.arange(np.prod(dims)).reshape(dims).astype("float32") + lhs = a(1, 1, 2, 3) + rhs = a(1, 1, 1, 2) * 10 + strides = [1, 1] + pads = [(1, 0), (0, 1)] + lhs_dilation = (2, 1) + rhs_dilation = (1, 1) + dimension_numbers = xla_client.make_convolution_dimension_numbers( + ("NCHW", "OIHW", "NCHW"), 2) + config = xla_client.PrecisionConfig() + config.operand_precision.append(config.Precision.HIGHEST) + config.operand_precision.append(config.Precision.DEFAULT) + ops.ConvGeneralDilated( + ops.Constant(c, lhs), + ops.Constant(c, rhs), + strides, + pads, + lhs_dilation, + rhs_dilation, + dimension_numbers, + precision_config=config) + result = np.array([[[ + [0., 0., 0.], + [10., 20., 0.], + [0., 0., 0.], + [40., 50., 0.], + ]]]) + self._ExecuteAndCompareClose(c, expected=[result]) + + def 
testConvGeneralDilatedPermutedF32(self): + c = self._NewComputation() + a = lambda *dims: np.arange(np.prod(dims)).reshape(dims).astype("float32") + lhs = a(1, 1, 2, 3) + rhs = a(1, 1, 1, 2) * 10 + strides = [1, 1] + pads = [(1, 0), (0, 1)] + lhs_dilation = (2, 1) + rhs_dilation = (1, 1) + + dimension_numbers = xla_client.make_convolution_dimension_numbers( + ("NHWC", "OIHW", "CWNH"), 2) + ops.ConvGeneralDilated( + ops.Constant(c, np.transpose(lhs, + (0, 2, 3, 1))), ops.Constant(c, rhs), + strides, pads, lhs_dilation, rhs_dilation, dimension_numbers) + result = np.array([[[[0., 0., 0.], [10., 20., 0.], [0., 0., 0.], + [40., 50., 0.]]]]) + self._ExecuteAndCompareClose( + c, expected=[np.transpose(result, (1, 3, 0, 2))]) + + def testConvGeneralDilatedGroupedConvolutionF32(self): + c = self._NewComputation() + a = lambda *dims: np.arange(np.prod(dims)).reshape(dims).astype("float32") + lhs = a(1, 2, 2, 3) + rhs = a(2, 1, 1, 2) * 10 + strides = [1, 1] + pads = [(1, 0), (0, 1)] + lhs_dilation = (2, 1) + rhs_dilation = (1, 1) + dimension_numbers = xla_client.make_convolution_dimension_numbers( + ("NCHW", "OIHW", "NCHW"), 2) + feature_group_count = 2 + ops.ConvGeneralDilated( + ops.Constant(c, lhs), ops.Constant(c, rhs), strides, pads, + lhs_dilation, rhs_dilation, dimension_numbers, feature_group_count) + result = np.array([[[ + [0., 0., 0.], + [10., 20., 0.], + [0., 0., 0.], + [40., 50., 0.], + ], [ + [0., 0., 0.], + [330., 380., 160.], + [0., 0., 0.], + [480., 530., 220.], + ]]]) + self._ExecuteAndCompareClose(c, expected=[result]) + + def testBooleanNot(self): + c = self._NewComputation() + arr = NumpyArrayBool([True, False, True]) + ops.Not(ops.Constant(c, arr)) + self._ExecuteAndCompareClose(c, expected=[~arr]) + + def testPopulationCount(self): + c = self._NewComputation() + arr = NumpyArrayS32([3, 0, 1]) + ops.PopulationCount(ops.Constant(c, arr)) + self._ExecuteAndCompareClose(c, expected=[np.array([2, 0, 1])]) + + def testCountLeadingZeros(self): + c = self._NewComputation() + arr = NumpyArrayS32([0x7FFF, 0x12345678]) + ops.Clz(ops.Constant(c, arr)) + self._ExecuteAndCompareClose(c, expected=[[17, 3]]) + + def testExp(self): + c = self._NewComputation() + arr = NumpyArrayF32([3.3, 12.1]) + ops.Exp(ops.Constant(c, arr)) + self._ExecuteAndCompareClose(c, expected=[np.exp(arr)]) + + def testExpm1(self): + c = self._NewComputation() + arr = NumpyArrayF32([3.3, 12.1]) + ops.Expm1(ops.Constant(c, arr)) + self._ExecuteAndCompareClose(c, expected=[np.expm1(arr)]) + + def testRound(self): + c = self._NewComputation() + arr = NumpyArrayF32([3.3, 12.1]) + ops.Round(ops.Constant(c, arr)) + self._ExecuteAndCompareClose(c, expected=[np.round(arr)]) + + def testLog(self): + c = self._NewComputation() + arr = NumpyArrayF32([3.3, 12.1]) + ops.Log(ops.Constant(c, arr)) + self._ExecuteAndCompareClose(c, expected=[np.log(arr)]) + + def testLog1p(self): + c = self._NewComputation() + arr = NumpyArrayF32([3.3, 12.1]) + ops.Log1p(ops.Constant(c, arr)) + self._ExecuteAndCompareClose(c, expected=[np.log1p(arr)]) + + def testNeg(self): + c = self._NewComputation() + arr = NumpyArrayF32([3.3, 12.1]) + ops.Neg(ops.Constant(c, arr)) + self._ExecuteAndCompareClose(c, expected=[-arr]) + + def testFloor(self): + c = self._NewComputation() + arr = NumpyArrayF32([3.3, 12.1]) + ops.Floor(ops.Constant(c, arr)) + self._ExecuteAndCompareClose(c, expected=[np.floor(arr)]) + + def testCeil(self): + c = self._NewComputation() + arr = NumpyArrayF32([3.3, 12.1]) + ops.Ceil(ops.Constant(c, arr)) + self._ExecuteAndCompareClose(c, 
expected=[np.ceil(arr)]) + + def testAbs(self): + c = self._NewComputation() + arr = NumpyArrayF32([3.3, -12.1, 2.4, -1.]) + ops.Abs(ops.Constant(c, arr)) + self._ExecuteAndCompareClose(c, expected=[np.abs(arr)]) + + def testTanh(self): + c = self._NewComputation() + arr = NumpyArrayF32([3.3, 12.1]) + ops.Tanh(ops.Constant(c, arr)) + self._ExecuteAndCompareClose(c, expected=[np.tanh(arr)]) + + def testTranspose(self): + + def _TransposeAndTest(array, permutation): + c = self._NewComputation() + ops.Transpose(ops.Constant(c, array), permutation) + expected = np.transpose(array, permutation) + self._ExecuteAndCompareClose(c, expected=[expected]) + + _TransposeAndTest(NumpyArrayF32([[1, 2, 3], [4, 5, 6]]), [0, 1]) + _TransposeAndTest(NumpyArrayF32([[1, 2, 3], [4, 5, 6]]), [1, 0]) + _TransposeAndTest(NumpyArrayF32([[1, 2], [4, 5]]), [0, 1]) + _TransposeAndTest(NumpyArrayF32([[1, 2], [4, 5]]), [1, 0]) + + arr = np.random.RandomState(0).randn(2, 3, 4).astype(np.float32) + for permutation in itertools.permutations(range(arr.ndim)): + _TransposeAndTest(arr, permutation) + _TransposeAndTest(np.asfortranarray(arr), permutation) + + def testEq(self): + c = self._NewComputation() + ops.Eq( + ops.Constant(c, NumpyArrayS32([1, 2, 3, 4])), + ops.Constant(c, NumpyArrayS32([4, 2, 3, 1]))) + self._ExecuteAndCompareExact(c, expected=[[False, True, True, False]]) + + def testNe(self): + c = self._NewComputation() + ops.Ne( + ops.Constant(c, NumpyArrayS32([1, 2, 3, 4])), + ops.Constant(c, NumpyArrayS32([4, 2, 3, 1]))) + self._ExecuteAndCompareExact(c, expected=[[True, False, False, True]]) + + ops.Ne( + ops.Constant(c, NumpyArrayF32([-2.0, 0.0, + float("nan"), + float("nan")])), + ops.Constant(c, NumpyArrayF32([2.0, -0.0, 1.0, + float("nan")]))) + self._ExecuteAndAssertWith( + np.testing.assert_allclose, + c, (), + expected=[[True, False, True, True]]) + + def testGt(self): + c = self._NewComputation() + ops.Gt( + ops.Constant(c, NumpyArrayS32([1, 2, 3, 4, 9])), + ops.Constant(c, NumpyArrayS32([1, 0, 2, 7, 12]))) + self._ExecuteAndCompareExact( + c, expected=[[False, True, True, False, False]]) + + def testGe(self): + c = self._NewComputation() + ops.Ge( + ops.Constant(c, NumpyArrayS32([1, 2, 3, 4, 9])), + ops.Constant(c, NumpyArrayS32([1, 0, 2, 7, 12]))) + self._ExecuteAndCompareExact( + c, expected=[[True, True, True, False, False]]) + + def testLt(self): + c = self._NewComputation() + ops.Lt( + ops.Constant(c, NumpyArrayS32([1, 2, 3, 4, 9])), + ops.Constant(c, NumpyArrayS32([1, 0, 2, 7, 12]))) + self._ExecuteAndCompareExact( + c, expected=[[False, False, False, True, True]]) + + def testLe(self): + c = self._NewComputation() + ops.Le( + ops.Constant(c, NumpyArrayS32([1, 2, 3, 4, 9])), + ops.Constant(c, NumpyArrayS32([1, 0, 2, 7, 12]))) + self._ExecuteAndCompareExact( + c, expected=[[True, False, False, True, True]]) + + def testMax(self): + c = self._NewComputation() + ops.Max( + ops.Constant(c, NumpyArrayF32([1.0, 2.0, 3.0, 4.0, 9.0])), + ops.Constant(c, NumpyArrayF32([1.0, 0.0, 2.0, 7.0, 12.0]))) + self._ExecuteAndCompareExact(c, expected=[[1.0, 2.0, 3.0, 7.0, 12.0]]) + + def testMaxExplicitBroadcastDim0(self): + c = self._NewComputation() + ops.Max( + ops.Constant(c, NumpyArrayF32([[1, 2, 3], [4, 5, 6], [7, 8, 9]])), + ops.Constant(c, NumpyArrayF32([3, 4, 5])), + broadcast_dimensions=(0,)) + self._ExecuteAndCompareExact( + c, expected=[[[3, 3, 3], [4, 5, 6], [7, 8, 9]]]) + + def testMaxExplicitBroadcastDim1(self): + c = self._NewComputation() + ops.Max( + ops.Constant(c, NumpyArrayF32([[1, 2, 3], [4, 5, 
6], [7, 8, 9]])), + ops.Constant(c, NumpyArrayF32([3, 4, 5])), + broadcast_dimensions=(1,)) + self._ExecuteAndCompareExact( + c, expected=[[[3, 4, 5], [4, 5, 6], [7, 8, 9]]]) + + def testMin(self): + c = self._NewComputation() + ops.Min( + ops.Constant(c, NumpyArrayF32([1.0, 2.0, 3.0, 4.0, 9.0])), + ops.Constant(c, NumpyArrayF32([1.0, 0.0, 2.0, 7.0, 12.0]))) + self._ExecuteAndCompareExact(c, expected=[[1.0, 0.0, 2.0, 4.0, 9.0]]) + + def testPad(self): + c = self._NewComputation() + ops.Pad( + ops.Constant(c, NumpyArrayF32([[1.0, 2.0], [3.0, 4.0]])), + ops.Constant(c, NumpyArrayF32(0.0)), + xla_client.make_padding_config([(1, 2, 1), (0, 1, 0)])) + self._ExecuteAndCompareClose( + c, + expected=[[[0.0, 0.0, 0.0], [1.0, 2.0, 0.0], [0.0, 0.0, 0.0], + [3.0, 4.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]]) + + def testPadWithPaddingConfig(self): + c = self._NewComputation() + padding_config = xla_client.PaddingConfig() + for lo, hi, interior in [(1, 2, 1), (0, 1, 0)]: + dimension = xla_client.PaddingConfigDimension() + dimension.edge_padding_low = lo + dimension.edge_padding_high = hi + dimension.interior_padding = interior + padding_config.dimensions.append(dimension) + ops.Pad( + ops.Constant(c, NumpyArrayF32([[1.0, 2.0], [3.0, 4.0]])), + ops.Constant(c, NumpyArrayF32(0.0)), padding_config) + self._ExecuteAndCompareClose( + c, + expected=[[[0.0, 0.0, 0.0], [1.0, 2.0, 0.0], [0.0, 0.0, 0.0], + [3.0, 4.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]]) + + def testReshape(self): + c = self._NewComputation() + ops.Reshape( + ops.Constant(c, NumpyArrayS32([[1, 2], [3, 4], [5, 6]])), + dimensions=[0, 1], + new_sizes=[2, 3]) + self._ExecuteAndCompareExact(c, expected=[[[1, 2, 3], [4, 5, 6]]]) + + def testCollapse(self): + c = self._NewComputation() + ops.Collapse( + ops.Constant(c, NumpyArrayS32([[[1, 2], [3, 4]], [[5, 6], [7, 8]]])), + dimensions=[1, 2]) + self._ExecuteAndCompareExact(c, expected=[[[1, 2, 3, 4], [5, 6, 7, 8]]]) + + def testRev(self): + c = self._NewComputation() + ops.Rev( + ops.Constant(c, NumpyArrayS32([[[1, 2], [3, 4]], [[5, 6], [7, 8]]])), + dimensions=[0, 2]) + self._ExecuteAndCompareExact( + c, expected=[[[[6, 5], [8, 7]], [[2, 1], [4, 3]]]]) + + def testReducePrecision(self): + c = self._NewComputation() + ops.ReducePrecision( + ops.Constant(c, NumpyArrayF32([float.fromhex("0x1.32fffep-3")])), + exponent_bits=8, + mantissa_bits=7) + self._ExecuteAndCompareClose(c, expected=[[float.fromhex("0x1.32p-3")]]) + + def testClampF32(self): + c = self._NewComputation() + ops.Clamp( + ops.Constant(c, NumpyArrayF32(-1)), + ops.Constant(c, NumpyArrayF32([-2, -1, 0, 1, 2, 3])), + ops.Constant(c, NumpyArrayF32(2))) + self._ExecuteAndCompareExact(c, expected=[[-1, -1, 0, 1, 2, 2]]) + + def testClampS32(self): + c = self._NewComputation() + ops.Clamp( + ops.Constant(c, NumpyArrayS32(-1)), + ops.Constant(c, NumpyArrayS32([-2, -1, 0, 1, 2, 3])), + ops.Constant(c, NumpyArrayS32(2))) + self._ExecuteAndCompareExact(c, expected=[[-1, -1, 0, 1, 2, 2]]) + + def testSelect(self): + c = self._NewComputation() + ops.Select( + ops.Constant(c, NumpyArrayBool([True, False, False, True, False])), + ops.Constant(c, NumpyArrayS32([1, 2, 3, 4, 5])), + ops.Constant(c, NumpyArrayS32([-1, -2, -3, -4, -5]))) + self._ExecuteAndCompareExact(c, expected=[[1, -2, -3, 4, -5]]) + + def testSlice(self): + c = self._NewComputation() + ops.Slice( + ops.Constant(c, NumpyArrayS32([[1, 2, 3], [4, 5, 6], [7, 8, 9]])), + [1, 0], [3, 2], [1, 1]) + self._ExecuteAndCompareExact(c, expected=[[[4, 5], [7, 8]]]) + + def testSliceInDim(self): 
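      # (Editorial note, not part of the original patch: SliceInDim is the
      # one-dimensional convenience form of Slice, taking elements
      # start_index:limit_index:stride along dimension `dimno`. The first call
      # below keeps only column 1 of the 3x3 input; the second keeps rows 0 and 2.)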
+      c = self._NewComputation()
+      ops.SliceInDim(
+          ops.Constant(c, NumpyArrayS32([[1, 2, 3], [4, 5, 6], [7, 8, 9]])),
+          start_index=1,
+          limit_index=2,
+          stride=1,
+          dimno=1)
+      self._ExecuteAndCompareExact(c, expected=[[[2], [5], [8]]])
+      ops.SliceInDim(
+          ops.Constant(c, NumpyArrayS32([[1, 2, 3], [4, 5, 6], [7, 8, 9]])),
+          start_index=0,
+          limit_index=3,
+          stride=2,
+          dimno=0)
+      self._ExecuteAndCompareExact(c, expected=[[[1, 2, 3], [7, 8, 9]]])
+
+    def testDynamicSlice(self):
+      c = self._NewComputation()
+      ops.DynamicSlice(
+          ops.Constant(c, NumpyArrayS32([[1, 2, 3], [4, 5, 6], [7, 8, 9]])),
+          [ops.Constant(c, NumpyArrayS32([1, 0]))], [2, 2])
+      self._ExecuteAndCompareExact(c, expected=[[[4, 5], [7, 8]]])
+
+    def testDynamicUpdateSlice(self):
+      c = self._NewComputation()
+      ops.DynamicUpdateSlice(
+          ops.Constant(c, NumpyArrayS32([[1, 2, 3], [4, 5, 6], [7, 8, 9]])),
+          ops.Constant(c, NumpyArrayS32([[1, 2], [3, 4]])),
+          [ops.Constant(c, NumpyArrayS32([1, 1]))])
+      self._ExecuteAndCompareExact(
+          c, expected=[[[1, 2, 3], [4, 1, 2], [7, 3, 4]]])
+
+    def testTuple(self):
+      c = self._NewComputation()
+      ops.Tuple(c, [
+          ops.Constant(c, np.int32(42)),
+          ops.Constant(c, NumpyArrayF32([1.0, 2.0])),
+          ops.Constant(c, NumpyArrayBool([True, False, False, True]))
+      ])
+      result = xla_client.execute_with_python_values(
+          self.backend.compile(c.build()), (), backend=self.backend)
+      self.assertLen(result, 3)
+      np.testing.assert_equal(result[0], 42)
+      np.testing.assert_allclose(result[1], [1.0, 2.0])
+      np.testing.assert_equal(result[2], [True, False, False, True])
+
+    def testGetTupleElement(self):
+      c = self._NewComputation()
+      ops.GetTupleElement(
+          ops.Tuple(c, [
+              ops.Constant(c, np.int32(42)),
+              ops.Constant(c, NumpyArrayF32([1.0, 2.0])),
+              ops.Constant(c, NumpyArrayBool([True, False, False, True]))
+          ]), 1)
+      self._ExecuteAndCompareClose(c, expected=[[1.0, 2.0]])
+
+    def testBroadcast(self):
+      c = self._NewComputation()
+      ops.Broadcast(
+          ops.Constant(c, NumpyArrayS32([10, 20, 30, 40])), sizes=(3,))
+      self._ExecuteAndCompareExact(
+          c, expected=[[[10, 20, 30, 40], [10, 20, 30, 40], [10, 20, 30, 40]]])
+
+    def testBroadcastInDim(self):
+      c = self._NewComputation()
+      ops.BroadcastInDim(ops.Constant(c, NumpyArrayS32([1, 2])), [2, 2], [0])
+      self._ExecuteAndCompareExact(c, expected=[[[1, 1], [2, 2]]])
+      ops.BroadcastInDim(ops.Constant(c, NumpyArrayS32([1, 2])), [2, 2], [1])
+      self._ExecuteAndCompareExact(c, expected=[[[1, 2], [1, 2]]])
+
+    def testRngNormal(self):
+      shape = (2, 3)
+      c = self._NewComputation()
+      ops.RngNormal(
+          ops.Constant(c, NumpyArrayF32(0.)),
+          ops.Constant(c, NumpyArrayF32(1.)),
+          shape=xla_client.Shape.array_shape(xla_client.PrimitiveType.F32,
+                                             shape))
+      result = xla_client.execute_with_python_values(
+          self.backend.compile(c.build()), (), backend=self.backend)
+      # since the result is random, we just check shape and uniqueness
+      self.assertLen(result, 1)
+      self.assertEqual(result[0].shape, shape)
+      self.assertLen(np.unique(result[0]), np.prod(shape))
+
+    def testRngUniformF32(self):
+      lo, hi = 2., 4.
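      # (Editorial note, not part of the original patch: RngUniform samples from
      # the half-open interval [lo, hi), which is why the assertions below check
      # lo <= result[0] but strictly result[0] < hi.)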
+ shape = (2, 3) + c = self._NewComputation() + ops.RngUniform( + ops.Constant(c, NumpyArrayF32(lo)), + ops.Constant(c, NumpyArrayF32(hi)), + shape=xla_client.Shape.array_shape(xla_client.PrimitiveType.F32, + shape)) + result = xla_client.execute_with_python_values( + self.backend.compile(c.build()), (), backend=self.backend) + # since the result is random, we just check shape, uniqueness, and range + self.assertLen(result, 1) + self.assertEqual(result[0].shape, shape) + self.assertLen(np.unique(result[0]), np.prod(shape)) + self.assertTrue(np.all(lo <= result[0])) + self.assertTrue(np.all(result[0] < hi)) + + def testRngUniformS32(self): + lo, hi = 2, 4 + shape = (2, 3) + c = self._NewComputation() + ops.RngUniform( + ops.Constant(c, NumpyArrayS32(lo)), + ops.Constant(c, NumpyArrayS32(hi)), + shape=xla_client.Shape.array_shape(xla_client.PrimitiveType.S32, + shape)) + result = xla_client.execute_with_python_values( + self.backend.compile(c.build()), (), backend=self.backend) + # since the result is random, we just check shape, integrality, and range + self.assertLen(result, 1) + self.assertEqual(result[0].shape, shape) + self.assertEqual(result[0].dtype, np.int32) + self.assertTrue(np.all(lo <= result[0])) + self.assertTrue(np.all(result[0] < hi)) + + def testCholesky(self): + l = np.array([[4, 0, 0, 0], [6, 5, 0, 0], [2, 14, 16, 0], [3, 6, 1, 4]], + dtype=np.float32) + c = self._NewComputation() + ops.Cholesky(ops.Constant(c, np.tril(np.dot(l, l.T)))) + self._ExecuteAndCompareClose(c, expected=[l], rtol=1e-4) + + def testSort(self): + keys = np.array([[2, 4, 1, 3], [3, 1, 4, 2]], dtype=np.float32) + c = self._NewComputation() + ops.Sort(c, [ops.Constant(c, keys)], is_stable=True) + self._ExecuteAndCompareClose( + c, + expected=[np.array([[1, 2, 3, 4], [1, 2, 3, 4]], dtype=np.float32)]) + + def testSortKeyVal(self): + keys = np.array([[2, 4, 1, 3], [3, 1, 4, 2]], dtype=np.float32) + values = np.array([[0, 1, 2, 3], [4, 5, 6, 7]], dtype=np.int32) + c = self._NewComputation() + ops.Sort(c, (ops.Constant(c, keys), ops.Constant(c, values)), dimension=0) + result = xla_client.execute_with_python_values( + self.backend.compile(c.build()), (), backend=self.backend) + self.assertLen(result, 2) + np.testing.assert_allclose(result[0], [[2, 1, 1, 2], [3, 4, 4, 3]]) + np.testing.assert_equal(result[1], [[0, 5, 2, 7], [4, 1, 6, 3]]) + + def testSortCustomComparator(self): + b = self._NewComputation("comparator") + p0 = ops.Parameter(b, 0, xla_client.shape_from_pyval(NumpyArrayF32(0))) + q0 = ops.Parameter(b, 1, xla_client.shape_from_pyval(NumpyArrayF32(0))) + p1 = ops.Parameter(b, 2, xla_client.shape_from_pyval(NumpyArrayS32(0))) + q1 = ops.Parameter(b, 3, xla_client.shape_from_pyval(NumpyArrayS32(0))) + ops.Or(ops.Lt(p0, q0), ops.And(ops.Eq(p0, q0), ops.Gt(p1, q1))) + comparator = b.build() + + keys = np.array([[2, 3, 1, 3], [3, 1, 2, 2]], dtype=np.float32) + values = np.array([[0, 1, 2, 3], [4, 5, 6, 7]], dtype=np.int32) + c = self._NewComputation() + ops.Sort( + c, (ops.Constant(c, keys), ops.Constant(c, values)), + dimension=1, + comparator=comparator) + result = xla_client.execute_with_python_values( + self.backend.compile(c.build()), (), backend=self.backend) + self.assertLen(result, 2) + np.testing.assert_allclose(result[0], [[1, 2, 3, 3], [1, 2, 2, 3]]) + np.testing.assert_equal(result[1], [[2, 0, 3, 1], [5, 7, 6, 4]]) + + def testQR(self): + a = np.array([[4, 6, 8, 10], [6, 45, 54, 63], [8, 54, 146, 166], + [10, 63, 166, 310]], + dtype=np.float32) + c = self._NewComputation() + ops.Tuple(c, 
ops.QR(ops.Constant(c, a), full_matrices=True)) + q, r = self._Execute(c, ()) + np.testing.assert_allclose(np.dot(q, r), a, rtol=1e-4) + + def testEigh(self): + a = np.array([[4, 6, 8, 10], [6, 45, 54, 63], [8, 54, 146, 166], + [10, 63, 166, 310]], + dtype=np.float32) + a = (a + a.T) / 2 + + c = self._NewComputation() + ops.Tuple(c, ops.Eigh(ops.Constant(c, a), lower=True)) + # TODO(b/129396575): Turn this test back on when it passes without + # fastmath. + # v, w = self._Execute(c, ()) + # self.assertLess(np.linalg.norm(np.dot(a, v) - w * v), 1e-3) + + def testSVD(self): + a = np.array([[4, 6, 8, 10], [6, 45, 54, 63], [8, 54, 146, 166], + [10, 63, 166, 310]], + dtype=np.float32) + c = self._NewComputation() + ops.Tuple(c, ops.SVD(ops.Constant(c, a))) + u, d, v = self._Execute(c, ()) + self.assertLess(np.linalg.norm(a - np.matmul(u * d, v.T)), 1e-3) + + def testTriangularSolve(self): + a_vals = np.array( + [[2, 0, 0, 0], [3, 6, 0, 0], [4, 7, 9, 0], [5, 8, 10, 11]], + dtype=np.float32) + b_vals = np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]], + dtype=np.float32) + + c = self._NewComputation() + ops.TriangularSolve( + ops.Constant(c, a_vals), + ops.Constant(c, b_vals), + left_side=False, + lower=True, + transpose_a=ops.TriangularSolveOptions_Transpose.TRANSPOSE, + unit_diagonal=False) + self._ExecuteAndCompareClose( + c, + expected=[ + np.array([ + [0.5, 0.08333334, 0.04629629, 0.03367003], + [2.5, -0.25, -0.1388889, -0.1010101], + [4.5, -0.58333331, -0.32407406, -0.23569024], + ], + dtype=np.float32) + ], + rtol=1e-4) + + def testIsConstant(self): + c = self._NewComputation() + a = ops.Constant(c, np.int32(3)) + b = ops.Constant(c, np.int32(1)) + x = ops.Parameter(c, 0, xla_client.shape_from_pyval(NumpyArrayS32(0))) + const_expr = ops.Sub(b, a) + non_const_expr = ops.Mul(const_expr, x) + self.assertTrue(c.is_constant(const_expr)) + self.assertFalse(c.is_constant(non_const_expr)) + + def testGather(self): + a = np.arange(9).astype(np.int32).reshape((3, 3)) + indices = np.array([[[0, 2], [2, 1]], [[1, 2], [2, 0]]], dtype=np.int32) + dnums = xla_client.GatherDimensionNumbers() + dnums.offset_dims.append(1) + dnums.offset_dims.append(2) + dnums.start_index_map.append(0) + dnums.start_index_map.append(1) + dnums.index_vector_dim = 2 + c = self._NewComputation() + ops.Gather( + ops.Constant(c, a), + ops.Constant(c, indices), + dnums, + slice_sizes=[1, 1]) + g, = self._Execute(c, ()) + expected = np.array([[[[2, 7]]], [[[5, 6]]]], dtype=np.int32) + np.testing.assert_allclose(g, expected, rtol=1e-4) + + def testFft(self): + if self.backend.platform == "tpu": + self.skipTest("TPU only supports 1D FFT") + shape = [2, 3, 4, 5] + rng = np.random.RandomState(0) + a = rng.randn(*shape) + 1.0j * rng.randn(*shape) + a = a.astype(np.complex64) + # FFT + c = self._NewComputation() + ops.Fft(ops.Constant(c, a), xla_client.FftType.FFT, shape[-3:]) + self._ExecuteAndCompareClose( + c, expected=[np.fft.fftn(a, axes=(1, 2, 3))], rtol=1e-4) + # IFFT + c = self._NewComputation() + ops.Fft(ops.Constant(c, a), xla_client.FftType.IFFT, shape[-3:]) + self._ExecuteAndCompareClose( + c, expected=[np.fft.ifftn(a, axes=(1, 2, 3))], rtol=1e-4) + # RFFT + b = rng.randn(*shape).astype(np.float32) + c = self._NewComputation() + ops.Fft(ops.Constant(c, b), xla_client.FftType.RFFT, shape[-3:]) + self._ExecuteAndCompareClose( + c, expected=[np.fft.rfftn(b, axes=(1, 2, 3))], rtol=1e-4) + # IRFFT + c = self._NewComputation() + ops.Fft(ops.Constant(c, a), xla_client.FftType.IRFFT, [3, 4, 8]) + self._ExecuteAndCompareClose( 
+ c, expected=[np.fft.irfftn(a, axes=(1, 2, 3))], rtol=1e-4) + + def testNextAfter(self): + c = self._NewComputation() + ops.NextAfter( + ops.Constant(c, np.array([1, 2], dtype=np.float32)), + ops.Constant(c, np.array([2, 1], dtype=np.float32))) + out, = self._Execute(c, ()) + eps = np.finfo(np.float32).eps + np.testing.assert_equal( + np.array([eps + 1, 2 - eps], dtype=np.float32), out) + + @parameterized.named_parameters({ + "testcase_name": "_{}".format(dtype.__name__), + "dtype": dtype, + } for dtype in float_dtypes) + def testRegularizedIncompleteBeta(self, dtype): + x = np.array([0.53787335, 0.24015466, 0.47494545, 0.13567594, 0.95114538], + dtype=dtype) + a = np.array([0.00753073, 0.34813385, 0.30485708, 1.29298632, 0.51472606], + dtype=dtype) + b = np.array([0.55688389, 0.59794214, 0.42661022, 1.59748339, 0.95047677], + dtype=dtype) + c = self._NewComputation() + ops.RegularizedIncompleteBeta( + ops.Constant(c, a), ops.Constant(c, b), ops.Constant(c, x)) + expected = np.array( + [0.98923271, 0.48575411, 0.57952568, 0.12579775, 0.96989155]) + self._ExecuteAndCompareClose(c, expected=[expected], rtol=2e-2) + + tests.append(SingleOpTest) + + class EmbeddedComputationsTest(ComputationTest): + """Tests for XLA graphs with embedded computations (such as maps).""" + + def _CreateConstantComputation(self, in_dtype, out_dtype): + """Computation (A) -> B that returns a constant 1 for any input.""" + c = self._NewComputation("constant_{}_{}_one".format( + in_dtype.__name__, out_dtype.__name__)) + ops.Parameter(c, 0, + xla_client.shape_from_pyval(np.array(0, dtype=in_dtype))) + ops.Constant(c, out_dtype(1)) + return c.build() + + def _CreateMulBy2Computation(self, dtype): + """Computation (dtype) -> dtype that multiplies its parameter by 2.""" + c = self._NewComputation("mul_f32_by2") + ops.Mul( + ops.Parameter( + c, 0, + xla_client.shape_from_pyval(np.array( + 0, dtype=dtype)).with_major_to_minor_layout_if_absent()), + ops.Constant(c, dtype(2.0))) + return c.build() + + def _CreateMulF32ByParamComputation(self): + """Computation (f32) -> f32 that multiplies one parameter by the other.""" + c = self._NewComputation("mul_f32_by_param") + ops.Mul( + ops.Parameter(c, 0, xla_client.shape_from_pyval(NumpyArrayF32(0))), + ops.Parameter(c, 1, xla_client.shape_from_pyval(NumpyArrayF32(0)))) + return c.build() + + def _CreateBinaryAddComputation(self, dtype): + """Computation (dtype, dtype) -> dtype that adds its two parameters.""" + c = self._NewComputation("add_param0_by_param1") + shape = xla_client.shape_from_pyval(np.array(0, dtype=dtype)) + shape = shape.with_major_to_minor_layout_if_absent() + ops.Add(ops.Parameter(c, 0, shape), ops.Parameter(c, 1, shape)) + return c.build() + + def _CreateBinaryGeComputation(self, dtype): + """Computation (dtype, dtype) -> bool that tests param0 >= param1.""" + c = self._NewComputation("param0_lt_param1") + shape = xla_client.shape_from_pyval(np.array(0, dtype=dtype)) + shape = shape.with_major_to_minor_layout_if_absent() + ops.Ge(ops.Parameter(c, 0, shape), ops.Parameter(c, 1, shape)) + return c.build() + + def _MakeSample3DArray(self, dtype): + return np.array([[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]], + [[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]], + dtype=dtype) + + @parameterized.named_parameters({ + "testcase_name": "_{}".format(dtype.__name__), + "dtype": dtype, + } for dtype in float_dtypes) + def testCall(self, dtype): + c = self._NewComputation() + ops.Call( + c, + self._CreateMulBy2Computation(dtype), + operands=(ops.Constant(c, 
dtype(5.0)),)) + self._ExecuteAndCompareClose(c, expected=[10.0]) + + @parameterized.named_parameters({ + "testcase_name": "_{}_{}".format(in_dtype.__name__, out_dtype.__name__), + "in_dtype": in_dtype, + "out_dtype": out_dtype, + } for in_dtype, out_dtype in [[np.float32, np.int32]]) + def testMapEachElementToConstant(self, in_dtype, out_dtype): + c = self._NewComputation() + ops.Map(c, + [ops.Constant(c, np.array([1.0, 2.0, 3.0, 4.0], dtype=in_dtype))], + self._CreateConstantComputation(in_dtype, out_dtype), [0]) + self._ExecuteAndCompareExact(c, expected=[[1, 1, 1, 1]]) + + @parameterized.named_parameters({ + "testcase_name": "_{}".format(dtype.__name__), + "dtype": dtype, + } for dtype in float_dtypes) + def testMapMulBy2(self, dtype): + if dtype == np.float64 and self.backend.platform == "tpu": + self.skipTest("TPU doesn't support float64") + c = self._NewComputation() + ops.Map(c, [ops.Constant(c, np.array([1.0, 2.0, 3.0, 4.0], dtype=dtype))], + self._CreateMulBy2Computation(dtype), [0]) + self._ExecuteAndCompareClose(c, expected=[[2.0, 4.0, 6.0, 8.0]]) + + @parameterized.named_parameters({ + "testcase_name": "_{}".format(dtype.__name__), + "dtype": dtype, + } for dtype in float_dtypes) + def testSimpleMapChain(self, dtype): + if dtype == np.float64 and self.backend.platform == "tpu": + self.skipTest("TPU doesn't support float64") + # Chains a map of constant-out with a map of mul-by-2 + c = self._NewComputation() + const = ops.Map( + c, [ops.Constant(c, np.array([1.0, 2.0, 3.0, 4.0], dtype=dtype))], + self._CreateConstantComputation(dtype, dtype), [0]) + ops.Map(c, [const], self._CreateMulBy2Computation(dtype), [0]) + self._ExecuteAndCompareClose(c, expected=[[2.0, 2.0, 2.0, 2.0]]) + + # TODO(b/154752816): bfloat16 crashes in evaluator. + @parameterized.named_parameters({ + "testcase_name": "_{}".format(dtype.__name__), + "dtype": dtype, + } for dtype in float_dtypes if dtype != bfloat16) + def testDivVectorsWithMap(self, dtype): + + def DivComputation(): + c = self._NewComputation("div_param0_by_param1") + shape = xla_client.shape_from_pyval(np.array(0, dtype=dtype)) + ops.Div(ops.Parameter(c, 0, shape), ops.Parameter(c, 1, shape)) + return c.build() + + c = self._NewComputation() + ops.Map(c, (ops.Constant(c, np.array([1.0, 2.0, 3.0, 4.0], dtype=dtype)), + ops.Constant(c, np.array([5.0, 5.0, 4.0, 4.0], dtype=dtype))), + DivComputation(), [0]) + self._ExecuteAndCompareClose( + c, expected=[[0.2, 0.4, 0.75, 1.0]], rtol=1e-3) + + @parameterized.named_parameters({ + "testcase_name": "_{}".format(dtype.__name__), + "dtype": dtype, + } for dtype in float_dtypes) + def testSelectAndScatter(self, dtype): + if dtype == np.float64 and self.backend.platform == "tpu": + self.skipTest("TPU doesn't support float64") + c = self._NewComputation() + operand = ops.Constant( + c, np.array([[1., 2., 6.], [4., 5., 3.]], dtype=dtype)) + window_dimensions = (2, 1) + window_strides = (1, 2) + padding = xla_client.window_padding_type_to_pad_values( + xla_client.PaddingType.VALID, + c.get_shape(operand).dimensions(), window_dimensions, window_strides) + ops.SelectAndScatterWithGeneralPadding( + operand, + select=self._CreateBinaryGeComputation(dtype), + window_dimensions=window_dimensions, + window_strides=window_strides, + padding=padding, + source=ops.Constant(c, np.array([[0.1, 0.2]], dtype=dtype)), + init_value=ops.Constant(c, np.array(1, dtype=dtype)), + scatter=self._CreateBinaryAddComputation(dtype)) + self._ExecuteAndCompareClose( + c, expected=[[[1., 1., 1.2], [1.1, 1., 1.]]], rtol=5e-3) + + 
@parameterized.named_parameters({ + "testcase_name": "_{}".format(dtype.__name__), + "dtype": dtype, + } for dtype in float_dtypes) + def testReduce1DtoScalar(self, dtype): + c = self._NewComputation() + ops.Reduce( + c, + operands=[ + ops.Constant(c, np.array([1.0, 2.0, 3.0, 4.0], dtype=dtype)) + ], + init_values=[ops.Constant(c, dtype(0))], + computation=self._CreateBinaryAddComputation(dtype), + dimensions_to_reduce=[0]) + self._ExecuteAndCompareClose(c, expected=[10]) + + # TODO(phawkins): test comparison harness doesn't support bfloat16 + @parameterized.named_parameters({ + "testcase_name": "_{}_dim{}".format(dtype.__name__, dim), + "dtype": dtype, + "dim": dim, + } for dtype in float_dtypes if dtype != bfloat16 for dim in range(2)) + def testReduce2DTo1D(self, dtype, dim): + input_array = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], dtype=dtype) + c = self._NewComputation() + ops.Reduce( + c, + operands=[ops.Constant(c, input_array)], + init_values=[ops.Constant(c, dtype(0))], + computation=self._CreateBinaryAddComputation(dtype), + dimensions_to_reduce=[dim]) + self._ExecuteAndCompareClose(c, expected=[np.sum(input_array, axis=dim)]) + + @parameterized.named_parameters({ + "testcase_name": "_{}_dims[{}]".format(dtype.__name__, dims), + "dtype": dtype, + "dims": tuple(dims) + } for dtype in float_dtypes for dims in itertools.permutations(range(3))) + def testReduce3DAllPossibleWaysF32(self, dtype, dims): + input_array = self._MakeSample3DArray(dtype) + c = self._NewComputation() + ops.Reduce( + c, + operands=[ops.Constant(c, input_array)], + init_values=[ops.Constant(c, dtype(0))], + computation=self._CreateBinaryAddComputation(dtype), + dimensions_to_reduce=dims) + self._ExecuteAndCompareClose(c, expected=[np.sum(input_array, axis=dims)]) + + @parameterized.named_parameters({ + "testcase_name": "_{}".format(dtype.__name__), + "dtype": dtype, + } for dtype in float_dtypes) + def testReduceWindowValidUnitStrides(self, dtype): + if dtype == np.float64 and self.backend.platform == "tpu": + self.skipTest("TPU doesn't support float64") + input_array = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], dtype=dtype) + c = self._NewComputation() + window_dimensions = (2, 1) + window_strides = (1, 1) + padding = xla_client.window_padding_type_to_pad_values( + xla_client.PaddingType.VALID, input_array.shape, window_dimensions, + window_strides) + ops.ReduceWindowWithGeneralPadding( + operand=ops.Constant(c, input_array), + init_value=ops.Constant(c, dtype(0)), + computation=self._CreateBinaryAddComputation(dtype), + window_dimensions=window_dimensions, + window_strides=window_strides, + base_dilations=[], + window_dilations=[], + padding=padding) + self._ExecuteAndCompareClose(c, expected=[[[5., 7., 9.]]]) + + @parameterized.named_parameters({ + "testcase_name": "_{}".format(dtype.__name__), + "dtype": dtype, + } for dtype in float_dtypes) + def testReduceWindowSameUnitStrides(self, dtype): + if dtype == np.float64 and self.backend.platform == "tpu": + self.skipTest("TPU doesn't support float64") + input_array = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], dtype=dtype) + c = self._NewComputation() + window_dimensions = (2, 1) + window_strides = (1, 1) + padding = xla_client.window_padding_type_to_pad_values( + xla_client.PaddingType.SAME, input_array.shape, window_dimensions, + window_strides) + ops.ReduceWindowWithGeneralPadding( + operand=ops.Constant(c, input_array), + init_value=ops.Constant(c, dtype(0)), + computation=self._CreateBinaryAddComputation(dtype), + 
window_dimensions=window_dimensions, + window_strides=window_strides, + base_dilations=[], + window_dilations=[], + padding=padding) + self._ExecuteAndCompareClose(c, expected=[[[5., 7., 9.], [4., 5., 6.]]]) + + @parameterized.named_parameters({ + "testcase_name": "_{}".format(dtype.__name__), + "dtype": dtype, + } for dtype in float_dtypes) + def testReduceWindowValidGeneralStrides(self, dtype): + if dtype == np.float64 and self.backend.platform == "tpu": + self.skipTest("TPU doesn't support float64") + input_array = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], dtype=dtype) + c = self._NewComputation() + window_dimensions = (2, 1) + window_strides = (1, 2) + padding = xla_client.window_padding_type_to_pad_values( + xla_client.PaddingType.VALID, input_array.shape, window_dimensions, + window_strides) + ops.ReduceWindowWithGeneralPadding( + operand=ops.Constant(c, input_array), + init_value=ops.Constant(c, dtype(0)), + computation=self._CreateBinaryAddComputation(dtype), + window_dimensions=window_dimensions, + window_strides=window_strides, + base_dilations=[], + window_dilations=[], + padding=padding) + self._ExecuteAndCompareClose(c, expected=[[[5., 9.]]]) + + @parameterized.named_parameters({ + "testcase_name": "_{}".format(dtype.__name__), + "dtype": dtype, + } for dtype in float_dtypes) + def testWhile(self, dtype): + + def LessThan10Cond(): + c = self._NewComputation("test_lt_10") + shape = xla_client.shape_from_pyval(np.array(0, dtype=dtype)) + ops.Lt(ops.Parameter(c, 0, shape), ops.Constant(c, dtype(10.))) + return c.build() + + cond = LessThan10Cond() + body = self._CreateMulBy2Computation(dtype) + c = self._NewComputation() + init = ops.Constant(c, dtype(1.)) + ops.While(cond, body, init) + self._ExecuteAndCompareClose(c, expected=[16.]) + + def testConditionalTrue(self): + c = self._NewComputation() + pred = ops.Constant(c, np.bool_(True)) + true_operand = ops.Constant(c, np.float32(3.)) + true_computation = self._CreateMulBy2Computation(np.float32) + false_operand = ops.Constant(c, np.float32(2.)) + false_computation = self._CreateConstantComputation( + np.float32, np.float32) + ops.Conditional(pred, true_operand, true_computation, false_operand, + false_computation) + self._ExecuteAndCompareClose(c, expected=[6.]) + + def testConditionalFalse(self): + c = self._NewComputation() + pred = ops.Constant(c, np.bool_(False)) + true_operand = ops.Constant(c, np.float32(3.)) + true_computation = self._CreateMulBy2Computation(np.float32) + false_operand = ops.Constant(c, np.float32(2.)) + false_computation = self._CreateConstantComputation( + np.float32, np.float32) + ops.Conditional(pred, true_operand, true_computation, false_operand, + false_computation) + self._ExecuteAndCompareClose(c, expected=[1.]) + + @unittest.skipIf(cloud_tpu, "not implemented") + def testInfeedS32Values(self): + to_infeed = NumpyArrayS32([1, 2, 3, 4]) + c = self._NewComputation() + ops.GetTupleElement( + ops.InfeedWithToken( + ops.CreateToken(c), + xla_client.shape_from_pyval( + to_infeed[0]).with_major_to_minor_layout_if_absent()), 0) + compiled_c = self.backend.compile(c.build()) + device = self.backend.local_devices()[0] + for item in to_infeed: + device.transfer_to_infeed(item) + + for item in to_infeed: + result, = xla_client.execute_with_python_values( + compiled_c, (), backend=self.backend) + self.assertEqual(result, item) + + @unittest.skipIf(cloud_tpu, "not implemented") + def testInfeedTuple(self): + to_infeed = (NumpyArrayS32([1, 2, 3, 4]), NumpyArrayS32([[7], [8]])) + c = self._NewComputation() + 
ops.GetTupleElement( + ops.InfeedWithToken( + ops.CreateToken(c), + xla_client.shape_from_pyval( + to_infeed).with_major_to_minor_layout_if_absent()), 0) + compiled_c = self.backend.compile(c.build()) + device = self.backend.local_devices()[0] + device.transfer_to_infeed(to_infeed) + + result = xla_client.execute_with_python_values( + compiled_c, (), backend=self.backend) + self.assertLen(result, 2) + np.testing.assert_equal(result[0], to_infeed[0]) + np.testing.assert_equal(result[1], to_infeed[1]) + + @unittest.skipIf(cloud_tpu, "not implemented") + def testInfeedThenOutfeedS32(self): + to_round_trip = NumpyArrayS32([1, 2, 3, 4]) + c = self._NewComputation() + x_and_token = ops.InfeedWithToken( + ops.CreateToken(c), + xla_client.shape_from_pyval( + to_round_trip[0]).with_major_to_minor_layout_if_absent()) + x = ops.GetTupleElement(x_and_token, 0) + token = ops.GetTupleElement(x_and_token, 1) + outfeed_shape = xla_client.shape_from_pyval( + to_round_trip[0]).with_major_to_minor_layout_if_absent() + ops.OutfeedWithToken(x, token, outfeed_shape) + + compiled_c = self.backend.compile(c.build()) + device = self.backend.local_devices()[0] + + for want in to_round_trip: + execution = threading.Thread(target=lambda: compiled_c.execute([])) + execution.start() + device.transfer_to_infeed(want) + got = device.transfer_from_outfeed(outfeed_shape) + execution.join() + self.assertEqual(want, got) + + def testScatter(self): + a = np.arange(9).astype(np.int32).reshape((3, 3)) + scatter_indices = np.array([0, 2], dtype=np.int32) + updates = np.array([[10, 20, 30], [70, 80, 90]], dtype=np.int32) + + dnums = xla_client.ScatterDimensionNumbers() + dnums.update_window_dims.append(1) + dnums.inserted_window_dims.append(0) + dnums.scatter_dims_to_operand_dims.append(0) + dnums.index_vector_dim = 1 + + c = self._NewComputation() + ops.Scatter( + ops.Constant(c, a), ops.Constant(c, scatter_indices), + ops.Constant(c, updates), self._CreateBinaryAddComputation(np.int32), + dnums) + expected = np.array([[10, 21, 32], [3, 4, 5], [76, 87, 98]], + dtype=np.int32) self._ExecuteAndCompareClose(c, expected=[expected]) - _TransposeAndTest(NumpyArrayF32([[1, 2, 3], [4, 5, 6]]), [0, 1]) - _TransposeAndTest(NumpyArrayF32([[1, 2, 3], [4, 5, 6]]), [1, 0]) - _TransposeAndTest(NumpyArrayF32([[1, 2], [4, 5]]), [0, 1]) - _TransposeAndTest(NumpyArrayF32([[1, 2], [4, 5]]), [1, 0]) + class ErrorTest(ComputationTest): - arr = np.random.RandomState(0).randn(2, 3, 4).astype(np.float32) - for permutation in itertools.permutations(range(arr.ndim)): - _TransposeAndTest(arr, permutation) - _TransposeAndTest(np.asfortranarray(arr), permutation) + def setUp(self): + super(ErrorTest, self).setUp() + self.f32_scalar_2 = NumpyArrayF32(2.0) + self.s32_scalar_2 = NumpyArrayS32(2) - def testEq(self): - c = self._NewComputation() - c.Eq( - c.Constant(NumpyArrayS32([1, 2, 3, 4])), - c.Constant(NumpyArrayS32([4, 2, 3, 1]))) - self._ExecuteAndCompareExact(c, expected=[[False, True, True, False]]) - - def testNe(self): - c = self._NewComputation() - c.Ne( - c.Constant(NumpyArrayS32([1, 2, 3, 4])), - c.Constant(NumpyArrayS32([4, 2, 3, 1]))) - self._ExecuteAndCompareExact(c, expected=[[True, False, False, True]]) - - c.Ne( - c.Constant(NumpyArrayF32([-2.0, 0.0, - float("nan"), - float("nan")])), - c.Constant(NumpyArrayF32([2.0, -0.0, 1.0, float("nan")]))) - self._ExecuteAndAssertWith( - np.testing.assert_allclose, c, (), expected=[[True, False, True, True]]) - - def testGt(self): - c = self._NewComputation() - c.Gt( - c.Constant(NumpyArrayS32([1, 2, 3, 
4, 9])), - c.Constant(NumpyArrayS32([1, 0, 2, 7, 12]))) - self._ExecuteAndCompareExact( - c, expected=[[False, True, True, False, False]]) - - def testGe(self): - c = self._NewComputation() - c.Ge( - c.Constant(NumpyArrayS32([1, 2, 3, 4, 9])), - c.Constant(NumpyArrayS32([1, 0, 2, 7, 12]))) - self._ExecuteAndCompareExact(c, expected=[[True, True, True, False, False]]) - - def testLt(self): - c = self._NewComputation() - c.Lt( - c.Constant(NumpyArrayS32([1, 2, 3, 4, 9])), - c.Constant(NumpyArrayS32([1, 0, 2, 7, 12]))) - self._ExecuteAndCompareExact( - c, expected=[[False, False, False, True, True]]) - - def testLe(self): - c = self._NewComputation() - c.Le( - c.Constant(NumpyArrayS32([1, 2, 3, 4, 9])), - c.Constant(NumpyArrayS32([1, 0, 2, 7, 12]))) - self._ExecuteAndCompareExact(c, expected=[[True, False, False, True, True]]) - - def testMax(self): - c = self._NewComputation() - c.Max( - c.Constant(NumpyArrayF32([1.0, 2.0, 3.0, 4.0, 9.0])), - c.Constant(NumpyArrayF32([1.0, 0.0, 2.0, 7.0, 12.0]))) - self._ExecuteAndCompareExact(c, expected=[[1.0, 2.0, 3.0, 7.0, 12.0]]) - - def testMaxExplicitBroadcastDim0(self): - c = self._NewComputation() - c.Max( - c.Constant(NumpyArrayF32([[1, 2, 3], [4, 5, 6], [7, 8, 9]])), - c.Constant(NumpyArrayF32([3, 4, 5])), - broadcast_dimensions=(0,)) - self._ExecuteAndCompareExact( - c, expected=[[[3, 3, 3], [4, 5, 6], [7, 8, 9]]]) - - def testMaxExplicitBroadcastDim1(self): - c = self._NewComputation() - c.Max( - c.Constant(NumpyArrayF32([[1, 2, 3], [4, 5, 6], [7, 8, 9]])), - c.Constant(NumpyArrayF32([3, 4, 5])), - broadcast_dimensions=(1,)) - self._ExecuteAndCompareExact( - c, expected=[[[3, 4, 5], [4, 5, 6], [7, 8, 9]]]) - - def testMin(self): - c = self._NewComputation() - c.Min( - c.Constant(NumpyArrayF32([1.0, 2.0, 3.0, 4.0, 9.0])), - c.Constant(NumpyArrayF32([1.0, 0.0, 2.0, 7.0, 12.0]))) - self._ExecuteAndCompareExact(c, expected=[[1.0, 0.0, 2.0, 4.0, 9.0]]) - - def testPad(self): - c = self._NewComputation() - c.Pad( - c.Constant(NumpyArrayF32([[1.0, 2.0], [3.0, 4.0]])), - c.Constant(NumpyArrayF32(0.0)), [(1, 2, 1), (0, 1, 0)]) - self._ExecuteAndCompareClose( - c, - expected=[[[0.0, 0.0, 0.0], [1.0, 2.0, 0.0], [0.0, 0.0, 0.0], - [3.0, 4.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]]) - - def testPadWithPaddingConfig(self): - c = self._NewComputation() - padding_config = xla_client.PaddingConfig() - for lo, hi, interior in [(1, 2, 1), (0, 1, 0)]: - dimension = xla_client.PaddingConfigDimension() - dimension.edge_padding_low = lo - dimension.edge_padding_high = hi - dimension.interior_padding = interior - padding_config.dimensions.append(dimension) - c.Pad( - c.Constant(NumpyArrayF32([[1.0, 2.0], [3.0, 4.0]])), - c.Constant(NumpyArrayF32(0.0)), padding_config) - self._ExecuteAndCompareClose( - c, - expected=[[[0.0, 0.0, 0.0], [1.0, 2.0, 0.0], [0.0, 0.0, 0.0], - [3.0, 4.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]]) - - def testReshape(self): - c = self._NewComputation() - c.Reshape( - c.Constant(NumpyArrayS32([[1, 2], [3, 4], [5, 6]])), - dimensions=[0, 1], - new_sizes=[2, 3]) - self._ExecuteAndCompareExact(c, expected=[[[1, 2, 3], [4, 5, 6]]]) - - def testCollapse(self): - c = self._NewComputation() - c.Collapse( - c.Constant(NumpyArrayS32([[[1, 2], [3, 4]], [[5, 6], [7, 8]]])), - dimensions=[1, 2]) - self._ExecuteAndCompareExact(c, expected=[[[1, 2, 3, 4], [5, 6, 7, 8]]]) - - def testRev(self): - c = self._NewComputation() - c.Rev( - c.Constant(NumpyArrayS32([[[1, 2], [3, 4]], [[5, 6], [7, 8]]])), - dimensions=[0, 2]) - self._ExecuteAndCompareExact( - c, 
expected=[[[[6, 5], [8, 7]], [[2, 1], [4, 3]]]]) - - def testReducePrecision(self): - c = self._NewComputation() - c.ReducePrecision( - c.Constant(NumpyArrayF32([float.fromhex("0x1.32fffep-3")])), - exponent_bits=8, - mantissa_bits=7) - self._ExecuteAndCompareClose(c, expected=[[float.fromhex("0x1.32p-3")]]) - - def testClampF32(self): - c = self._NewComputation() - c.Clamp( - c.Constant(NumpyArrayF32(-1)), - c.Constant(NumpyArrayF32([-2, -1, 0, 1, 2, 3])), - c.Constant(NumpyArrayF32(2))) - self._ExecuteAndCompareExact(c, expected=[[-1, -1, 0, 1, 2, 2]]) - - def testClampS32(self): - c = self._NewComputation() - c.Clamp( - c.Constant(NumpyArrayS32(-1)), - c.Constant(NumpyArrayS32([-2, -1, 0, 1, 2, 3])), - c.Constant(NumpyArrayS32(2))) - self._ExecuteAndCompareExact(c, expected=[[-1, -1, 0, 1, 2, 2]]) - - def testSelect(self): - c = self._NewComputation() - c.Select( - c.Constant(NumpyArrayBool([True, False, False, True, False])), - c.Constant(NumpyArrayS32([1, 2, 3, 4, 5])), - c.Constant(NumpyArrayS32([-1, -2, -3, -4, -5]))) - self._ExecuteAndCompareExact(c, expected=[[1, -2, -3, 4, -5]]) - - def testSlice(self): - c = self._NewComputation() - c.Slice( - c.Constant(NumpyArrayS32([[1, 2, 3], [4, 5, 6], [7, 8, 9]])), [1, 0], - [3, 2]) - self._ExecuteAndCompareExact(c, expected=[[[4, 5], [7, 8]]]) - - def testSliceInDim(self): - c = self._NewComputation() - c.SliceInDim( - c.Constant(NumpyArrayS32([[1, 2, 3], [4, 5, 6], [7, 8, 9]])), - start_index=1, - limit_index=2, - stride=1, - dimno=1) - self._ExecuteAndCompareExact(c, expected=[[[2], [5], [8]]]) - c.SliceInDim( - c.Constant(NumpyArrayS32([[1, 2, 3], [4, 5, 6], [7, 8, 9]])), - start_index=0, - limit_index=3, - stride=2, - dimno=0) - self._ExecuteAndCompareExact(c, expected=[[[1, 2, 3], [7, 8, 9]]]) - - def testDynamicSlice(self): - c = self._NewComputation() - c.DynamicSlice( - c.Constant(NumpyArrayS32([[1, 2, 3], [4, 5, 6], [7, 8, 9]])), - c.Constant(NumpyArrayS32([1, 0])), [2, 2]) - self._ExecuteAndCompareExact(c, expected=[[[4, 5], [7, 8]]]) - - def testDynamicUpdateSlice(self): - c = self._NewComputation() - c.DynamicUpdateSlice( - c.Constant(NumpyArrayS32([[1, 2, 3], [4, 5, 6], [7, 8, 9]])), - c.Constant(NumpyArrayS32([[1, 2], [3, 4]])), - c.Constant(NumpyArrayS32([1, 1]))) - self._ExecuteAndCompareExact( - c, expected=[[[1, 2, 3], [4, 1, 2], [7, 3, 4]]]) - - def testTuple(self): - c = self._NewComputation() - c.Tuple( - c.ConstantS32Scalar(42), c.Constant(NumpyArrayF32([1.0, 2.0])), - c.Constant(NumpyArrayBool([True, False, False, True]))) - result = xla_client.execute_with_python_values(c.Build().Compile()) - self.assertLen(result, 3) - np.testing.assert_equal(result[0], 42) - np.testing.assert_allclose(result[1], [1.0, 2.0]) - np.testing.assert_equal(result[2], [True, False, False, True]) - - def testGetTupleElement(self): - c = self._NewComputation() - c.GetTupleElement( - c.Tuple( - c.ConstantS32Scalar(42), c.Constant(NumpyArrayF32([1.0, 2.0])), - c.Constant(NumpyArrayBool([True, False, False, True]))), 1) - self._ExecuteAndCompareClose(c, expected=[[1.0, 2.0]]) - - def testBroadcast(self): - c = self._NewComputation() - c.Broadcast(c.Constant(NumpyArrayS32([10, 20, 30, 40])), sizes=(3,)) - self._ExecuteAndCompareExact( - c, expected=[[[10, 20, 30, 40], [10, 20, 30, 40], [10, 20, 30, 40]]]) - - def testBroadcastInDim(self): - c = self._NewComputation() - c.BroadcastInDim(c.Constant(NumpyArrayS32([1, 2])), [2, 2], [0]) - self._ExecuteAndCompareExact(c, expected=[[[1, 1], [2, 2]]]) - c.BroadcastInDim(c.Constant(NumpyArrayS32([1, 
2])), [2, 2], [1]) - self._ExecuteAndCompareExact(c, expected=[[[1, 2], [1, 2]]]) - - def testRngNormal(self): - shape = (2, 3) - c = self._NewComputation() - c.RngNormal( - c.Constant(NumpyArrayF32(0.)), - c.Constant(NumpyArrayF32(1.)), - dims=shape) - result = xla_client.execute_with_python_values(c.Build().Compile()) - # since the result is random, we just check shape and uniqueness - self.assertLen(result, 1) - self.assertEqual(result[0].shape, shape) - self.assertLen(np.unique(result[0]), np.prod(shape)) - - def testRngUniformF32(self): - lo, hi = 2., 4. - shape = (2, 3) - c = self._NewComputation() - c.RngUniform( - c.Constant(NumpyArrayF32(lo)), - c.Constant(NumpyArrayF32(hi)), - dims=shape) - result = xla_client.execute_with_python_values(c.Build().Compile()) - # since the result is random, we just check shape, uniqueness, and range - self.assertLen(result, 1) - self.assertEqual(result[0].shape, shape) - self.assertLen(np.unique(result[0]), np.prod(shape)) - self.assertTrue(np.all(lo <= result[0])) - self.assertTrue(np.all(result[0] < hi)) - - def testRngUniformS32(self): - lo, hi = 2, 4 - shape = (2, 3) - c = self._NewComputation() - c.RngUniform( - c.Constant(NumpyArrayS32(lo)), - c.Constant(NumpyArrayS32(hi)), - dims=shape) - result = xla_client.execute_with_python_values(c.Build().Compile()) - # since the result is random, we just check shape, integrality, and range - self.assertLen(result, 1) - self.assertEqual(result[0].shape, shape) - self.assertEqual(result[0].dtype, np.int32) - self.assertTrue(np.all(lo <= result[0])) - self.assertTrue(np.all(result[0] < hi)) - - def testCholesky(self): - l = np.array([[4, 0, 0, 0], [6, 5, 0, 0], [2, 14, 16, 0], [3, 6, 1, 4]], - dtype=np.float32) - c = self._NewComputation() - c.Cholesky(c.Constant(np.dot(l, l.T))) - self._ExecuteAndCompareClose(c, expected=[l], rtol=1e-4) - - def testSort(self): - keys = np.array([[2, 4, 1, 3], [3, 1, 4, 2]], dtype=np.float32) - c = self._NewComputation() - c.Sort(c.Constant(keys)) - self._ExecuteAndCompareClose( - c, expected=[np.array([[1, 2, 3, 4], [1, 2, 3, 4]], dtype=np.float32)]) - - def testSortKeyVal(self): - keys = np.array([[2, 4, 1, 3], [3, 1, 4, 2]], dtype=np.float32) - values = np.array([[0, 1, 2, 3], [4, 5, 6, 7]], dtype=np.int32) - c = self._NewComputation() - c.Sort((c.Constant(keys), c.Constant(values)), dimension=0) - result = xla_client.execute_with_python_values(c.Build().Compile()) - self.assertLen(result, 2) - np.testing.assert_allclose(result[0], [[2, 1, 1, 2], [3, 4, 4, 3]]) - np.testing.assert_equal(result[1], [[0, 5, 2, 7], [4, 1, 6, 3]]) - - def testSortCustomComparator(self): - b = self._NewComputation("comparator") - p0 = b.ParameterFromNumpy(NumpyArrayF32(0)) - q0 = b.ParameterFromNumpy(NumpyArrayF32(0)) - p1 = b.ParameterFromNumpy(NumpyArrayS32(0)) - q1 = b.ParameterFromNumpy(NumpyArrayS32(0)) - b.Or(b.Lt(p0, q0), b.And(b.Eq(p0, q0), b.Gt(p1, q1))) - comparator = b.Build() - - keys = np.array([[2, 3, 1, 3], [3, 1, 2, 2]], dtype=np.float32) - values = np.array([[0, 1, 2, 3], [4, 5, 6, 7]], dtype=np.int32) - c = self._NewComputation() - c.Sort((c.Constant(keys), c.Constant(values)), - dimension=1, - comparator=comparator) - result = xla_client.execute_with_python_values(c.Build().Compile()) - self.assertLen(result, 2) - np.testing.assert_allclose(result[0], [[1, 2, 3, 3], [1, 2, 2, 3]]) - np.testing.assert_equal(result[1], [[2, 0, 3, 1], [5, 7, 6, 4]]) - - def testQR(self): - a = np.array( - [[4, 6, 8, 10], [6, 45, 54, 63], [8, 54, 146, 166], [10, 63, 166, 310]], - 
dtype=np.float32) - c = self._NewComputation() - c.QR(c.Constant(a), full_matrices=True) - q, r = self._Execute(c, ()) - np.testing.assert_allclose(np.dot(q, r), a, rtol=1e-4) - - def testEigh(self): - a = np.array( - [[4, 6, 8, 10], [6, 45, 54, 63], [8, 54, 146, 166], [10, 63, 166, 310]], - dtype=np.float32) - a = (a + a.T) / 2 - - c = self._NewComputation() - c.Eigh(c.Constant(a), full_matrices=True) - # TODO(b/129396575): Turn this test back on when it passes without fastmath. - # v, w = self._Execute(c, ()) - # self.assertLess(np.linalg.norm(np.dot(a, v) - w * v), 1e-3) - - def testSVD(self): - a = np.array( - [[4, 6, 8, 10], [6, 45, 54, 63], [8, 54, 146, 166], [10, 63, 166, 310]], - dtype=np.float32) - c = self._NewComputation() - c.SVD(c.Constant(a)) - u, d, v = self._Execute(c, ()) - self.assertLess(np.linalg.norm(a - np.matmul(u * d, v.T)), 1e-3) - - def testTriangularSolve(self): - a_vals = np.array( - [[2, 0, 0, 0], [3, 6, 0, 0], [4, 7, 9, 0], [5, 8, 10, 11]], - dtype=np.float32) - b_vals = np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]], - dtype=np.float32) - - c = self._NewComputation() - c.TriangularSolve( - c.Constant(a_vals), - c.Constant(b_vals), - left_side=False, - lower=True, - transpose_a=True) - self._ExecuteAndCompareClose( - c, - expected=[ - np.array([ - [0.5, 0.08333334, 0.04629629, 0.03367003], - [2.5, -0.25, -0.1388889, -0.1010101], - [4.5, -0.58333331, -0.32407406, -0.23569024], - ], - dtype=np.float32) - ], - rtol=1e-4) - - def testIsConstant(self): - c = self._NewComputation() - a = c.ConstantS32Scalar(3) - b = c.ConstantS32Scalar(1) - x = c.ParameterFromNumpy(NumpyArrayS32(0)) - const_expr = c.Sub(b, a) - non_const_expr = c.Mul(const_expr, x) - self.assertTrue(c.IsConstant(const_expr)) - self.assertFalse(c.IsConstant(non_const_expr)) - # self.assertTrue(c.IsConstant(c.Sub(c.Add(x, a), x))) # TODO(b/77245564) - - def testGather(self): - a = np.arange(9).astype(np.int32).reshape((3, 3)) - indices = np.array([[[0, 2], [2, 1]], [[1, 2], [2, 0]]], dtype=np.int32) - dnums = xla_client.GatherDimensionNumbers() - dnums.offset_dims.append(1) - dnums.offset_dims.append(2) - dnums.start_index_map.append(0) - dnums.start_index_map.append(1) - dnums.index_vector_dim = 2 - c = self._NewComputation() - c.Gather(c.Constant(a), c.Constant(indices), dnums, slice_sizes=[1, 1]) - g, = self._Execute(c, ()) - expected = np.array([[[[2, 7]]], [[[5, 6]]]], dtype=np.int32) - np.testing.assert_allclose(g, expected, rtol=1e-4) - - def testFft(self): - shape = [2, 3, 4, 5] - rng = np.random.RandomState(0) - a = rng.randn(*shape) + 1.0j * rng.randn(*shape) - a = a.astype(np.complex64) - # FFT - c = self._NewComputation() - c.Fft(c.Constant(a), xla_client.FftType.FFT, shape[-3:]) - self._ExecuteAndCompareClose( - c, expected=[np.fft.fftn(a, axes=(1, 2, 3))], rtol=1e-4) - # IFFT - c = self._NewComputation() - c.Fft(c.Constant(a), xla_client.FftType.IFFT, shape[-3:]) - self._ExecuteAndCompareClose( - c, expected=[np.fft.ifftn(a, axes=(1, 2, 3))], rtol=1e-4) - # RFFT - b = rng.randn(*shape).astype(np.float32) - c = self._NewComputation() - c.Fft(c.Constant(b), xla_client.FftType.RFFT, shape[-3:]) - self._ExecuteAndCompareClose( - c, expected=[np.fft.rfftn(b, axes=(1, 2, 3))], rtol=1e-4) - # IRFFT - c = self._NewComputation() - c.Fft(c.Constant(a), xla_client.FftType.IRFFT, [3, 4, 8]) - self._ExecuteAndCompareClose( - c, expected=[np.fft.irfftn(a, axes=(1, 2, 3))], rtol=1e-4) - - def testNextAfter(self): - c = self._NewComputation() - c.NextAfter( - c.Constant(np.array([1, 2], 
dtype=np.float32)), - c.Constant(np.array([2, 1], dtype=np.float32))) - out, = self._Execute(c, ()) - eps = np.finfo(np.float32).eps - np.testing.assert_equal(np.array([eps + 1, 2 - eps], dtype=np.float32), out) - - def testRegularizedIncompleteBeta(self): - x = np.array([0.53787335, 0.24015466, 0.47494545, 0.13567594, 0.95114538]) - a = np.array([0.00753073, 0.34813385, 0.30485708, 1.29298632, 0.51472606]) - b = np.array([0.55688389, 0.59794214, 0.42661022, 1.59748339, 0.95047677]) - c = self._NewComputation() - c.RegularizedIncompleteBeta(c.Constant(a), c.Constant(b), c.Constant(x)) - expected = np.array( - [0.98923271, 0.48575411, 0.57952568, 0.12579775, 0.96989155]) - self._ExecuteAndCompareClose(c, expected=[expected], rtol=1e-4) - - -class EmbeddedComputationsTest(ComputationTest): - """Tests for XLA graphs with embedded computations (such as maps).""" - - def _CreateConstantS32Computation(self): - """Computation (f32) -> s32 that returns a constant 1 for any input.""" - c = self._NewComputation("constant_s32_one") - # TODO(eliben): consider adding a nicer way to create new parameters without - # having to create dummy Numpy arrays or populating Shape messages. Perhaps - # we need our own (Python-client-own) way to represent Shapes conveniently. - c.ParameterFromNumpy(NumpyArrayF32(0)) - c.ConstantS32Scalar(1) - return c.Build() - - def _CreateConstantS64Computation(self): - """Computation (f64) -> s64 that returns a constant 1 for any input.""" - c = self._NewComputation("constant_s64_one") - # TODO(eliben): consider adding a nicer way to create new parameters without - # having to create dummy Numpy arrays or populating Shape messages. Perhaps - # we need our own (Python-client-own) way to represent Shapes conveniently. - c.ParameterFromNumpy(NumpyArrayF64(0)) - c.ConstantS64Scalar(1) - return c.Build() - - def _CreateConstantF32Computation(self): - """Computation (f32) -> f32 that returns a constant 1.0 for any input.""" - c = self._NewComputation("constant_f32_one") - c.ParameterFromNumpy(NumpyArrayF32(0)) - c.ConstantF32Scalar(1.0) - return c.Build() - - def _CreateConstantF64Computation(self): - """Computation (f64) -> f64 that returns a constant 1.0 for any input.""" - c = self._NewComputation("constant_f64_one") - c.ParameterFromNumpy(NumpyArrayF64(0)) - c.ConstantF64Scalar(1.0) - return c.Build() - - def _CreateMulF32By2Computation(self): - """Computation (f32) -> f32 that multiplies its parameter by 2.""" - c = self._NewComputation("mul_f32_by2") - c.Mul(c.ParameterFromNumpy(NumpyArrayF32(0)), c.ConstantF32Scalar(2.0)) - return c.Build() - - def _CreateMulF32ByParamComputation(self): - """Computation (f32) -> f32 that multiplies one parameter by the other.""" - c = self._NewComputation("mul_f32_by_param") - c.Mul( - c.ParameterFromNumpy(NumpyArrayF32(0)), - c.ParameterFromNumpy(NumpyArrayF32(0))) - return c.Build() - - def _CreateMulF64By2Computation(self): - """Computation (f64) -> f64 that multiplies its parameter by 2.""" - c = self._NewComputation("mul_f64_by2") - c.Mul(c.ParameterFromNumpy(NumpyArrayF64(0)), c.ConstantF64Scalar(2.0)) - return c.Build() - - def _CreateBinaryAddS32Computation(self): - """Computation (s32, s32) -> s32 that adds its two parameters.""" - c = self._NewComputation("add_param0_by_param1") - c.Add( - c.ParameterFromNumpy(NumpyArrayS32(0)), - c.ParameterFromNumpy(NumpyArrayS32(0))) - return c.Build() - - def _CreateBinaryAddF32Computation(self): - """Computation (f32, f32) -> f32 that adds its two parameters.""" - c = 
self._NewComputation("add_param0_by_param1") - c.Add( - c.ParameterFromNumpy(NumpyArrayF32(0)), - c.ParameterFromNumpy(NumpyArrayF32(0))) - return c.Build() - - def _CreateBinaryAddF64Computation(self): - """Computation (f64, f64) -> f64 that adds its two parameters.""" - c = self._NewComputation("add_param0_by_param1") - c.Add( - c.ParameterFromNumpy(NumpyArrayF64(0)), - c.ParameterFromNumpy(NumpyArrayF64(0))) - return c.Build() - - def _CreateBinaryDivF32Computation(self): - """Computation (f32, f32) -> f32 that divides its two parameters.""" - c = self._NewComputation("div_param0_by_param1") - c.Div( - c.ParameterFromNumpy(NumpyArrayF32(0)), - c.ParameterFromNumpy(NumpyArrayF32(0))) - return c.Build() - - def _CreateBinaryDivF64Computation(self): - """Computation (f64, f64) -> f64 that divides its two parameters.""" - c = self._NewComputation("div_param0_by_param1") - c.Div( - c.ParameterFromNumpy(NumpyArrayF64(0)), - c.ParameterFromNumpy(NumpyArrayF64(0))) - return c.Build() - - def _CreateTestF32Lt10Computation(self): - """Computation (f32) -> bool that tests if its parameter is less than 10.""" - c = self._NewComputation("test_f32_lt_10") - c.Lt(c.ParameterFromNumpy(NumpyArrayF32(0)), c.ConstantF32Scalar(10.)) - return c.Build() - - def _CreateTestF64Lt10Computation(self): - """Computation (f64) -> bool that tests if its parameter is less than 10.""" - c = self._NewComputation("test_f64_lt_10") - c.Lt(c.ParameterFromNumpy(NumpyArrayF64(0)), c.ConstantF64Scalar(10.)) - return c.Build() - - def _CreateBinaryGeF32Computation(self): - """Computation (f32, f32) -> bool that tests first_param >= second_param.""" - c = self._NewComputation("param0_lt_param1") - c.Ge( - c.ParameterFromNumpy(NumpyArrayF32(0)), - c.ParameterFromNumpy(NumpyArrayF32(0))) - return c.Build() - - def _CreateBinaryGeF64Computation(self): - """Computation (f64, f64) -> bool that tests first_param >= second_param.""" - c = self._NewComputation("param0_lt_param1") - c.Ge( - c.ParameterFromNumpy(NumpyArrayF64(0)), - c.ParameterFromNumpy(NumpyArrayF64(0))) - return c.Build() - - def _MakeSample3DArrayF32(self): - return NumpyArrayF32([[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]], - [[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]]) - - def _MakeSample3DArrayF64(self): - return NumpyArrayF64([[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]], - [[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]]) - - def testCallF32(self): - c = self._NewComputation() - c.Call( - self._CreateMulF32By2Computation(), - operands=(c.ConstantF32Scalar(5.0),)) - self._ExecuteAndCompareClose(c, expected=[10.0]) - - def testCallF64(self): - c = self._NewComputation() - c.Call( - self._CreateMulF64By2Computation(), - operands=(c.ConstantF64Scalar(5.0),)) - self._ExecuteAndCompareClose(c, expected=[10.0]) - - def testMapEachElementToS32Constant(self): - c = self._NewComputation() - c.Map([c.Constant(NumpyArrayF32([1.0, 2.0, 3.0, 4.0]))], - self._CreateConstantS32Computation(), [0]) - self._ExecuteAndCompareExact(c, expected=[[1, 1, 1, 1]]) - - def testMapEachElementToS64Constant(self): - c = self._NewComputation() - c.Map([c.Constant(NumpyArrayF64([1.0, 2.0, 3.0, 4.0]))], - self._CreateConstantS64Computation(), [0]) - self._ExecuteAndCompareExact(c, expected=[[1, 1, 1, 1]]) - - def testMapMulBy2F32(self): - c = self._NewComputation() - c.Map([c.Constant(NumpyArrayF32([1.0, 2.0, 3.0, 4.0]))], - self._CreateMulF32By2Computation(), [0]) - self._ExecuteAndCompareClose(c, expected=[[2.0, 4.0, 6.0, 8.0]]) - - def testMapMulBy2F64(self): - c = 
self._NewComputation() - c.Map([c.Constant(NumpyArrayF64([1.0, 2.0, 3.0, 4.0]))], - self._CreateMulF64By2Computation(), [0]) - self._ExecuteAndCompareClose(c, expected=[[2.0, 4.0, 6.0, 8.0]]) - - def testSimpleMapChainF32(self): - # Chains a map of constant-f32 with a map of mul-by-2 - c = self._NewComputation() - const_f32 = c.Map([c.Constant(NumpyArrayF32([1.0, 2.0, 3.0, 4.0]))], - self._CreateConstantF32Computation(), [0]) - c.Map([const_f32], self._CreateMulF32By2Computation(), [0]) - self._ExecuteAndCompareClose(c, expected=[[2.0, 2.0, 2.0, 2.0]]) - - def testSimpleMapChainF64(self): - # Chains a map of constant-f64 with a map of mul-by-2 - c = self._NewComputation() - const_f64 = c.Map([c.Constant(NumpyArrayF64([1.0, 2.0, 3.0, 4.0]))], - self._CreateConstantF64Computation(), [0]) - c.Map([const_f64], self._CreateMulF64By2Computation(), [0]) - self._ExecuteAndCompareClose(c, expected=[[2.0, 2.0, 2.0, 2.0]]) - - def testDivVectorsWithMapF32(self): - c = self._NewComputation() - c.Map((c.Constant(NumpyArrayF32([1.0, 2.0, 3.0, 4.0])), - c.Constant(NumpyArrayF32([5.0, 5.0, 4.0, 4.0]))), - self._CreateBinaryDivF32Computation(), [0]) - self._ExecuteAndCompareClose(c, expected=[[0.2, 0.4, 0.75, 1.0]]) - - def testDivVectorsWithMapF64(self): - c = self._NewComputation() - c.Map((c.Constant(NumpyArrayF64([1.0, 2.0, 3.0, 4.0])), - c.Constant(NumpyArrayF64([5.0, 5.0, 4.0, 4.0]))), - self._CreateBinaryDivF64Computation(), [0]) - self._ExecuteAndCompareClose(c, expected=[[0.2, 0.4, 0.75, 1.0]]) - - def testSelectAndScatterF32(self): - c = self._NewComputation() - c.SelectAndScatter( - c.Constant(NumpyArrayF32([[1., 2., 6.], [4., 5., 3.]])), - select=self._CreateBinaryGeF32Computation(), - window_dimensions=(2, 1), - window_strides=(1, 2), - padding=xla_client.PaddingType.VALID, - source=c.Constant(NumpyArrayF32([[0.1, 0.2]])), - init_value=c.Constant(NumpyArrayF32(1)), - scatter=self._CreateBinaryAddF32Computation()) - self._ExecuteAndCompareClose(c, expected=[[[1., 1., 1.2], [1.1, 1., 1.]]]) - - def testSelectAndScatterF64(self): - c = self._NewComputation() - c.SelectAndScatter( - c.Constant(NumpyArrayF64([[1., 2., 6.], [4., 5., 3.]])), - select=self._CreateBinaryGeF64Computation(), - window_dimensions=(2, 1), - window_strides=(1, 2), - padding=xla_client.PaddingType.VALID, - source=c.Constant(NumpyArrayF64([[0.1, 0.2]])), - init_value=c.Constant(NumpyArrayF64(1)), - scatter=self._CreateBinaryAddF64Computation()) - self._ExecuteAndCompareClose(c, expected=[[[1., 1., 1.2], [1.1, 1., 1.]]]) - - def testReduce1DtoScalarF32(self): - c = self._NewComputation() - c.Reduce( - operand=c.Constant(NumpyArrayF32([1.0, 2.0, 3.0, 4.0])), - init_value=c.ConstantF32Scalar(0), - computation_to_apply=self._CreateBinaryAddF32Computation(), - dimensions=[0]) - self._ExecuteAndCompareClose(c, expected=[10]) - - def testReduce1DtoScalarF64(self): - c = self._NewComputation() - c.Reduce( - operand=c.Constant(NumpyArrayF64([1.0, 2.0, 3.0, 4.0])), - init_value=c.ConstantF64Scalar(0), - computation_to_apply=self._CreateBinaryAddF64Computation(), - dimensions=[0]) - self._ExecuteAndCompareClose(c, expected=[10]) - - def testReduce2DTo1DDim0F32(self): - input_array = NumpyArrayF32([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]) - c = self._NewComputation() - c.Reduce( - operand=c.Constant(input_array), - init_value=c.ConstantF32Scalar(0), - computation_to_apply=self._CreateBinaryAddF32Computation(), - dimensions=[0]) - self._ExecuteAndCompareClose(c, expected=[[5, 7, 9]]) - - def testReduce2DTo1DDim0F64(self): - input_array = 
NumpyArrayF64([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]) - c = self._NewComputation() - c.Reduce( - operand=c.Constant(input_array), - init_value=c.ConstantF64Scalar(0), - computation_to_apply=self._CreateBinaryAddF64Computation(), - dimensions=[0]) - self._ExecuteAndCompareClose(c, expected=[[5, 7, 9]]) - - def testReduce2DTo1DDim1F32(self): - input_array = NumpyArrayF32([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]) - c = self._NewComputation() - c.Reduce( - operand=c.Constant(input_array), - init_value=c.ConstantF32Scalar(0), - computation_to_apply=self._CreateBinaryAddF32Computation(), - dimensions=[1]) - self._ExecuteAndCompareClose(c, expected=[[6, 15]]) - - def testReduce2DTo1DDim1F64(self): - input_array = NumpyArrayF64([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]) - c = self._NewComputation() - c.Reduce( - operand=c.Constant(input_array), - init_value=c.ConstantF64Scalar(0), - computation_to_apply=self._CreateBinaryAddF64Computation(), - dimensions=[1]) - self._ExecuteAndCompareClose(c, expected=[[6, 15]]) - - def testReduce3DAllPossibleWaysF32(self): - input_array = self._MakeSample3DArrayF32() - - def _ReduceAndTest(*dims): + def testCompileWithWrongElementTypeInLayout(self): c = self._NewComputation() - c.Reduce( - operand=c.Constant(input_array), - init_value=c.ConstantF32Scalar(0), - computation_to_apply=self._CreateBinaryAddF32Computation(), - dimensions=dims) - self._ExecuteAndCompareClose( - c, expected=[np.sum(input_array, axis=tuple(dims))]) + c.set_op_metadata(xla_client.CurrentSourceInfoMetadata()) + ops.Parameter(c, 0, xla_client.shape_from_pyval(self.s32_scalar_2)) + c.clear_op_metadata() - _ReduceAndTest(0) - _ReduceAndTest(0, 1) - _ReduceAndTest(0, 2) - _ReduceAndTest(1, 2) - _ReduceAndTest(0, 1, 2) + options = xla_client.CompileOptions() + options.argument_layouts = [ + xla_client.Shape.array_shape(np.dtype(np.float32), []) + ] - def testReduce3DAllPossibleWaysF64(self): - input_array = self._MakeSample3DArrayF64() + def TestFun(): + return self.backend.compile(c.build(), compile_options=options) - def _ReduceAndTest(*dims): + self.assertRaisesRegex( + RuntimeError, r".*Invalid argument shape.*" + r"expected s32\[\], got f32\[\].*", TestFun) + + def testInvokeWithWrongElementType(self): c = self._NewComputation() - c.Reduce( - operand=c.Constant(input_array), - init_value=c.ConstantF64Scalar(0), - computation_to_apply=self._CreateBinaryAddF64Computation(), - dimensions=dims) - self._ExecuteAndCompareClose( - c, expected=[np.sum(input_array, axis=tuple(dims))]) + c.set_op_metadata(xla_client.CurrentSourceInfoMetadata()) + ops.Parameter(c, 0, xla_client.shape_from_pyval(self.s32_scalar_2)) + c.clear_op_metadata() - _ReduceAndTest(0) - _ReduceAndTest(0) - _ReduceAndTest(0, 1) - _ReduceAndTest(0, 2) - _ReduceAndTest(1, 2) - _ReduceAndTest(0, 1, 2) + def TestFun(): + return xla_client.execute_with_python_values( + self.backend.compile(c.build()), [self.f32_scalar_2], self.backend) - def testReduceWindowValidUnitStridesF32(self): - input_array = NumpyArrayF32([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]) - c = self._NewComputation() - c.ReduceWindow( - operand=c.Constant(input_array), - init_value=c.ConstantF32Scalar(0), - computation_to_apply=self._CreateBinaryAddF32Computation(), - window_dimensions=(2, 1), - window_strides=(1, 1), - padding=xla_client.PaddingType.VALID) - self._ExecuteAndCompareClose(c, expected=[[[5., 7., 9.]]]) + self.assertRaisesRegex( + RuntimeError, r"Invalid argument: Argument does not match.*" + r"want s32\[\], got f32\[\].*", TestFun) - def 
testReduceWindowSameUnitStridesF32(self): - input_array = NumpyArrayF32([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]) - c = self._NewComputation() - c.ReduceWindow( - operand=c.Constant(input_array), - init_value=c.ConstantF32Scalar(0), - computation_to_apply=self._CreateBinaryAddF32Computation(), - window_dimensions=(2, 1), - window_strides=(1, 1), - padding=xla_client.PaddingType.SAME) - self._ExecuteAndCompareClose(c, expected=[[[5., 7., 9.], [4., 5., 6.]]]) + tests.append(EmbeddedComputationsTest) - def testReduceWindowValidGeneralStridesF32(self): - input_array = NumpyArrayF32([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]) - c = self._NewComputation() - c.ReduceWindow( - operand=c.Constant(input_array), - init_value=c.ConstantF32Scalar(0), - computation_to_apply=self._CreateBinaryAddF32Computation(), - window_dimensions=(2, 1), - window_strides=(1, 2), - padding=xla_client.PaddingType.VALID) - self._ExecuteAndCompareClose(c, expected=[[[5., 9.]]]) + class ComputationRootTest(ComputationTest): + """Tests related to setting the root of the computation.""" - def testReduceWindowValidUnitStridesF64(self): - input_array = NumpyArrayF64([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]) - c = self._NewComputation() - c.ReduceWindow( - operand=c.Constant(input_array), - init_value=c.ConstantF64Scalar(0), - computation_to_apply=self._CreateBinaryAddF64Computation(), - window_dimensions=(2, 1), - window_strides=(1, 1), - padding=xla_client.PaddingType.VALID) - self._ExecuteAndCompareClose(c, expected=[[[5., 7., 9.]]]) + def testComputationRootDifferentFromLastOp(self): + c = self._NewComputation() + x = ops.Parameter(c, 0, xla_client.shape_from_pyval(NumpyArrayF32(2.0))) + result = ops.Add(x, ops.Constant(c, np.float32(3.14))) + ops.Add(result, ops.Constant(c, np.float32(1.618))) - def testReduceWindowSameUnitStridesF64(self): - input_array = NumpyArrayF64([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]) - c = self._NewComputation() - c.ReduceWindow( - operand=c.Constant(input_array), - init_value=c.ConstantF64Scalar(0), - computation_to_apply=self._CreateBinaryAddF64Computation(), - window_dimensions=(2, 1), - window_strides=(1, 1), - padding=xla_client.PaddingType.SAME) - self._ExecuteAndCompareClose(c, expected=[[[5., 7., 9.], [4., 5., 6.]]]) + arg = NumpyArrayF32(1.0) + compiled_c = self.backend.compile(c.build(result)) + ans, = xla_client.execute_with_python_values( + compiled_c, [arg], backend=self.backend) + np.testing.assert_allclose(ans, 4.14) - def testReduceWindowValidGeneralStridesF64(self): - input_array = NumpyArrayF64([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]) - c = self._NewComputation() - c.ReduceWindow( - operand=c.Constant(input_array), - init_value=c.ConstantF64Scalar(0), - computation_to_apply=self._CreateBinaryAddF64Computation(), - window_dimensions=(2, 1), - window_strides=(1, 2), - padding=xla_client.PaddingType.VALID) - self._ExecuteAndCompareClose(c, expected=[[[5., 9.]]]) + tests.append(ComputationRootTest) - def testWhileF32(self): - cond = self._CreateTestF32Lt10Computation() - body = self._CreateMulF32By2Computation() - c = self._NewComputation() - init = c.ConstantF32Scalar(1.) - c.While(cond, body, init) - self._ExecuteAndCompareClose(c, expected=[16.]) + class SetShardingTest(ComputationTest): + """Tests related to set OpSharding.""" - def testWhileF64(self): - cond = self._CreateTestF64Lt10Computation() - body = self._CreateMulF64By2Computation() - c = self._NewComputation() - init = c.ConstantF64Scalar(1.) 
- c.While(cond, body, init) - self._ExecuteAndCompareClose(c, expected=[16.]) + def testSetSharding(self): + c = self._NewComputation() + sharding = xla_client.OpSharding() + sharding.type = sharding.type.REPLICATED + sharding.tile_assignment_dimensions.extend([1]) + sharding.tile_assignment_devices.extend([0]) + c.set_sharding(sharding) + x = ops.Parameter(c, 0, xla_client.shape_from_pyval(NumpyArrayF32(2.0))) + c.clear_sharding() - def testConditionalTrue(self): - c = self._NewComputation() - pred = c.ConstantPredScalar(True) - true_operand = c.ConstantF32Scalar(3.) - true_computation = self._CreateMulF32By2Computation() - false_operand = c.ConstantF32Scalar(2.) - false_computation = self._CreateConstantF32Computation() - c.Conditional(pred, true_operand, true_computation, false_operand, - false_computation) - self._ExecuteAndCompareClose(c, expected=[6.]) + result = ops.Add(x, ops.Constant(c, np.float32(3.14))) + ops.Add(result, ops.Constant(c, np.float32(1.618))) + arg = NumpyArrayF32(1.0) + compiled_c = self.backend.compile(c.build(result)) + ans, = xla_client.execute_with_python_values( + compiled_c, [arg], backend=self.backend) + np.testing.assert_allclose(ans, 4.14) - def testConditionalFalse(self): - c = self._NewComputation() - pred = c.ConstantPredScalar(False) - true_operand = c.ConstantF32Scalar(3.) - true_computation = self._CreateMulF32By2Computation() - false_operand = c.ConstantF32Scalar(2.) - false_computation = self._CreateConstantF32Computation() - c.Conditional(pred, true_operand, true_computation, false_operand, - false_computation) - self._ExecuteAndCompareClose(c, expected=[1.]) + tests.append(SetShardingTest) - def testInfeedS32Values(self): - to_infeed = NumpyArrayS32([1, 2, 3, 4]) - c = self._NewComputation() - c.GetTupleElement(c.Infeed(xla_client.shape_from_pyval(to_infeed[0])), 0) - compiled_c = c.Build().Compile() - for item in to_infeed: - xla_client.transfer_to_infeed(item) + class AliasTest(ComputationTest): - for item in to_infeed: - result, = xla_client.execute_with_python_values(compiled_c) - self.assertEqual(result, item) + def testSetUpAlias(self): + c = self._NewComputation() + p1 = ops.Parameter( + c, 0, + xla_client.shape_from_pyval( + NumpyArrayF32(1.0)).with_major_to_minor_layout_if_absent()) + p2 = ops.Parameter( + c, 1, + xla_client.shape_from_pyval( + NumpyArrayF32(1.0)).with_major_to_minor_layout_if_absent()) + out = ops.Add(p1, p2) + c.setup_alias([], 0, []) + c = c.build(out) + if self.backend.platform != "tpu": + with self.assertRaisesRegex( + RuntimeError, "Buffer aliasing is not supported " + "by XLA for non-TPU backends"): + self.backend.compile(c) - def testInfeedTuple(self): - to_infeed = (NumpyArrayS32([1, 2, 3, 4]), NumpyArrayS32([[7], [8]])) - c = self._NewComputation() - c.GetTupleElement(c.Infeed(xla_client.shape_from_pyval(to_infeed)), 0) - compiled_c = c.Build().Compile() - xla_client.transfer_to_infeed(to_infeed) + tests.append(AliasTest) - result = xla_client.execute_with_python_values(compiled_c) - self.assertLen(result, 2) - np.testing.assert_equal(result[0], to_infeed[0]) - np.testing.assert_equal(result[1], to_infeed[1]) + testcase_shapes = [ + (), + (1,), + (2, 3), + (2, 0), + (0, 7), + (4, 1, 2), + (2, 1, 3), + (2, 4, 1), + (3, 1), + (1, 3), + ] - def testInfeedThenOutfeedS32(self): - to_round_trip = NumpyArrayS32([1, 2, 3, 4]) - c = self._NewComputation() - x_and_token = c.Infeed(xla_client.shape_from_pyval(to_round_trip[0])) - x = c.GetTupleElement(x_and_token, 0) - token = c.GetTupleElement(x_and_token, 1) - 
c.Outfeed(x, token) + def FormatShapeAndDtype(shape, dtype): + return "_{}[{}]".format(np.dtype(dtype).name, ",".join(map(str, shape))) - compiled_c = c.Build().Compile() + class DLPackTest(parameterized.TestCase): - for want in to_round_trip: - execution = threading.Thread(target=lambda: compiled_c.Execute([])) - execution.start() - xla_client.transfer_to_infeed(want) - got = xla_client.transfer_from_outfeed( - xla_client.shape_from_pyval(to_round_trip[0])) - execution.join() - self.assertEqual(want, got) + def setUp(self): + super(DLPackTest, self).setUp() + self.backend = xla_backend() + if self.backend.platform not in ("cpu", "gpu"): + self.skipTest("DLPack requires CPU or GPU") - def testScatter(self): - a = np.arange(9).astype(np.int32).reshape((3, 3)) - scatter_indices = np.array([0, 2], dtype=np.int32) - updates = np.array([[10, 20, 30], [70, 80, 90]], dtype=np.int32) + # pylint: disable=g-complex-comprehension + @parameterized.named_parameters({ + "testcase_name": FormatShapeAndDtype(shape, dtype), + "dtype": dtype, + "shape": shape + } for dtype in dlpack_dtypes for shape in testcase_shapes) + def testRoundTrip(self, dtype, shape): + x = np.array(np.random.rand(*shape) * 100, dtype=dtype) + buffer = self.backend.buffer_from_pyval(x) + dlt = xla_client._xla.buffer_to_dlpack_managed_tensor(buffer) + del buffer # Free "buffer" to make sure dlt retains ownership. + self.assertEqual(type(dlt).__name__, "PyCapsule") + y = xla_client._xla.dlpack_managed_tensor_to_buffer( + dlt, self.backend) + np.testing.assert_array_equal(x, y.to_py()) - dnums = xla_client.ScatterDimensionNumbers() - dnums.update_window_dims.append(1) - dnums.inserted_window_dims.append(0) - dnums.scatter_dims_to_operand_dims.append(0) - dnums.index_vector_dim = 1 + def testTensorsCanBeConsumedOnceOnly(self): + x = np.array(np.random.rand(3, 4, 5, 6), dtype=np.float32) + buffer = self.backend.buffer_from_pyval(x) + dlt = xla_client._xla.buffer_to_dlpack_managed_tensor(buffer) - c = self._NewComputation() - c.Scatter( - c.Constant(a), c.Constant(scatter_indices), c.Constant(updates), - self._CreateBinaryAddS32Computation(), dnums) - expected = np.array([[10, 21, 32], [3, 4, 5], [76, 87, 98]], dtype=np.int32) - self._ExecuteAndCompareClose(c, expected=[expected]) + def ConsumeDLPackTensor(): + _ = xla_client._xla.dlpack_managed_tensor_to_buffer( + dlt, self.backend) + + ConsumeDLPackTensor() + self.assertRaisesRegex( + RuntimeError, ".*a DLPack tensor may be consumed at most once.*", + ConsumeDLPackTensor) + + tests.append(DLPackTest) + + class BufferProtocolTest(parameterized.TestCase): + + def setUp(self): + super(BufferProtocolTest, self).setUp() + self.backend = xla_backend() + if self.backend.platform != "cpu": + self.skipTest("Test requires CPU") + + # pylint: disable=g-complex-comprehension + @parameterized.named_parameters({ + "testcase_name": FormatShapeAndDtype(shape, dtype), + "dtype": dtype, + "shape": shape + } for dtype in standard_dtypes if dtype != bfloat16 + for shape in testcase_shapes) + def testRoundTrip(self, dtype, shape): + x = np.array(np.random.rand(*shape) * 100, dtype=dtype) + x_ptr = x.__array_interface__["data"][0] + buffer = self.backend.buffer_from_pyval(x) + y = np.array(buffer, copy=False) + y_ptr = y.__array_interface__["data"][0] + np.testing.assert_array_equal(x, y) + # If the input was sufficiently aligned, the input and output should + # alias. 
+ self.assertTrue((x_ptr & 15) != 0 or x_ptr == y_ptr) + self.assertEqual(y_ptr, buffer.unsafe_buffer_pointer()) + + buffer2 = self.backend.buffer_from_pyval(x, force_copy=True) + z = np.array(buffer2, copy=False) + self.assertNotEqual(x.__array_interface__["data"][0], + z.__array_interface__["data"][0]) + + def testDeleteWithActiveView(self): + x = np.random.randn(20, 10) + buffer = self.backend.buffer_from_pyval(x) + buffer_ptr = buffer.unsafe_buffer_pointer() + y = np.array(buffer, copy=False) + buffer.delete() + # It is still legal to access `y`; the array view must keep it alive. + np.testing.assert_array_equal(x, y) + self.assertEqual(y.__array_interface__["data"][0], buffer_ptr) + + tests.append(BufferProtocolTest) + + class ProfilerTest(absltest.TestCase): + + def testTraceMe(self): + # TODO(phawkins): These tests just check that the TraceMe context manager + # acts like a context manager and doesn't explode. Ideally we'd check that + # the profiler saw the traceme too. + with xla_client.profiler.TraceMe("test1"): + pass + with xla_client.profiler.TraceMe("test2", foo=123): + pass + with self.assertRaises(ValueError): + with xla_client.profiler.TraceMe("test3"): + raise ValueError("test") + + @unittest.skipIf(portpicker is None, "Test requires portpicker") + def testStartServer(self): + port = portpicker.pick_unused_port() + server = xla_client.profiler.start_server(port) + del server + + tests.append(ProfilerTest) + return tests -class ErrorTest(ComputationTest): - - def setUp(self): - self.f32_scalar_2 = NumpyArrayF32(2.0) - self.s32_scalar_2 = NumpyArrayS32(2) - - def testCompileWithWrongElementTypeInLayout(self): - c = self._NewComputation() - c.SetOpMetadata(xla_client.CurrentSourceInfoMetadata()) - c.ParameterFromNumpy(self.s32_scalar_2) - c.ClearOpMetadata() - - options = xla_client.CompileOptions() - options.argument_layouts = [ - xla_client.Shape.array_shape(np.dtype(np.float32), []) - ] - - def TestFun(): - return c.Build().Compile(compile_options=options) - - self.assertRaisesRegex( - RuntimeError, r".*Invalid argument shape.*" - r"expected s32\[\], got f32\[\].*", TestFun) - - def testInvokeWithWrongElementType(self): - c = self._NewComputation() - c.SetOpMetadata(xla_client.CurrentSourceInfoMetadata()) - c.ParameterFromNumpy(self.s32_scalar_2) - c.ClearOpMetadata() - - def TestFun(): - return xla_client.execute_with_python_values(c.Build().Compile(), - [self.f32_scalar_2]) - - self.assertRaisesRegex( - RuntimeError, r"Invalid argument: Argument does not match.*" - r"want s32\[\], got f32\[\].*", TestFun) - - -class ComputationRootTest(ComputationTest): - """Tests related to setting the root of the computation.""" - - def testComputationRootDifferentFromLastOp(self): - c = self._NewComputation() - x = c.ParameterFromNumpy(NumpyArrayF32(2.0)) - result = c.Add(x, c.ConstantF32Scalar(3.14)) - extra = c.Add(result, c.ConstantF32Scalar(1.618)) # pylint: disable=unused-variable - - arg = NumpyArrayF32(1.0) - compiled_c = c.Build(result).Compile() - ans, = xla_client.execute_with_python_values(compiled_c, [arg]) - np.testing.assert_allclose(ans, 4.14) - - -class SetShardingTest(ComputationTest): - """Tests related to set OpSharding.""" - - def testSetSharding(self): - c = self._NewComputation() - sharding = xla_client.OpSharding() - sharding.type = sharding.type.REPLICATED - sharding.tile_assignment_dimensions.extend([1]) - sharding.tile_assignment_devices.extend([0]) - # Set Sharding. - c.SetSharding(sharding) - x = c.ParameterFromNumpy(NumpyArrayF32(2.0)) - # Clear Sharding. 
- c.ClearSharding() - - result = c.Add(x, c.ConstantF32Scalar(3.14)) - extra = c.Add(result, c.ConstantF32Scalar(1.618)) # pylint: disable=unused-variable - arg = NumpyArrayF32(1.0) - compiled_c = c.Build(result).Compile() - ans, = xla_client.execute_with_python_values(compiled_c, [arg]) - np.testing.assert_allclose(ans, 4.14) - - -int_dtypes = [ - np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint16, np.uint32, - np.uint64 -] -float_dtypes = [np.float16, np.float32, np.float64] -complex_dtypes = [np.complex64, np.complex128] -dlpack_dtypes = int_dtypes + float_dtypes + [bfloat16] -standard_dtypes = int_dtypes + float_dtypes + complex_dtypes + [np.bool_] - -testcase_shapes = [ - (), - (1,), - (2, 3), - (2, 0), - (0, 7), - (4, 1, 2), - (2, 1, 3), - (2, 4, 1), - (3, 1), - (1, 3), -] - - -def FormatShapeAndDtype(shape, dtype): - return "_{}[{}]".format(np.dtype(dtype).name, ",".join(map(str, shape))) - - -class DLPackTest(parameterized.TestCase): - - # pylint: disable=g-complex-comprehension - @parameterized.named_parameters({ - "testcase_name": FormatShapeAndDtype(shape, dtype), - "dtype": dtype, - "shape": shape - } for dtype in dlpack_dtypes for shape in testcase_shapes) - def testRoundTrip(self, dtype, shape): - x = np.array(np.random.rand(*shape) * 100, dtype=dtype) - backend = xla_client.get_local_backend() - buffer = xla_client.Buffer.from_pyval(x, backend=backend) - dlt = xla_client._xla.BufferToDLPackManagedTensor(buffer) - del buffer # Free "buffer" to make sure dlt retains ownership. - self.assertEqual(type(dlt).__name__, "PyCapsule") - y = xla_client._xla.DLPackManagedTensorToBuffer(dlt, backend.client) - np.testing.assert_array_equal(x, y.to_py()) - - def testTensorsCanBeConsumedOnceOnly(self): - x = np.array(np.random.rand(3, 4, 5, 6), dtype=np.float32) - backend = xla_client.get_local_backend() - buffer = xla_client.Buffer.from_pyval(x, backend=backend) - dlt = xla_client._xla.BufferToDLPackManagedTensor(buffer) - - def ConsumeDLPackTensor(): - _ = xla_client._xla.DLPackManagedTensorToBuffer(dlt, backend.client) - - ConsumeDLPackTensor() - self.assertRaisesRegex(RuntimeError, - ".*a DLPack tensor may be consumed at most once.*", - ConsumeDLPackTensor) - - -class BufferProtocolTest(parameterized.TestCase): - - # pylint: disable=g-complex-comprehension - @parameterized.named_parameters({ - "testcase_name": FormatShapeAndDtype(shape, dtype), - "dtype": dtype, - "shape": shape - } for dtype in standard_dtypes for shape in testcase_shapes) - def testRoundTrip(self, dtype, shape): - x = np.array(np.random.rand(*shape) * 100, dtype=dtype) - x_ptr = x.__array_interface__["data"][0] - backend = xla_client.get_local_backend("cpu") - buffer = xla_client.Buffer.from_pyval(x, backend=backend) - y = np.array(buffer, copy=False) - y_ptr = y.__array_interface__["data"][0] - np.testing.assert_array_equal(x, y) - # If the input was sufficiently aligned, the input and output should alias. 
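The aliasing assertion in BufferProtocolTest hinges on pointer alignment: the test only requires the numpy view to alias the source array when the source was sufficiently aligned (16 bytes in the new test's `(x_ptr & 15)` check, 64 bytes in the old `(x_ptr & 63)` one). A small illustrative sketch of that bit test, using made-up pointer values (not part of the patch):

# Illustrative only: a pointer is 16-byte aligned exactly when its low 4 bits
# are zero, so "(ptr & 15) != 0" means the input was unaligned and the backend
# is allowed to copy instead of aliasing.
for ptr in (0x7F0000C000, 0x7F0000C008):  # hypothetical addresses
  if ptr & 15:
    print(hex(ptr), "-> unaligned; the backend may copy instead of aliasing")
  else:
    print(hex(ptr), "-> 16-byte aligned; the numpy view must alias the input")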
- self.assertTrue((x_ptr & 63) != 0 or x_ptr == y_ptr) - self.assertEqual(y_ptr, buffer.unsafe_buffer_pointer()) - - buffer2 = xla_client.Buffer.from_pyval(x, backend=backend, force_copy=True) - z = np.array(buffer2, copy=False) - self.assertNotEqual(x.__array_interface__["data"][0], - z.__array_interface__["data"][0]) - - def testDeleteWithActiveView(self): - x = np.random.randn(20, 10) - backend = xla_client.get_local_backend("cpu") - buffer = xla_client.Buffer.from_pyval(x, backend=backend) - buffer_ptr = buffer.unsafe_buffer_pointer() - y = np.array(buffer, copy=False) - buffer.delete() - # It is still legal to access `y`; the array view must keep it alive. - np.testing.assert_array_equal(x, y) - self.assertEqual(y.__array_interface__["data"][0], buffer_ptr) - - -class ProfilerTest(absltest.TestCase): - - def testTraceMe(self): - # TODO(phawkins): These tests just check that the TraceMe context manager - # acts like a context manager and doesn't explode. Ideally we'd check that - # the profiler saw the traceme too. - with xla_client.profiler.TraceMe("test1"): - pass - with xla_client.profiler.TraceMe("test2", foo=123): - pass - with self.assertRaises(ValueError): - with xla_client.profiler.TraceMe("test3"): - raise ValueError("test") - - @unittest.skipIf(portpicker is None, "Test requires portpicker") - def testStartServer(self): - port = portpicker.pick_unused_port() - server = xla_client.profiler.start_server(port) - del server +def InstantiateTests(globals_dict, backend_fn, test_prefix="", **kw): + # Avoid creating a new backend per test (this causes GPU OOM, and is probably + # inefficient). + backend_fn = functools.lru_cache(maxsize=None)(backend_fn) + for klass in TestFactory(backend_fn, **kw): + test = type(test_prefix + klass.__name__, (klass,), {}) + # Clean up the qualified names of the tests to not include the test factory. 
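InstantiateTests above memoizes the backend factory with functools.lru_cache so that every generated test class shares one backend instead of allocating a fresh one per test. A minimal sketch of that caching behaviour, using a hypothetical FakeBackend stand-in (not part of the patch):

import functools

class FakeBackend:  # hypothetical stand-in for a real XLA backend
  instances = 0
  def __init__(self):
    FakeBackend.instances += 1

# Memoizing a zero-argument factory means every call returns the same object.
backend_fn = functools.lru_cache(maxsize=None)(FakeBackend)
assert backend_fn() is backend_fn()
assert FakeBackend.instances == 1  # only one backend is ever constructed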
+ test.__qualname__ = test.__name__ + globals_dict[test.__name__] = test if __name__ == "__main__": + flags.DEFINE_string("backend", "cpu", "Target backend.") + InstantiateTests(globals(), + lambda: xla_client.get_local_backend(FLAGS.backend)) absltest.main() diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD index 3f06c6a29ce..a8f20827c6d 100644 --- a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -460,6 +460,37 @@ cc_library( ], ) +cc_library( + name = "hlo_sharding_util", + srcs = [ + "hlo_sharding_util.cc", + ], + hdrs = [ + "hlo_sharding_util.h", + ], + deps = [ + ":hlo", + "//tensorflow/compiler/xla:array", + "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:xla_data_proto_cc", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/types:optional", + ], +) + +tf_cc_test( + name = "hlo_sharding_util_test", + srcs = [ + "hlo_sharding_util_test.cc", + ], + deps = [ + ":hlo_sharding_util", + "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", + ], +) + tf_cc_test( name = "dynamic_parameter_binding_test", srcs = ["dynamic_parameter_binding_test.cc"], @@ -977,7 +1008,7 @@ cc_library( "//tensorflow/compiler/xla/service/gpu:gpu_transfer_manager", "//tensorflow/core:stream_executor_no_cuda", ] + if_cuda_is_configured([ - "//tensorflow/compiler/xla/service/mlir_gpu:mlir_compiler", + "//tensorflow/compiler/xla/service/mlir_gpu:mlir_compiler_impl", ]), ) @@ -1078,6 +1109,7 @@ cc_library( srcs = ["compiler.cc"], hdrs = ["compiler.h"], deps = [ + ":buffer_assignment", ":buffer_value", ":computation_placer", ":executable", @@ -1122,6 +1154,7 @@ cc_library( "//tensorflow/compiler/xla/service:maybe_owning_device_memory", "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/stream_executor:device_memory", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", "@com_google_absl//absl/types:span", @@ -2120,6 +2153,51 @@ tf_cc_test( ], ) +cc_library( + name = "conditional_code_motion", + srcs = ["conditional_code_motion.cc"], + hdrs = ["conditional_code_motion.h"], + deps = [ + ":call_graph", + ":call_inliner", + ":hlo", + ":hlo_casting_utils", + ":hlo_dce", + ":hlo_pass", + ":hlo_pass_pipeline", + ":tuple_simplifier", + "//tensorflow/compiler/xla:literal", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:util", + "//tensorflow/core:lib", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/strings", + ], +) + +tf_cc_test( + name = "conditional_code_motion_test", + srcs = ["conditional_code_motion_test.cc"], + deps = [ + ":conditional_code_motion", + ":hlo", + ":hlo_matchers", + "//tensorflow/compiler/xla:literal", + "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:xla_data_proto_cc", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", + "//tensorflow/core:lib", + "//tensorflow/core:test", + ], +) + cc_library( name = "convolution_group_converter", srcs = ["convolution_group_converter.cc"], @@ -2350,6 +2428,42 @@ tf_cc_test( ], ) +cc_library( + name = "all_gather_decomposer", + srcs = 
["all_gather_decomposer.cc"], + hdrs = ["all_gather_decomposer.h"], + deps = [ + ":hlo", + ":hlo_casting_utils", + ":hlo_pass", + "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:types", + "//tensorflow/core:lib", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/strings", + ], +) + +tf_cc_test( + name = "all_gather_decomposer_test", + srcs = ["all_gather_decomposer_test.cc"], + deps = [ + ":all_gather_decomposer", + ":hlo", + ":hlo_matchers", + ":hlo_parser", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:test_utils", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", # fixdeps: keep + "//tensorflow/core:lib", + "//tensorflow/core:test", + ], +) + cc_library( name = "tuple_simplifier", srcs = ["tuple_simplifier.cc"], @@ -3170,6 +3284,7 @@ cc_library( ":heap_simulator", ":hlo_cost_analysis", "//tensorflow/compiler/xla:debug_options_flags", + "//tensorflow/core/lib/math:math_util", ], ) @@ -3187,6 +3302,29 @@ tf_cc_test( ], ) +cc_library( + name = "memory_space_propagation", + srcs = ["memory_space_propagation.cc"], + hdrs = ["memory_space_propagation.h"], + deps = [ + ":hlo", + ":hlo_dataflow_analysis", + ":hlo_pass", + ], +) + +tf_cc_test( + name = "memory_space_propagation_test", + srcs = ["memory_space_propagation_test.cc"], + deps = [ + ":hlo_parser", + ":memory_space_propagation", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", + "//tensorflow/core:test", + ], +) + cc_library( name = "hlo_dce", srcs = ["hlo_dce.cc"], @@ -3740,6 +3878,7 @@ cc_library( "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", "@llvm-project//llvm:core", "@llvm-project//llvm:transform_utils", ], @@ -4116,6 +4255,28 @@ tf_cc_test( ], ) +cc_library( + name = "root_instruction_sinker", + srcs = ["root_instruction_sinker.cc"], + hdrs = ["root_instruction_sinker.h"], + deps = [ + ":hlo", + ":hlo_pass", + ":tuple_util", + ], +) + +tf_cc_test( + name = "root_instruction_sinker_test", + srcs = ["root_instruction_sinker_test.cc"], + deps = [ + ":hlo_matchers", + ":root_instruction_sinker", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", + ], +) + cc_library( name = "while_util", srcs = ["while_util.cc"], diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc old mode 100644 new mode 100755 index 4c0dcbbd2ad..2fbfd156844 --- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc @@ -59,6 +59,7 @@ limitations under the License. 
#include "tensorflow/core/lib/core/bits.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/stream_executor/lib/statusor.h" @@ -494,6 +495,10 @@ class AlgebraicSimplifierVisitor : public DfsHloRewriteVisitor { StatusOr FoldConvInputPad(HloInstruction* convolution); StatusOr FoldConvFilterPad(HloInstruction* convolution); + // Tries to swap convolution operands if they would result in a more efficient + // convolution. + StatusOr SwapConvOperands(HloInstruction* convolution); + // Tries to use a kDot in place of the given convolution. StatusOr SimplifyConvToDot(HloInstruction* convolution); @@ -503,6 +508,13 @@ class AlgebraicSimplifierVisitor : public DfsHloRewriteVisitor { // Tries to convert slice(reshape(X)) into reshape(slice(X)) StatusOr TryToReorderSliceAndReshape(HloInstruction* slice); + // Tries to simplify `(and (< a N) (< a K))` in cases where `N <= K` into + // `(< a N)`. This is crucial for being able to figure out the loop trip + // count. + // + // Assumes that the input is conjunction. + StatusOr TrySimplifyTautologicalCompare(HloInstruction* conjunction); + // Useful when we want to use the same visitor over multiple computations. void ResetState(HloComputation* computation); @@ -811,6 +823,8 @@ Status AlgebraicSimplifierVisitor::HandleAdd(HloInstruction* add) { // Concatenate the indices and updates if (index_concat_is_safe && same_dimension_numbers && index_concat_dimension && + lhs_scatter_index->shape().element_type() == + rhs_scatter_index->shape().element_type() && ShapeUtil::SameDimensions(lhs_update_window, rhs_update_window)) { TF_ASSIGN_OR_RETURN(HloInstruction * new_operand, MakeBinaryHlo(HloOpcode::kAdd, lhs_scatter_operand, @@ -849,6 +863,57 @@ Status AlgebraicSimplifierVisitor::HandleAdd(HloInstruction* add) { return Status::OK(); } +StatusOr AlgebraicSimplifierVisitor::TrySimplifyTautologicalCompare( + HloInstruction* conjunction) { + HloInstruction *lhs, *rhs; + if (!Match(conjunction, m::And(m::Op(&lhs), m::Op(&rhs)))) { + return false; + } + struct LessThanCompareInfo { // (LT var constant) + HloInstruction* var; + int64 constant; + }; + + auto get_compare_info_helper = + [&](HloInstruction* lhs, + HloInstruction* rhs) -> absl::optional { + if (!Match(rhs, m::Constant().WithShape( + m::Shape().IsEffectiveScalar().WithElementType( + PrimitiveType::S32)))) { + return absl::nullopt; + } + return {LessThanCompareInfo{lhs, *rhs->literal().GetFirstInteger()}}; + }; + + auto get_compare_info = + [&](HloInstruction* cmp) -> absl::optional { + HloInstruction *lhs, *rhs; + if (!Match(cmp, m::Compare(m::Op(&lhs), m::Op(&rhs)) + .WithComparisonDirection(ComparisonDirection::kLt))) { + return absl::nullopt; + } + if (auto match1 = get_compare_info_helper(lhs, rhs)) { + return match1; + } else if (auto match2 = get_compare_info_helper(rhs, lhs)) { + return match2; + } + return absl::nullopt; + }; + + absl::optional lhs_info = get_compare_info(lhs); + absl::optional rhs_info = get_compare_info(rhs); + if (lhs_info && rhs_info && lhs_info->var == rhs_info->var) { + int64 new_bound = std::min(lhs_info->constant, rhs_info->constant); + TF_RETURN_IF_ERROR(ReplaceWithNewInstruction( + conjunction, + HloInstruction::CreateCompare(lhs->shape(), lhs_info->var, + MakeScalarLike(lhs_info->var, new_bound), + ComparisonDirection::kLt))); + return true; + } + return false; +} + Status 
AlgebraicSimplifierVisitor::HandleAnd(HloInstruction* logical_and) { HloInstruction *lhs, *rhs; CHECK(Match(logical_and, m::And(m::Op(&lhs), m::Op(&rhs)))); @@ -883,6 +948,13 @@ Status AlgebraicSimplifierVisitor::HandleAnd(HloInstruction* logical_and) { return Status::OK(); } + // Simplify tautological conjunctions. + TF_ASSIGN_OR_RETURN(bool found_tautological_compare, + TrySimplifyTautologicalCompare(logical_and)); + if (found_tautological_compare) { + return Status::OK(); + } + return Status::OK(); } @@ -1416,6 +1488,22 @@ Status AlgebraicSimplifierVisitor::HandleDivide(HloInstruction* divide) { return ReplaceInstruction(divide, new_divide); } + // If X is a convert from pred, then + // X / broadcast(Y) => broadcast(1/Y) * X + if (Match(divide, + m::Divide( + m::Convert(&a, + m::Op().WithShape(m::Shape().WithElementType(PRED))), + m::Broadcast(m::Op(&b).WithShape(m::Shape().IsScalar()))))) { + TF_ASSIGN_OR_RETURN( + auto recip, MakeBinaryHlo(HloOpcode::kDivide, MakeScalarLike(b, 1), b)); + auto recip_bcast = computation_->AddInstruction( + HloInstruction::CreateBroadcast(divide->shape(), recip, {})); + TF_ASSIGN_OR_RETURN(auto mul, + MakeBinaryHlo(HloOpcode::kMultiply, recip_bcast, a)); + return ReplaceInstruction(divide, mul); + } + return Status::OK(); } @@ -2964,26 +3052,6 @@ Status AlgebraicSimplifierVisitor::HandlePower(HloInstruction* power) { MakeScalarLike(lhs, 1), lhs)); } - VLOG(10) << "trying transform [pow(pow(A, X), Y) => pow(A, X*Y)]: " - << power->ToString(); - - // Don't perform this optimization if either of the exponents is complex; this - // identity is true only for real-valued exponents. In addition, we cowardly - // refuse to do this transformation if the two exponents have different - // element types. - if (lhs->opcode() == HloOpcode::kPower && - !ShapeUtil::ElementIsComplex(lhs->operand(1)->shape()) && - !ShapeUtil::ElementIsComplex(rhs->shape()) && - ShapeUtil::SameElementType(lhs->operand(1)->shape(), rhs->shape())) { - auto exponent_product = - computation_->AddInstruction(HloInstruction::CreateBinary( - rhs->shape(), HloOpcode::kMultiply, lhs->mutable_operand(1), rhs)); - return ReplaceWithNewInstruction( - power, HloInstruction::CreateBinary(power->shape(), HloOpcode::kPower, - lhs->mutable_operand(0), - exponent_product)); - } - return Status::OK(); } @@ -3651,6 +3719,39 @@ Status AlgebraicSimplifierVisitor::HandleDynamicSlice( MakeBroadcastHlo(new_dynamic_slice, operand->dimensions(), dynamic_slice->shape())); } + + // Convert a dynamic slice into a slice if all offsets are constant and the + // operand is not constant. 
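The comment above describes folding a constant-offset dynamic-slice into a static slice; the implementation that follows clamps each constant start index into [0, operand_dim - slice_dim] (and simply bails out on negative constants). A small numpy sketch of that clamping rule, illustrative only and not XLA code:

import numpy as np

def static_slice_with_clamp(operand, offsets, slice_sizes):
  # Mirror dynamic-slice semantics: each start index is clamped into
  # [0, operand_dim - slice_dim] before the slice is taken.
  starts = [min(max(off, 0), operand.shape[d] - slice_sizes[d])
            for d, off in enumerate(offsets)]
  return operand[tuple(slice(s, s + n) for s, n in zip(starts, slice_sizes))]

x = np.arange(10 * 100).reshape(10, 100)
# Offset 9 in dimension 0 is clamped to 10 - 2 = 8, so the result still has
# the requested shape (2, 20), just like the folded static slice would.
print(static_slice_with_clamp(x, offsets=[9, 40], slice_sizes=[2, 20]).shape)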
If ev + if (operand->opcode() != HloOpcode::kConstant && + absl::c_all_of(absl::MakeSpan(dynamic_slice->operands().begin() + 1, + dynamic_slice->operands().end()), + [](HloInstruction* operand) { + return operand->opcode() == HloOpcode::kConstant && + ShapeUtil::ElementIsIntegral(operand->shape()); + })) { + const int64 rank = operand->shape().rank(); + std::vector slice_starts(rank); + std::vector slice_limits(rank); + std::vector slice_strides(rank, 1); + + for (int64 i = 0; i < rank; ++i) { + absl::optional offset = + dynamic_slice->operand(i + 1)->literal().GetFirstInteger(); + if (!offset || *offset < 0) { + return Status::OK(); + } + const int64 max_offset = + dynamic_slice->operand(0)->shape().dimensions(i) - + dynamic_slice->shape().dimensions(i); + slice_starts[i] = std::min(max_offset, *offset); + slice_limits[i] = + std::min(max_offset, *offset) + dynamic_slice->shape().dimensions(i); + } + return ReplaceWithNewInstruction( + dynamic_slice, + HloInstruction::CreateSlice(dynamic_slice->shape(), operand, + slice_starts, slice_limits, slice_strides)); + } return Status::OK(); } @@ -3685,8 +3786,8 @@ Status AlgebraicSimplifierVisitor::HandleDynamicUpdateSlice( compatible = false; } } + PaddingConfig padding_config; if (compatible) { - PaddingConfig padding_config; for (int64 dim = 0; dim < updated_shape.rank(); ++dim) { auto padding_config_dim = padding_config.add_dimensions(); auto slice_dim_start = update_start_indx->operand(dim + offset); @@ -3695,37 +3796,32 @@ Status AlgebraicSimplifierVisitor::HandleDynamicUpdateSlice( break; } VLOG(2) << "slice :" << slice_dim_start->ToString(); - int64 beg; - if (slice_dim_start->shape().element_type() == S32) { - beg = slice_dim_start->literal().Get({}); - } else if (slice_dim_start->shape().element_type() == U32) { - beg = slice_dim_start->literal().Get({}); - } else { + absl::optional beg = + slice_dim_start->literal().GetFirstInteger(); + if (!beg) { compatible = false; break; } - VLOG(2) << "beg value:" << beg; + VLOG(2) << "beg value:" << *beg; auto update_width = ShapeUtil::GetDimension(update_shape, dim); auto bcast_width = ShapeUtil::GetDimension(updated_shape, dim); - padding_config_dim->set_edge_padding_low(beg); + padding_config_dim->set_edge_padding_low(*beg); padding_config_dim->set_edge_padding_high( - std::max(bcast_width - (beg + update_width), 0LL)); + std::max(bcast_width - (*beg + update_width), int64{0})); // dynamic_update_slice does not specify a stride padding_config_dim->set_interior_padding(0); } - if (compatible) { - HloInstruction* pad = - computation_->AddInstruction(HloInstruction::CreatePad( - updated_shape, dus_update, pad_value, padding_config)); - VLOG(2) << dynamic_update_slice->ToString(); - VLOG(2) << " with pad:" << pad->ToString(); - VLOG(2) << " Computation before rewrite is: " - << dynamic_update_slice->parent()->ToString(); - auto res = ReplaceInstruction(dynamic_update_slice, pad); - VLOG(2) << " Computation after rewrite is: " - << pad->parent()->ToString(); - return res; - } + } + + if (compatible) { + HloInstruction* pad = + computation_->AddInstruction(HloInstruction::CreatePad( + updated_shape, dus_update, pad_value, padding_config)); + VLOG(2) << dynamic_update_slice->ToString(); + VLOG(2) << " with pad:" << pad->ToString(); + VLOG(2) << " Computation before rewrite is: " + << dynamic_update_slice->parent()->ToString(); + return ReplaceInstruction(dynamic_update_slice, pad); } } @@ -4481,6 +4577,107 @@ StatusOr AlgebraicSimplifierVisitor::FoldConvFilterPad( return true; } +StatusOr 
AlgebraicSimplifierVisitor::SwapConvOperands( + HloInstruction* convolution) { + if (!options_.enable_conv_operand_swap() || options_.is_layout_sensitive()) { + return false; + } + if (convolution->feature_group_count() > 1 || + convolution->batch_group_count() > 1) { + return false; + } + + const auto& dnums = convolution->convolution_dimension_numbers(); + const auto& window_dims = convolution->window().dimensions(); + Window swapped_window; + + HloInstruction *input = convolution->mutable_operand(0), + *kernel = convolution->mutable_operand(1); + int64 kernel_product = 1; + int64 swapped_kernel_product = 1; + DimensionVector reverse_dimensions; + for (int64 spatial_dim = 0; + spatial_dim < dnums.input_spatial_dimensions_size(); ++spatial_dim) { + const int64 kernel_size = window_dims[spatial_dim].size(); + kernel_product *= kernel_size; + const int64 dilated_kernel_size = + 1 + (kernel_size - 1) * window_dims[spatial_dim].window_dilation(); + + const int64 input_size = + input->shape().dimensions(dnums.input_spatial_dimensions(spatial_dim)); + swapped_kernel_product *= input_size; + const int64 dilated_input_size = + 1 + (input_size - 1) * window_dims[spatial_dim].base_dilation(); + + auto new_dim = swapped_window.add_dimensions(); + new_dim->set_size(input_size); + // If the kernel is not reversed, the activations must be manually reversed. + if (!window_dims[spatial_dim].window_reversal()) { + reverse_dimensions.push_back( + dnums.kernel_spatial_dimensions(spatial_dim)); + } + // The input is not originally reversed so it must be reversed to move the + // kernel. + new_dim->set_window_reversal(true); + // Base dilation and window dilation switch places. + new_dim->set_base_dilation(window_dims[spatial_dim].window_dilation()); + new_dim->set_window_dilation(window_dims[spatial_dim].base_dilation()); + new_dim->set_stride(window_dims[spatial_dim].stride()); + new_dim->set_padding_low(dilated_input_size + + window_dims[spatial_dim].padding_low() - + dilated_kernel_size); + new_dim->set_padding_high(dilated_input_size + + window_dims[spatial_dim].padding_high() - + dilated_kernel_size); + } + + // Don't transform if a naive convolution implementation would not have fewer + // flops. + if (kernel_product <= swapped_kernel_product) { + return false; + } + ConvolutionDimensionNumbers swapped_dnums; + *swapped_dnums.mutable_output_spatial_dimensions() = + dnums.output_spatial_dimensions(); + // Swap batch and output feature of the output. 
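The swap decision above reduces to comparing kernel_product (the product of the window's spatial sizes) against swapped_kernel_product (the product of the input's spatial sizes): the pass only swaps when the current kernel is spatially larger than the input. A simplified sketch of that cost check, with should_swap_conv_operands as a hypothetical helper that ignores dilation and padding:

def should_swap_conv_operands(kernel_spatial, input_spatial):
  # Swap only if the current "kernel" covers more spatial points than the
  # input would as a kernel, i.e. the swapped form needs fewer multiply-adds.
  kernel_product = 1
  swapped_kernel_product = 1
  for k, i in zip(kernel_spatial, input_spatial):
    kernel_product *= k
    swapped_kernel_product *= i
  return kernel_product > swapped_kernel_product

# Mirrors the SwapConvOperands test further down: a 32x32 window over a 3x3
# operand is cheaper with the operands exchanged, but not the other way around.
print(should_swap_conv_operands(kernel_spatial=[32, 32], input_spatial=[3, 3]))  # True
print(should_swap_conv_operands(kernel_spatial=[3, 3], input_spatial=[32, 32]))  # False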
+ swapped_dnums.set_output_batch_dimension(dnums.output_feature_dimension()); + swapped_dnums.set_output_feature_dimension(dnums.output_batch_dimension()); + + // Swap input dnums with kernel dnums + *swapped_dnums.mutable_input_spatial_dimensions() = + dnums.kernel_spatial_dimensions(); + swapped_dnums.set_input_batch_dimension( + dnums.kernel_output_feature_dimension()); + swapped_dnums.set_input_feature_dimension( + dnums.kernel_input_feature_dimension()); + + // Swap kernel dnums with input dnums + *swapped_dnums.mutable_kernel_spatial_dimensions() = + dnums.input_spatial_dimensions(); + swapped_dnums.set_kernel_output_feature_dimension( + dnums.input_batch_dimension()); + swapped_dnums.set_kernel_input_feature_dimension( + dnums.input_feature_dimension()); + + PrecisionConfig precision_config; + precision_config.add_operand_precision( + convolution->precision_config().operand_precision(1)); + precision_config.add_operand_precision( + convolution->precision_config().operand_precision(0)); + if (!reverse_dimensions.empty()) { + TF_ASSIGN_OR_RETURN(kernel, MakeReverseHlo(kernel, reverse_dimensions)); + } + TF_ASSIGN_OR_RETURN( + HloInstruction * new_convolution, + MakeConvolveHlo(kernel, input, /*feature_group_count=*/1, swapped_window, + swapped_dnums, precision_config)); + + convolution->SetupDerivedInstruction(new_convolution); + TF_RETURN_IF_ERROR(ReplaceInstruction(convolution, new_convolution)); + + return true; +} + StatusOr AlgebraicSimplifierVisitor::SimplifyConvToDot( HloInstruction* convolution) { auto* lhs = convolution->mutable_operand(0); @@ -4619,6 +4816,11 @@ Status AlgebraicSimplifierVisitor::HandleConvolution( return Status::OK(); } + // Try to swap convolution operands. + TF_ASSIGN_OR_RETURN(bool swapped, SwapConvOperands(convolution)); + if (swapped) { + return Status::OK(); + } // Try to replace the convolution with a kDot instruction. TF_ASSIGN_OR_RETURN(bool replaced_with_dot, SimplifyConvToDot(convolution)); if (replaced_with_dot) { diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.h b/tensorflow/compiler/xla/service/algebraic_simplifier.h index d3c276e9bc3..9f29df3c209 100644 --- a/tensorflow/compiler/xla/service/algebraic_simplifier.h +++ b/tensorflow/compiler/xla/service/algebraic_simplifier.h @@ -80,6 +80,12 @@ class AlgebraicSimplifierOptions { return enable_conv_simplification_; } + // Enable convolution operand swapping on platforms where it is supported. + void set_enable_conv_operand_swap(bool enable_conv_operand_swap) { + enable_conv_operand_swap_ = enable_conv_operand_swap; + } + bool enable_conv_operand_swap() const { return enable_conv_operand_swap_; } + // If enable_window_reduce_replacement is true, the kReduceWindow instruction // can be optimized by replacement with simpler operations. 
void set_enable_window_reduce_to_reduce_replacement( @@ -139,6 +145,7 @@ class AlgebraicSimplifierOptions { bool enable_dot_strength_reduction_{true}; bool enable_dot_to_multiply_rewrite_{true}; bool enable_conv_simplification_{true}; + bool enable_conv_operand_swap_{true}; bool enable_window_reduce_to_reduce_replacement_{true}; bool enable_reduce_of_reshape_{true}; bool replace_transpose_with_bitcast_{true}; diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc old mode 100755 new mode 100644 index 10b437506b3..0260a925b63 --- a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc @@ -1011,13 +1011,8 @@ TEST_F(AlgebraicSimplifierTest, PowerOfPower) { builder.AddInstruction(HloInstruction::CreateBinary(r1f32, HloOpcode::kPower, inner_power, exp2)); - auto computation = m->AddEntryComputation(builder.Build()); AlgebraicSimplifier simplifier(default_options_); - ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); - EXPECT_THAT( - computation->root_instruction(), - GmockMatch(m::Power(m::Op().Is(base), - m::Multiply(m::Op().Is(exp1), m::Op().Is(exp2))))); + ASSERT_FALSE(simplifier.Run(m.get()).ValueOrDie()); } // Don't simplify pow(pow(A, X), Y) => pow(A, X*Y) if X and Y are complex @@ -4188,6 +4183,31 @@ TEST_F(AlgebraicSimplifierTest, TrivialDynamicSlice) { EXPECT_THAT(computation->root_instruction(), GmockMatch(m::Parameter())); } +TEST_F(AlgebraicSimplifierTest, ConstantDynamicSlice) { + auto m = CreateNewVerifiedModule(); + HloComputation::Builder builder(TestName()); + + Shape shape = ShapeUtil::MakeShape(F32, {10, 100, 1000}); + std::vector params; + for (int i = 0; i < 3; ++i) { + params.push_back(builder.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR0(2 << (i + 1))))); + } + Shape ds_shape = ShapeUtil::MakeShape(F32, {2, 20, 200}); + builder.AddInstruction(HloInstruction::CreateDynamicSlice( + ds_shape, + builder.AddInstruction( + HloInstruction::CreateParameter(0, shape, "operand")), + params, + /*slice_sizes=*/{2, 20, 200})); + + auto computation = m->AddEntryComputation(builder.Build()); + AlgebraicSimplifier simplifier(default_options_); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); + EXPECT_THAT(computation->root_instruction(), + GmockMatch(m::Slice(m::Parameter()))); +} + // A dynamic-update-slice is trivial if its start indices are all zeroes and the // size of its "update" equals the size of its output. In this case, the // dynamic-update-slice is equal to its update. 
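The CompareSimplified test in the next hunk exercises the TrySimplifyTautologicalCompare rewrite added earlier: (param < 10) and (param < 100) collapses to a single compare against 10. A quick numpy check of the identity the rewrite relies on (illustrative only):

import numpy as np

# For integer a and constants N <= K, (a < N) & (a < K) == (a < N), so the
# conjunction can be replaced by one compare against min(N, K).
a = np.arange(-5, 120, dtype=np.int32)
np.testing.assert_array_equal((a < 10) & (a < 100), a < min(10, 100))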
@@ -5741,6 +5761,25 @@ TEST_F(AlgebraicSimplifierTest, CompareSame) { GmockMatch(m::Broadcast(m::ConstantScalar(true)))); } +TEST_F(AlgebraicSimplifierTest, CompareSimplified) { + const char* kModuleStr = R"( + HloModule m + test { + param = s32[] parameter(0) + c1 = s32[] constant(10) + c2 = s32[] constant(100) + cmp1 = pred[] compare(param, c1), direction=LT + cmp2 = pred[] compare(param, c2), direction=LT + ROOT out = pred[] and(cmp1, cmp2) + })"; + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr)); + ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie()); + EXPECT_THAT( + m->entry_computation()->root_instruction(), + GmockMatch(m::Compare(m::Op(), m::Op().IsConstantScalar(10)) + .WithComparisonDirection(ComparisonDirection::kLt))); +} + TEST_F(AlgebraicSimplifierTest, CanDisableDotToMultiplyRewrite) { // Some backends may have better performance by treating an outer product as a // Dot, rather than a broadcast Multiply @@ -6414,5 +6453,53 @@ TEST_F(AlgebraicSimplifierTest, ScalarScatter) { // Combine Scatters ASSERT_FALSE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie()); } + +TEST_F(AlgebraicSimplifierTest, SwapConvOperands) { + const char* hlo_string = R"( + HloModule m + test { + a = f32[3,3,160,160] parameter(0) + b = f32[128,32,32,160] parameter(1) + ROOT c = f32[128,32,32,160] convolution(a,b), + window={size=32x32 pad=30_30x30_30 rhs_reversal=1x1}, + dim_labels=01bf_o01i->f01b + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(hlo_string)); + // Combine Scatters + ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie()); + const HloInstruction* conv = m->entry_computation()->root_instruction(); + EXPECT_THAT(conv, + GmockMatch(m::Convolution(m::Parameter(1), m::Parameter(0)))); + EXPECT_EQ(conv->window().dimensions(0).size(), 3); + EXPECT_EQ(conv->window().dimensions(1).size(), 3); + EXPECT_EQ(conv->window().dimensions(0).window_reversal(), true); + EXPECT_EQ(conv->window().dimensions(1).window_reversal(), true); + EXPECT_EQ(conv->window().dimensions(0).padding_low(), 1); + EXPECT_EQ(conv->window().dimensions(1).padding_low(), 1); + EXPECT_EQ(conv->window().dimensions(0).padding_high(), 1); + EXPECT_EQ(conv->window().dimensions(1).padding_high(), 1); +} + +TEST_F(AlgebraicSimplifierTest, ScalarDividePredicate) { + const char* kModuleStr = R"( + HloModule m + test { + p0 = pred[2] parameter(0) + cvt = f32[2] convert(p0) + p1 = f32[] parameter(1) + bcast = f32[2] broadcast(p1), dimensions={} + ROOT div = f32[2] divide(cvt, bcast) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr)); + ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie()); + EXPECT_THAT( + m->entry_computation()->root_instruction(), + GmockMatch(m::MultiplyAnyOrder( + m::Convert(m::Parameter(0)), + m::Broadcast(m::Divide(m::ConstantScalar(1), m::Parameter(1)))))); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/all_gather_decomposer.cc b/tensorflow/compiler/xla/service/all_gather_decomposer.cc new file mode 100644 index 00000000000..00b9adaea43 --- /dev/null +++ b/tensorflow/compiler/xla/service/all_gather_decomposer.cc @@ -0,0 +1,157 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/all_gather_decomposer.h" + +#include + +#include "absl/algorithm/container.h" +#include "absl/strings/str_join.h" +#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/service/hlo_casting_utils.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_instructions.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/compiler/xla/types.h" +#include "tensorflow/core/platform/logging.h" + +namespace xla { + +// Creates a computation of x + y. +HloComputation* MakeBinaryAdd(PrimitiveType type, HloModule* module) { + HloComputation::Builder sum_b("add"); + auto x = sum_b.AddInstruction(HloInstruction::CreateParameter( + /*parameter_number=*/0, ShapeUtil::MakeShape(type, {}), "x")); + auto y = sum_b.AddInstruction(HloInstruction::CreateParameter( + /*parameter_number=*/1, ShapeUtil::MakeShape(type, {}), "y")); + if (type == PRED) { + sum_b.AddInstruction(HloInstruction::CreateBinary( + ShapeUtil::MakeShape(type, {}), HloOpcode::kOr, x, y)); + } else { + sum_b.AddInstruction(HloInstruction::CreateBinary( + ShapeUtil::MakeShape(type, {}), HloOpcode::kAdd, x, y)); + } + HloComputation* reduction = module->AddEmbeddedComputation(sum_b.Build()); + return reduction; +} + +Status DecomposeAllGather(HloAllGatherInstruction* ag, HloComputation* comp) { + const int64 shard_size = + ag->operand(0)->shape().dimensions(ag->all_gather_dimension()); + const int64 ag_size = ag->shape().dimensions(ag->all_gather_dimension()); + TF_RET_CHECK(ag_size % shard_size == 0); + int64 partition_count = ag_size / shard_size; + auto zero = comp->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::Zero(ag->shape().element_type()))); + zero = comp->AddInstruction( + HloInstruction::CreateBroadcast(ag->shape(), zero, {})); + auto zero_index = comp->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::Zero(U32))); + std::vector start_indices(ag->shape().rank(), zero_index); + auto shard_id_from_subgroup = [&](HloInstruction* replica_or_global_id) { + if (ag->replica_groups().empty()) { + return replica_or_global_id; + } + if (ag->replica_groups().size() == 1) { + // Whether the group is {1, 2, ..., N - 1}. + bool trivial_group = true; + for (int64 i = 0; i < ag->replica_groups()[0].replica_ids_size(); ++i) { + if (ag->replica_groups()[0].replica_ids(i) != i) { + trivial_group = false; + break; + } + } + if (trivial_group) { + CHECK_EQ(partition_count, ag->replica_groups()[0].replica_ids_size()); + return replica_or_global_id; + } + } + // Create a table of shard IDs for each replica_or_global_id, then slice it + // using replica_or_global_id. 
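For non-trivial replica groups, the shard-id table built below behaves like this small plain-Python sketch (not XLA code): entry g holds the position of replica/global id g inside its group, and each device then picks out its own entry with a dynamic-slice.

import numpy as np

# Same subgroups as the decomposer tests further down: {{2,1,0,3}, {4,6,7,5}}.
replica_groups = [[2, 1, 0, 3], [4, 6, 7, 5]]
shard_ids = np.zeros(sum(len(g) for g in replica_groups), dtype=np.uint32)
for group in replica_groups:
  for position, replica_id in enumerate(group):
    shard_ids[replica_id] = position

print(shard_ids.tolist())  # [2, 1, 0, 3, 0, 3, 1, 2]
print(shard_ids[7])        # global id 7 writes the shard at position 2 of its group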
+ std::vector shard_ids(ag->replica_groups().size() * + ag->replica_groups()[0].replica_ids_size()); + for (const auto& group : ag->replica_groups()) { + for (int64 i = 0; i < group.replica_ids_size(); ++i) { + shard_ids[group.replica_ids(i)] = i; + } + } + auto id_table = comp->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR1(shard_ids))); + auto shard_id = comp->AddInstruction(HloInstruction::CreateDynamicSlice( + ShapeUtil::MakeShape(U32, {1}), id_table, {replica_or_global_id}, {1})); + shard_id = comp->AddInstruction( + HloInstruction::CreateReshape(ShapeUtil::MakeShape(U32, {}), shard_id)); + return shard_id; + }; + HloInstruction* shard_id; + if (ag->channel_id().has_value()) { + if (ag->use_global_device_ids()) { + auto pid = comp->AddInstruction(HloInstruction::CreatePartitionId()); + auto rid = comp->AddInstruction(HloInstruction::CreateReplicaId()); + auto pcount = comp->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR0(partition_count))); + auto global_id = comp->AddInstruction(HloInstruction::CreateBinary( + pid->shape(), HloOpcode::kAdd, pid, + comp->AddInstruction(HloInstruction::CreateBinary( + pid->shape(), HloOpcode::kMultiply, rid, pcount)))); + shard_id = shard_id_from_subgroup(global_id); + } else { + TF_RET_CHECK(!ag->replica_groups().empty()); + TF_RET_CHECK(ag->replica_groups()[0].replica_ids_size() == 1); + shard_id = comp->AddInstruction(HloInstruction::CreatePartitionId()); + } + } else { + shard_id = shard_id_from_subgroup( + comp->AddInstruction(HloInstruction::CreateReplicaId())); + } + start_indices[ag->all_gather_dimension()] = + comp->AddInstruction(HloInstruction::CreateBinary( + shard_id->shape(), HloOpcode::kMultiply, shard_id, + comp->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR0(shard_size))))); + auto dus = comp->AddInstruction(HloInstruction::CreateDynamicUpdateSlice( + zero->shape(), zero, ag->mutable_operand(0), start_indices)); + auto ar = comp->AddInstruction(HloInstruction::CreateAllReduce( + dus->shape(), {dus}, + MakeBinaryAdd(dus->shape().element_type(), comp->parent()), + ag->replica_groups(), + /*constrain_layout=*/ag->constrain_layout(), ag->channel_id(), + ag->use_global_device_ids())); + TF_RETURN_IF_ERROR(ag->ReplaceAllUsesWith(ar)); + TF_RETURN_IF_ERROR(comp->RemoveInstructionAndUnusedOperands(ag)); + return Status::OK(); +} + +StatusOr AllGatherDecomposer::Run(HloModule* module) { + bool changed = false; + for (auto comp : module->MakeNonfusionComputations()) { + for (auto hlo : comp->MakeInstructionPostOrder()) { + if (hlo->opcode() != HloOpcode::kAllGather) { + continue; + } + auto ag = Cast(hlo); + if (should_decompose_(*ag)) { + TF_RETURN_IF_ERROR(DecomposeAllGather(ag, comp)); + changed = true; + } + } + } + return changed; +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/service/all_gather_decomposer.h b/tensorflow/compiler/xla/service/all_gather_decomposer.h new file mode 100644 index 00000000000..6b20765c709 --- /dev/null +++ b/tensorflow/compiler/xla/service/all_gather_decomposer.h @@ -0,0 +1,48 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_ALL_GATHER_DECOMPOSER_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_ALL_GATHER_DECOMPOSER_H_ + +#include "tensorflow/compiler/xla/service/hlo_instructions.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_pass_interface.h" + +namespace xla { + +// AllGatherDecomposer is a pass which converts unsupported all-gathers into +// dynamic-update-slices and all-reduces. +class AllGatherDecomposer : public HloModulePass { + public: + explicit AllGatherDecomposer( + std::function should_decompose) + : should_decompose_(std::move(should_decompose)) {} + AllGatherDecomposer() + : should_decompose_( + [](const HloAllGatherInstruction& ag) { return true; }) {} + absl::string_view name() const override { return "all_gather_decomposer"; } + + // Run AllGatherDecomposer pass on computations in 'module'. + // Returns whether the 'module' was changed. + StatusOr Run(HloModule* module) override; + + private: + std::function should_decompose_; + int64 partition_count_; +}; + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_ALL_GATHER_DECOMPOSER_H_ diff --git a/tensorflow/compiler/xla/service/all_gather_decomposer_test.cc b/tensorflow/compiler/xla/service/all_gather_decomposer_test.cc new file mode 100644 index 00000000000..3df5e51a7c2 --- /dev/null +++ b/tensorflow/compiler/xla/service/all_gather_decomposer_test.cc @@ -0,0 +1,160 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/all_gather_decomposer.h" + +#include + +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_instructions.h" +#include "tensorflow/compiler/xla/service/hlo_matchers.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/compiler/xla/tests/test_utils.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/types.h" + +namespace xla { +namespace { + +using ::testing::AllOf; +namespace op = xla::testing::opcode_matchers; +using AllGatherDecomposerTest = HloTestBase; + +TEST_F(AllGatherDecomposerTest, CrossReplicaAllGather) { + const string module_str = R"( +HloModule module + +ENTRY entry { + param0 = f32[10,20] parameter(0) + ROOT ag = f32[10,80] all-gather(param0), replica_groups={}, dimensions={1} +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnUnverifiedModule((module_str))); + AllGatherDecomposer decomposer; + TF_ASSERT_OK_AND_ASSIGN(bool changed, decomposer.Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT( + module->entry_computation()->root_instruction(), + op::AllReduce(op::DynamicUpdateSlice( + op::Broadcast(op::Constant()), op::Parameter(0), op::Constant(), + op::Multiply(op::ReplicaId(), op::Constant())))); +} + +TEST_F(AllGatherDecomposerTest, CrossPartitionAllGather) { + const string module_str = R"( +HloModule module + +ENTRY entry { + param0 = f32[10,20] parameter(0) + ROOT ag = f32[10,80] all-gather(param0), replica_groups={{0}}, channel_id=1, + dimensions={1} +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnUnverifiedModule((module_str))); + AllGatherDecomposer decomposer; + TF_ASSERT_OK_AND_ASSIGN(bool changed, decomposer.Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT( + module->entry_computation()->root_instruction(), + op::AllReduce(op::DynamicUpdateSlice( + op::Broadcast(op::Constant()), op::Parameter(0), op::Constant(), + op::Multiply(op::PartitionId(), op::Constant())))); +} + +TEST_F(AllGatherDecomposerTest, CrossReplicaAllGatherWithTrivialGroup) { + const string module_str = R"( +HloModule module + +ENTRY entry { + param0 = f32[10,20] parameter(0) + ROOT ag = f32[10,80] all-gather(param0), replica_groups={{0,1,2,3}}, + dimensions={1} +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnUnverifiedModule((module_str))); + AllGatherDecomposer decomposer; + TF_ASSERT_OK_AND_ASSIGN(bool changed, decomposer.Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT( + module->entry_computation()->root_instruction(), + op::AllReduce(op::DynamicUpdateSlice( + op::Broadcast(op::Constant()), op::Parameter(0), op::Constant(), + op::Multiply(op::ReplicaId(), op::Constant())))); +} + +TEST_F(AllGatherDecomposerTest, CrossReplicaAllGatherWithSubgroups) { + const string module_str = R"( +HloModule module + +ENTRY entry { + param0 = f32[10,20] parameter(0) + ROOT ag = f32[10,80] all-gather(param0), + replica_groups={{2,1,0,3}, {4,6,7,5}}, dimensions={1} +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnUnverifiedModule((module_str))); + AllGatherDecomposer decomposer; + 
TF_ASSERT_OK_AND_ASSIGN(bool changed, decomposer.Run(module.get())); + EXPECT_TRUE(changed); + auto id = + AllOf(op::Shape("u32[]"), + op::Reshape(op::DynamicSlice(op::Constant(), op::ReplicaId()))); + EXPECT_THAT(module->entry_computation()->root_instruction(), + op::AllReduce(op::DynamicUpdateSlice( + op::Broadcast(op::Constant()), op::Parameter(0), + op::Constant(), op::Multiply(id, op::Constant())))); +} + +TEST_F(AllGatherDecomposerTest, CrossReplicaAllGatherWithSubgroupsGlobalIds) { + const string module_str = R"( +HloModule module + +ENTRY entry { + param0 = f32[10,20] parameter(0) + ROOT ag = f32[10,80] all-gather(param0), + replica_groups={{2,1,0,3}, {4,6,7,5}}, dimensions={1}, channel_id=1, + use_global_device_ids=true +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnUnverifiedModule((module_str))); + AllGatherDecomposer decomposer; + TF_ASSERT_OK_AND_ASSIGN(bool changed, decomposer.Run(module.get())); + EXPECT_TRUE(changed); + auto global_id = + op::Add(op::PartitionId(), op::Multiply(op::ReplicaId(), op::Constant())); + auto id = AllOf(op::Shape("u32[]"), + op::Reshape(op::DynamicSlice(op::Constant(), global_id))); + EXPECT_THAT(module->entry_computation()->root_instruction(), + op::AllReduce(op::DynamicUpdateSlice( + op::Broadcast(op::Constant()), op::Parameter(0), + op::Constant(), op::Multiply(id, op::Constant())))); +} + +} // namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/service/all_reduce_combiner_test.cc b/tensorflow/compiler/xla/service/all_reduce_combiner_test.cc index b486612ff83..0b41f374900 100644 --- a/tensorflow/compiler/xla/service/all_reduce_combiner_test.cc +++ b/tensorflow/compiler/xla/service/all_reduce_combiner_test.cc @@ -238,7 +238,7 @@ TEST_F(AllReduceCombinerTest, NoDependentCombination) { // Tests that AllReduce ops with different groups are not combined. 
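The expectations in the AllGatherDecomposer tests above all follow the same shape: all-reduce(dynamic-update-slice(broadcast(0), operand, ..., shard_id * shard_size)). A small numpy simulation (not XLA) of why that decomposition reproduces the original all-gather:

import numpy as np

# Each replica writes its f32[10,20] shard into a zeroed f32[10,80] buffer at
# offset shard_id * 20; summing the buffers (the all-reduce) yields the
# concatenation along the all-gather dimension.
shards = [np.full((10, 20), r, dtype=np.float32) for r in range(4)]
partials = []
for shard_id, shard in enumerate(shards):
  buf = np.zeros((10, 80), dtype=np.float32)          # broadcast(0) of the output shape
  buf[:, shard_id * 20:(shard_id + 1) * 20] = shard   # dynamic-update-slice
  partials.append(buf)
all_reduced = np.sum(partials, axis=0)                # all-reduce with a sum reduction
np.testing.assert_array_equal(all_reduced, np.concatenate(shards, axis=1))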
TEST_F(AllReduceCombinerTest, GroupAllReduce) { - auto module = CreateNewVerifiedModule(); + auto module = CreateNewVerifiedModule(TestName(), /*replica_count=*/4); HloComputation::Builder b(TestName()); HloComputation* reduction = MakeReduction(HloOpcode::kAdd, module.get()); diff --git a/tensorflow/compiler/xla/service/all_reduce_simplifier_test.cc b/tensorflow/compiler/xla/service/all_reduce_simplifier_test.cc index 2e03e67c59c..4914836b34a 100644 --- a/tensorflow/compiler/xla/service/all_reduce_simplifier_test.cc +++ b/tensorflow/compiler/xla/service/all_reduce_simplifier_test.cc @@ -78,8 +78,8 @@ test { ROOT tuple = (f32[8,16], f32[8,16], f32[8,16], f32[]) tuple(all-reduce, all-reduce.1, all-reduce.2, all-reduce.3) } )"; - TF_ASSERT_OK_AND_ASSIGN(auto module, - ParseAndReturnVerifiedModule(kModuleStr)); + TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule( + kModuleStr, /*replica_count=*/8)); AllReduceSimplifier simplifier(/*replica_count=*/8); ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); EXPECT_THAT( @@ -114,8 +114,8 @@ test { ROOT all-reduce.1 = f32[8,16] all-reduce(all-reduce), replica_groups={}, to_apply=sum } )"; - TF_ASSERT_OK_AND_ASSIGN(auto module, - ParseAndReturnVerifiedModule(kModuleStr)); + TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule( + kModuleStr, /*replica_count=*/8)); AllReduceSimplifier simplifier(/*replica_count=*/8); ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); EXPECT_THAT(module->entry_computation()->root_instruction(), @@ -155,8 +155,8 @@ test { ROOT tuple = (f32[8,16], f32[8,16], f32[8,16]) tuple(all-reduce, all-reduce.1, all-reduce.2) } )"; - TF_ASSERT_OK_AND_ASSIGN(auto module, - ParseAndReturnVerifiedModule(kModuleStr)); + TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule( + kModuleStr, /*replica_count=*/8)); AllReduceSimplifier simplifier(/*replica_count=*/8); ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); EXPECT_THAT( diff --git a/tensorflow/compiler/xla/service/ar_crs_combiner_test.cc b/tensorflow/compiler/xla/service/ar_crs_combiner_test.cc index a02d5a86a27..bfa8f1020e5 100644 --- a/tensorflow/compiler/xla/service/ar_crs_combiner_test.cc +++ b/tensorflow/compiler/xla/service/ar_crs_combiner_test.cc @@ -447,8 +447,9 @@ ENTRY %entrycomp (p: bf16[]) -> (f32[], f32[]) { } )"; - TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - ParseAndReturnVerifiedModule(module_str)); + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr module, + ParseAndReturnVerifiedModule(module_str, /*replica_count=*/2)); auto crs_before = module->entry_computation()->root_instruction()->operands()[0]; auto replica_groups_before = crs_before->replica_groups(); @@ -497,8 +498,9 @@ ENTRY %entrycomp (p: bf16[]) -> (f32[]) { } )"; - TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - ParseAndReturnVerifiedModule(module_str)); + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr module, + ParseAndReturnVerifiedModule(module_str, /*replica_count=*/2)); auto crs_before = module->entry_computation()->root_instruction()->operands()[0]; auto replica_groups_before = crs_before->replica_groups(); @@ -565,8 +567,9 @@ ENTRY %entrycomp (p: f32[2,1]) -> (f32[2], f32[2]) { } )"; - TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - ParseAndReturnVerifiedModule(module_str)); + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr module, + ParseAndReturnVerifiedModule(module_str, /*replica_count=*/2)); auto crs_before = module->entry_computation()->root_instruction()->operands()[0]; auto replica_groups_before = crs_before->replica_groups(); @@ -633,8 
+636,9 @@ ENTRY %entrycomp (p: f32[]) -> (f32[], f32[]) { } )"; - TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - ParseAndReturnVerifiedModule(module_str)); + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr module, + ParseAndReturnVerifiedModule(module_str, /*replica_count=*/2)); auto crs_before = module->entry_computation()->root_instruction()->operands()[0]; auto replica_groups_before = crs_before->replica_groups(); @@ -675,8 +679,9 @@ ENTRY %entrycomp (p: f32[]) -> (f32[]) { } )"; - TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - ParseAndReturnVerifiedModule(module_str)); + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr module, + ParseAndReturnVerifiedModule(module_str, /*replica_count=*/2)); auto crs_before = module->entry_computation()->root_instruction()->operands()[0]; auto replica_groups_before = crs_before->replica_groups(); @@ -756,8 +761,9 @@ ENTRY %entrycomp (p: f32[]) -> (f32[], f32[]) { } )"; - TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - ParseAndReturnVerifiedModule(module_str)); + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr module, + ParseAndReturnVerifiedModule(module_str, /*replica_count=*/2)); auto crs_before = module->entry_computation()->root_instruction()->operands()[0]; auto replica_groups_before = crs_before->replica_groups(); @@ -809,8 +815,9 @@ ENTRY %entrycomp (p: f32[]) -> (f32[]) { } )"; - TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - ParseAndReturnVerifiedModule(module_str)); + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr module, + ParseAndReturnVerifiedModule(module_str, /*replica_count=*/2)); auto crs_before = module->entry_computation()->root_instruction()->operands()[0]; auto replica_groups_before = crs_before->replica_groups(); @@ -891,8 +898,9 @@ ENTRY %entrycomp (p: f32[]) -> (f32[], f32[]) { } )"; - TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - ParseAndReturnVerifiedModule(module_str)); + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr module, + ParseAndReturnVerifiedModule(module_str, /*replica_count=*/2)); ArCrsCombiner combiner(/*num_spatial_partitions=*/2, /*num_replicas=*/2, /*spmd_partition=*/false); auto changed = combiner.Run(module.get()).ValueOrDie(); @@ -929,8 +937,9 @@ ENTRY %entrycomp (p: f32[]) -> (f32[]) { } )"; - TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - ParseAndReturnVerifiedModule(module_str)); + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr module, + ParseAndReturnVerifiedModule(module_str, /*replica_count=*/2)); ArCrsCombiner combiner(/*num_spatial_partitions=*/2, /*num_replicas=*/2, /*spmd_partition=*/true); auto changed = combiner.Run(module.get()).ValueOrDie(); @@ -987,8 +996,9 @@ ENTRY %entrycomp (p: f32[]) -> (f32[], f32[]) { } )"; - TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - ParseAndReturnVerifiedModule(module_str)); + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr module, + ParseAndReturnVerifiedModule(module_str, /*replica_count=*/2)); auto crs_before = module->entry_computation()->root_instruction()->operands()[0]; auto replica_groups_before = crs_before->replica_groups(); @@ -1062,8 +1072,9 @@ ENTRY %entrycomp (p: f32[]) -> (f32[], f32[]) { } )"; - TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - ParseAndReturnVerifiedModule(module_str)); + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr module, + ParseAndReturnVerifiedModule(module_str, /*replica_count=*/2)); auto crs_before = module->entry_computation()->root_instruction()->operands()[0]; auto replica_groups_before = crs_before->replica_groups(); @@ -1110,8 +1121,9 @@ ENTRY %entrycomp (p: f32[]) -> (f32[]) { } )"; - TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - 
ParseAndReturnVerifiedModule(module_str)); + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr module, + ParseAndReturnVerifiedModule(module_str, /*replica_count=*/2)); auto crs_before = module->entry_computation()->root_instruction()->operands()[0]; auto replica_groups_before = crs_before->replica_groups(); @@ -1180,8 +1192,9 @@ ENTRY %entrycomp (p: f32[]) -> (f32[], f32[]) { } )"; - TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - ParseAndReturnVerifiedModule(module_str)); + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr module, + ParseAndReturnVerifiedModule(module_str, /*replica_count=*/2)); auto crs_before = module->entry_computation()->root_instruction()->operands()[0]; auto replica_groups_before = crs_before->replica_groups(); @@ -1224,8 +1237,9 @@ ENTRY %entrycomp (p: f32[]) -> (f32[]) { } )"; - TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - ParseAndReturnVerifiedModule(module_str)); + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr module, + ParseAndReturnVerifiedModule(module_str, /*replica_count=*/2)); auto crs_before = module->entry_computation()->root_instruction()->operands()[0]; auto replica_groups_before = crs_before->replica_groups(); @@ -1312,8 +1326,9 @@ ENTRY %entrycomp (p: f32[]) -> (f32[], f32[]) { } )"; - TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - ParseAndReturnVerifiedModule(module_str)); + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr module, + ParseAndReturnVerifiedModule(module_str, /*replica_count=*/2)); auto crs_before = module->entry_computation()->root_instruction()->operands()[0]; auto replica_groups_before = crs_before->replica_groups(); @@ -1363,8 +1378,9 @@ ENTRY %entrycomp (p: f32[]) -> (f32[]) { } )"; - TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - ParseAndReturnVerifiedModule(module_str)); + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr module, + ParseAndReturnVerifiedModule(module_str, /*replica_count=*/2)); auto crs_before = module->entry_computation()->root_instruction()->operands()[0]; auto replica_groups_before = crs_before->replica_groups(); @@ -1452,8 +1468,9 @@ ENTRY %entrycomp (p: f32[]) -> (f32[], f32[]) { } )"; - TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - ParseAndReturnVerifiedModule(module_str)); + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr module, + ParseAndReturnVerifiedModule(module_str, /*replica_count=*/2)); auto crs_before = module->entry_computation()->root_instruction()->operands()[0]; auto replica_groups_before = crs_before->replica_groups(); @@ -1502,8 +1519,9 @@ ENTRY %entrycomp (p: f32[]) -> (f32[]) { } )"; - TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - ParseAndReturnVerifiedModule(module_str)); + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr module, + ParseAndReturnVerifiedModule(module_str, /*replica_count=*/2)); auto crs_before = module->entry_computation()->root_instruction()->operands()[0]; auto replica_groups_before = crs_before->replica_groups(); @@ -1579,8 +1597,9 @@ ENTRY %entrycomp (p: bf16[]) -> (f32[], f32[]) { } )"; - TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - ParseAndReturnVerifiedModule(module_str)); + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr module, + ParseAndReturnVerifiedModule(module_str, /*replica_count=*/1)); ArCrsCombiner combiner(/*num_spatial_partitions=*/2, /*num_replicas=*/1, /*spmd_partition=*/false); auto changed = combiner.Run(module.get()).ValueOrDie(); @@ -1616,8 +1635,9 @@ ENTRY %entrycomp (p: bf16[]) -> (f32[]) { } )"; - TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - ParseAndReturnVerifiedModule(module_str)); + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr module, + 
ParseAndReturnVerifiedModule(module_str, /*replica_count=*/1)); ArCrsCombiner combiner(/*num_spatial_partitions=*/2, /*num_replicas=*/1, /*spmd_partition=*/true); auto changed = combiner.Run(module.get()).ValueOrDie(); @@ -1691,8 +1711,9 @@ ENTRY %entrycomp (p: bf16[]) -> (f32[], f32[]) { } )"; - TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - ParseAndReturnVerifiedModule(module_str)); + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr module, + ParseAndReturnVerifiedModule(module_str, /*replica_count=*/2)); ArCrsCombiner combiner(/*num_spatial_partitions=*/2, /*num_replicas=*/2, /*spmd_partition=*/false); auto changed = combiner.Run(module.get()).ValueOrDie(); @@ -1719,8 +1740,9 @@ ENTRY %entrycomp (p: bf16[]) -> (f32[]) { } )"; - TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - ParseAndReturnVerifiedModule(module_str)); + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr module, + ParseAndReturnVerifiedModule(module_str, /*replica_count=*/2)); ArCrsCombiner combiner(/*num_spatial_partitions=*/2, /*num_replicas=*/2, /*spmd_partition=*/true); auto changed = combiner.Run(module.get()).ValueOrDie(); @@ -1739,14 +1761,17 @@ HloModule foobar ENTRY %entrycomp (p: f32[2,4]) -> f32[2,4] { %p = f32[2,4] parameter(0), sharding={replicated} - ROOT %all-reduce = f32[2,4] all-reduce(%p), replica_groups={{0,1}}, - to_apply=%sum.f32 + ROOT %all-reduce = f32[2,4] all-reduce(%p), to_apply=%sum.f32, + replica_groups={{0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31}} } )"; - TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - ParseAndReturnVerifiedModule(module_str)); - ArCrsCombiner combiner(/*num_spatial_partitions=*/4, /*num_replicas=*/64, + // Replacing replicated all-reduce is only triggered when there are enough + // replicas (currently > num_partitions * 8). 
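// Illustration (not part of this change): with the values used below, the
// threshold described above is satisfied, since
//   num_replicas = 32  >  num_spatial_partitions * 8 = 2 * 8 = 16,
// so the combiner rewrites the replicated all-reduce and divides the result
// by the partition count, which the IsAllFloat(2) check further down verifies.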
+ TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr module, + ParseAndReturnVerifiedModule(module_str, /*replica_count=*/32)); + ArCrsCombiner combiner(/*num_spatial_partitions=*/2, /*num_replicas=*/32, /*spmd_partition=*/true); auto changed = combiner.Run(module.get()).ValueOrDie(); EXPECT_TRUE(changed); @@ -1758,7 +1783,7 @@ ENTRY %entrycomp (p: f32[2,4]) -> f32[2,4] { auto ar = root->operand(0); auto divisor = root->operand(1)->operand(0); EXPECT_TRUE(ar->channel_id()); - EXPECT_TRUE(divisor->literal().IsAllFloat(4)); + EXPECT_TRUE(divisor->literal().IsAllFloat(2)); } TEST_F(ArCrsCombinerTest, AllReduceWithGlobalIdReplicaGroups) { @@ -1782,8 +1807,9 @@ ENTRY %entrycomp (p: bf16[]) -> (f32[]) { } )"; - TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - ParseAndReturnVerifiedModule(module_str)); + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr module, + ParseAndReturnVerifiedModule(module_str, /*replica_count=*/2)); ArCrsCombiner combiner(/*num_spatial_partitions=*/4, /*num_replicas=*/2, /*spmd_partition=*/true); auto changed = combiner.Run(module.get()).ValueOrDie(); diff --git a/tensorflow/compiler/xla/service/bfloat16_normalization_test.cc b/tensorflow/compiler/xla/service/bfloat16_normalization_test.cc index 78924908015..05d15fa1d07 100644 --- a/tensorflow/compiler/xla/service/bfloat16_normalization_test.cc +++ b/tensorflow/compiler/xla/service/bfloat16_normalization_test.cc @@ -275,7 +275,7 @@ TEST_F(BFloat16NormalizationTest, ResolveMixedPrecisionTupleAllReduce) { } TEST_F(BFloat16NormalizationTest, ResolveMixedPrecisionTupleAllToAllToBF16) { - auto module = CreateNewVerifiedModule(); + auto module = CreateNewVerifiedModule(TestName(), /*replica_count=*/2); auto builder = HloComputation::Builder(TestName()); Shape f32_shape = ShapeUtil::MakeShape(F32, {2, 4}); @@ -289,7 +289,7 @@ TEST_F(BFloat16NormalizationTest, ResolveMixedPrecisionTupleAllToAllToBF16) { replica_groups[0].add_replica_ids(1); HloInstruction* a2a = builder.AddInstruction(HloInstruction::CreateAllToAll( ShapeUtil::MakeTupleShape({bf16_shape, bf16_shape}), {a, a}, - replica_groups, absl::nullopt)); + replica_groups, /*constrain_layout=*/false, absl::nullopt)); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_TRUE(Normalize(module.get())); @@ -304,7 +304,7 @@ TEST_F(BFloat16NormalizationTest, ResolveMixedPrecisionTupleAllToAllToBF16) { } TEST_F(BFloat16NormalizationTest, ResolveMixedPrecisionTupleAllToAllToF32) { - auto module = CreateNewVerifiedModule(); + auto module = CreateNewVerifiedModule(TestName(), /*replica_count=*/2); auto builder = HloComputation::Builder(TestName()); Shape f32_shape = ShapeUtil::MakeShape(F32, {2, 4}); @@ -318,7 +318,7 @@ TEST_F(BFloat16NormalizationTest, ResolveMixedPrecisionTupleAllToAllToF32) { replica_groups[0].add_replica_ids(1); HloInstruction* a2a = builder.AddInstruction(HloInstruction::CreateAllToAll( ShapeUtil::MakeTupleShape({bf16_shape, f32_shape}), {a, a}, - replica_groups, absl::nullopt)); + replica_groups, /*constrain_layout=*/false, absl::nullopt)); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_TRUE(Normalize(module.get())); diff --git a/tensorflow/compiler/xla/service/bfloat16_support.cc b/tensorflow/compiler/xla/service/bfloat16_support.cc index abb695fa486..30d764225c2 100644 --- a/tensorflow/compiler/xla/service/bfloat16_support.cc +++ b/tensorflow/compiler/xla/service/bfloat16_support.cc @@ -79,6 +79,7 @@ bool BFloat16Support::EffectiveOperandPrecisionIsOutputPrecision( const HloInstruction& hlo, int64 operand_index) { switch 
(hlo.opcode()) { case HloOpcode::kAbs: + case HloOpcode::kAllGather: case HloOpcode::kAllToAll: case HloOpcode::kBroadcast: case HloOpcode::kClamp: diff --git a/tensorflow/compiler/xla/service/compile_only_service.cc b/tensorflow/compiler/xla/service/compile_only_service.cc index 8c76e912011..ce9c8a4ea62 100644 --- a/tensorflow/compiler/xla/service/compile_only_service.cc +++ b/tensorflow/compiler/xla/service/compile_only_service.cc @@ -91,6 +91,7 @@ CompileOnlyService::CompileAheadOfTime( TF_RETURN_IF_ERROR(options.static_device_assignment().Serialize( execution_options.mutable_device_assignment())); } + execution_options.set_use_spmd_partitioning(options.use_spmd_partitioning()); for (const AotXlaComputationInstance& instance : computations) { TF_RET_CHECK(instance.computation.has_host_program_shape()); *execution_options.mutable_shape_with_output_layout() = diff --git a/tensorflow/compiler/xla/service/compiler.cc b/tensorflow/compiler/xla/service/compiler.cc index 653f4555a77..f03b27cdcc7 100644 --- a/tensorflow/compiler/xla/service/compiler.cc +++ b/tensorflow/compiler/xla/service/compiler.cc @@ -28,6 +28,14 @@ namespace xla { /* static */ tensorflow::mutex Compiler::platform_compiler_mutex_( tensorflow::LINKER_INITIALIZED); +StatusOr< + std::tuple, std::unique_ptr>> +Compiler::RunHloPassesAndBufferAssignement( + std::unique_ptr module, se::StreamExecutor* executor, + se::DeviceMemoryAllocator* device_allocator) { + return Unimplemented("This compiler does not support this method"); +} + std::vector> Compiler::ComputeBackendConfigs(const HloInstruction& hlo, se::StreamExecutor* executor) const { diff --git a/tensorflow/compiler/xla/service/compiler.h b/tensorflow/compiler/xla/service/compiler.h index b2e1231e315..57b24e372e6 100644 --- a/tensorflow/compiler/xla/service/compiler.h +++ b/tensorflow/compiler/xla/service/compiler.h @@ -27,6 +27,7 @@ limitations under the License. #include #include "absl/types/span.h" +#include "tensorflow/compiler/xla/service/buffer_assignment.h" #include "tensorflow/compiler/xla/service/buffer_value.h" #include "tensorflow/compiler/xla/service/computation_placer.h" #include "tensorflow/compiler/xla/service/executable.h" @@ -75,6 +76,7 @@ class AotCompilationOptions { virtual int64 replica_count() const { return 0; } virtual int64 num_cores() const { return 0; } + virtual bool use_spmd_partitioning() const { return false; } // Optional allocator that may be used for allocating temp space on the device // during compilation. @@ -172,6 +174,21 @@ class Compiler { std::unique_ptr module, se::StreamExecutor* executor, se::DeviceMemoryAllocator* device_allocator) = 0; + // Runs HLO passes to optimize the given HloModule, perform scheduling and + // buffer assignment, returns the optimized module and the buffer assignments. + // This interface is intentionally narrow. + // + // If device_allocator is not null, the compiler may use it to allocate temp + // space on the device for use during compilation. For example, the compiler + // may allocate buffers on the device and then run variants of a given + // algorithm over those buffers, to see which variant is fastest. Any space + // allocated should be deallocated before this function returns. + virtual StatusOr< + std::tuple, std::unique_ptr>> + RunHloPassesAndBufferAssignement(std::unique_ptr module, + se::StreamExecutor* executor, + se::DeviceMemoryAllocator* device_allocator); + // Compiles the HLO module for execution on a device given by the executor, // and returns an executable object or an error status. 
No HLO passes are // applied to module. Generally a module should be passed through RunHloPasses diff --git a/tensorflow/compiler/xla/service/conditional_code_motion.cc b/tensorflow/compiler/xla/service/conditional_code_motion.cc new file mode 100644 index 00000000000..eecdcc851e9 --- /dev/null +++ b/tensorflow/compiler/xla/service/conditional_code_motion.cc @@ -0,0 +1,483 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/conditional_code_motion.h" + +#include +#include +#include +#include +#include +#include + +#include "absl/algorithm/container.h" +#include "absl/strings/str_cat.h" +#include "tensorflow/compiler/xla/literal.h" +#include "tensorflow/compiler/xla/map_util.h" +#include "tensorflow/compiler/xla/service/call_graph.h" +#include "tensorflow/compiler/xla/service/call_inliner.h" +#include "tensorflow/compiler/xla/service/hlo_casting_utils.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_dce.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_instructions.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/service/hlo_pass_pipeline.h" +#include "tensorflow/compiler/xla/service/tuple_simplifier.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/types.h" +#include "tensorflow/compiler/xla/util.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/errors.h" + +namespace xla { + +namespace { + +struct ConditionalBoundary { + ConditionalBoundary(HloInstruction* op, int64 op_index, HloInstruction* usr) + : operand(op), operand_index(op_index), user(usr) {} + // `operand` is one of `user`'s operand. + + // Instruction that remains in the conditional but one of its user + // is moved out of conditonal. + HloInstruction* operand; + // operand_index for `operand` in the `user`. + int64 operand_index; + // Instruction that moved out of conditional. + HloInstruction* user; +}; + +// Visit the root instructions to its operands follow BFS. +// Will visit an instructions after all its users have been visited. Parameters +// are not visited. +class BranchVisitor { + public: + explicit BranchVisitor(const HloComputation* branch_computation) { + HloInstruction* root_inst = branch_computation->root_instruction(); + worklist_.push_back(root_inst); + visited_.insert(root_inst); + for (auto parameter_inst : branch_computation->parameter_instructions()) { + parameter_instructions_.insert(parameter_inst); + } + } + // Get next intruction to visit. 
+ HloInstruction* GetNextInstruction() { + if (!worklist_.empty()) { + HloInstruction* inst = worklist_.front(); + worklist_.pop_front(); + return inst; + } + return nullptr; + } + + // Add operands of one instruction to worklist for further visit. + void AddInstructionOperands(HloInstruction* inst) { + int64 operand_count = inst->operand_count(); + for (int i = 0; i < operand_count; i++) { + HloInstruction* operand = inst->mutable_operand(i); + if (ContainsKey(visited_, operand)) { + continue; + } + bool all_user_visited = std::all_of( + operand->users().begin(), operand->users().end(), + [&](HloInstruction* user) { return ContainsKey(visited_, user); }); + + if (!all_user_visited) { + continue; + } + // Do not visit parameter_instructions. + if (ContainsKey(parameter_instructions_, operand)) { + // Add the operand and this instruction to the boundaries. + boundaries_.emplace_back(operand, i, inst); + continue; + } + + worklist_.push_back(operand); + visited_.insert(operand); + } + } + + // Add instruction and its users to conditional boundaries. + void AddInstructionToBoundary(HloInstruction* inst) { + for (auto user : inst->users()) { + boundaries_.emplace_back(inst, user->operand_index(inst), user); + } + } + + // Add instruction to the to be removed instructions set and vector. + void AddInstructionToHoist(HloInstruction* inst) { + instructions_to_hoist_set_.insert(inst); + instructions_to_hoist_.emplace_back(inst); + } + + // If visitor has next instruction to visit. + bool HasNextInstruction() const { return !worklist_.empty(); } + + // If there is no hoist intruction. + int64 HoistInstructionSize() { return instructions_to_hoist_.size(); } + + // Get boundaries of this branch. + const std::vector& boundaries() const { + return boundaries_; + } + + // Get instructions to hoist in this branch. + const std::vector& instructions_to_hoist() const { + return instructions_to_hoist_; + } + + // Get hoist instruction set in this branch. + const std::unordered_set& instructions_to_hoist_set() const { + return instructions_to_hoist_set_; + } + + private: + // worklist is the deque that contains instructions to be visited. + std::deque worklist_; + + // instructions that has been visited. + std::unordered_set visited_; + + // parameter instructions of the branch. + std::unordered_set parameter_instructions_; + + // Boundaries contains the set of instructions that its operand is within + // conditional but it can be hoist out of conditional. + std::vector boundaries_; + + // Instructions to hoist. + std::unordered_set instructions_to_hoist_set_; + + // Instructions to hoist, the order within this vector is BFS and + // an instruction's order will always be after its users. + std::vector instructions_to_hoist_; +}; + +// Returns true if `instruction` is worth hoisting out. +bool WorthHoisting(HloInstruction* instruction) { + for (const auto* operand : instruction->operands()) { + // Only move out instructions that won't share the same operand + // to avoid copy of the operand. + if (operand->user_count() > 1) { + return false; + } + } + switch (instruction->opcode()) { + case HloOpcode::kConvert: + // If Convert is after AllReduce, it is worth moving out AllReduce out + // of conditional for AR/CRS combine. If Convert is after other ops such + // as Dot or Convolutional, it is better to keep convert within + // conditional so that convert can be fused with Dot or Convolutional. + // + // TODO(b/154283721): figure out the scenario when convert can be fused + // with AllReduce out of conditional. 
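// Illustration (not part of this change) of the rule implemented just below:
//   convert(all-reduce(x))  -> hoisted, so the all-reduce feeding it can be
//                              combined (AR/CRS) outside the conditional;
//   convert(dot(x, y))      -> kept inside the branch, so the convert can
//                              still be fused with the dot.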
+ if (instruction->operand(0)->opcode() == HloOpcode::kAllReduce) { + return true; + } + return false; + case HloOpcode::kAllReduce: + case HloOpcode::kAdd: + case HloOpcode::kConstant: + case HloOpcode::kSubtract: + case HloOpcode::kMultiply: + case HloOpcode::kDivide: + case HloOpcode::kTuple: + case HloOpcode::kGetTupleElement: + return true; + default: + return false; + } +} + +// Compare if the instructions to be visited at each branches are identical. +bool InstructionWithinBranchIdentical( + const std::vector& instructions, bool is_layout_senstive) { + // Identical includes the shape of each operands are equal. + auto eq_operand = [&](const HloInstruction* a, const HloInstruction* b) { + bool eq_operands = is_layout_senstive + ? ShapeUtil::Equal(a->shape(), b->shape()) + : ShapeUtil::Compatible(a->shape(), b->shape()); + return eq_operands; + }; + + auto eq_computations = [](const HloComputation* a, const HloComputation* b) { + return *a == *b; + }; + + if (instructions[0] == nullptr) { + return false; + } + + if (instructions[0]->IsCrossModuleAllReduce()) { + return std::all_of( + instructions.begin(), instructions.end(), + [&](HloInstruction* instruction) { + if (!instruction->IsCrossModuleAllReduce()) { + return false; + } + auto old_channel_id = instruction->channel_id(); + instruction->set_channel_id(instructions[0]->channel_id()); + bool eq_instructions = instructions[0]->Identical( + *instruction, eq_operand, eq_computations, is_layout_senstive); + instruction->set_channel_id(old_channel_id); + return eq_instructions; + }); + } + + return std::all_of(instructions.begin(), instructions.end(), + [&](HloInstruction* instruction) { + return instructions[0]->Identical( + *instruction, eq_operand, eq_computations, + is_layout_senstive); + }); +} + +// Returns if all the visitors/branches has next instruction to visit. +bool HasNextInstruction(const std::vector& visitors) { + bool has_next = true; + for (const auto& visitor : visitors) { + has_next &= visitor.HasNextInstruction(); + } + return has_next; +} + +// Create tuple element as the new root of the branch. The tuple will contain +// the operands that can't move out of conditional but its user will be moved +// out of conditional. +HloInstruction* CreateNewRoot( + const std::vector& boundaries, + const std::unordered_set& instructions_to_hoist_set, + HloComputation* computation) { + std::vector elements; + elements.reserve(boundaries.size()); + for (auto boundary : boundaries) { + if (ContainsKey(instructions_to_hoist_set, boundary.user)) { + elements.push_back(boundary.operand); + } + } + return computation->AddInstruction(HloInstruction::CreateTuple(elements)); +} + +// Copy identical instructions within conditional outside of conditional. +void CopyIdenticalInstructionsOutOfConditional( + const std::vector& instructions_to_hoist, + HloComputation* conditional_parent, + absl::flat_hash_map* + hoisted_instructions) { + int64 instructions_size = instructions_to_hoist.size(); + // Visit the operands before its users and copy it, so that the copied + // user will point to the correct operand. + for (int64 i = instructions_size - 1; i >= 0; i--) { + HloInstruction* old_instruction = instructions_to_hoist[i]; + auto get_new_operand = [&](HloInstruction* old_operand) { + // If the operand can't be found in `instructions_to_hoist`, this + // operand will be in the `boundaries`, GetTupleElement instructions + // will be added later to replace this operand. 
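// Example with hypothetical instructions (not part of this change): if
// add.2 = add(add.1, c) is hoisted but add.1 stays in its branch, add.1 is a
// boundary operand. It is returned unchanged here; the branch root later
// becomes a tuple containing add.1, and CreateGetTupleElementAfterConditional
// rewires the hoisted add to read it through a get-tuple-element of the
// conditional.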
+ if (!ContainsKey(*hoisted_instructions, old_operand)) { + return old_operand; + } + return FindOrDie(*hoisted_instructions, old_operand); + }; + + absl::InlinedVector new_operands; + absl::c_transform(old_instruction->operands(), + std::back_inserter(new_operands), get_new_operand); + + HloInstruction* new_instruction = conditional_parent->AddInstruction( + old_instruction->CloneWithNewOperands(old_instruction->shape(), + new_operands)); + // Maps the instruction outside of conditional to the instruction + // inside of the conditional. + InsertOrDie(hoisted_instructions, old_instruction, new_instruction); + } +} + +// If there are instructions to hoist, the root of the conditional must be +// moved out. Change the users of the conditional to the hoisted instruction +// of the new root. +Status ChangeConditionalUsers( + HloInstruction* conditional, HloInstruction* old_root, + const absl::flat_hash_map& + hoisted_instructions) { + HloInstruction* new_root = FindOrDie(hoisted_instructions, old_root); + TF_RETURN_IF_ERROR(conditional->ReplaceAllUsesWith(new_root)); + return Status::OK(); +} + +// Insert GetTupleElement before the instructions whose operands might still +// be within the conditional. +Status CreateGetTupleElementAfterConditional( + const std::vector& boundaries, + const std::unordered_set& instructions_to_hoist_set, + const absl::flat_hash_map& + hoisted_instructions, + HloInstruction* conditional, HloComputation* computation) { + int boundary_instruction_size = boundaries.size(); + + // Inserts GetTupleElement before the boundary instructions. + for (int i = 0; i < boundary_instruction_size; i++) { + HloInstruction* gte = + computation->AddInstruction(HloInstruction::CreateGetTupleElement( + boundaries[i].operand->shape(), conditional, i)); + + HloInstruction* new_instruction = + FindOrDie(hoisted_instructions, boundaries[i].user); + TF_RETURN_IF_ERROR( + new_instruction->ReplaceOperandWith(boundaries[i].operand_index, gte)); + } + return Status::OK(); +} + +// Remove instructions to be hoisted out of the branch computation. +Status RemoveInstructionFromComputation( + const std::vector& instructions_to_hoist, + HloComputation* branch) { + // Will visit the instructions after its users. + for (auto* instruction : instructions_to_hoist) { + TF_RETURN_IF_ERROR(branch->RemoveInstruction(instruction)); + } + return Status::OK(); +} + +// Hoist identical ops out of the conditional. The definition of identical +// are the shape of the operands are identical and their properties are +// identical. Will start from the root instruction of each branch and get +// the identical ops to hoist. +StatusOr MergeIdenticalElements(HloInstruction* conditional, + bool is_layout_sensitive) { + int branch_count = conditional->branch_count(); + if (branch_count <= 0) { + return false; + } + + std::vector visitors; + visitors.reserve(branch_count); + // Visit instructions from the root instruction to the operands using BFS. + for (int i = 0; i < branch_count; i++) { + visitors.emplace_back(BranchVisitor(conditional->branch_computation(i))); + } + + // The instructions to be visited within each branch. + std::vector front_instructions(branch_count); + + while (HasNextInstruction(visitors)) { + for (int i = 0; i < branch_count; i++) { + front_instructions[i] = visitors[i].GetNextInstruction(); + } + // If two instructions has the same shape, opcode and its operands has the + // same shape, then this instruction can be moved out of conditional. 
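// For instance (not part of this change): if every branch's instruction at
// this visit step is an f32[] add whose operands are also f32[], the
// Identical() check above (operands compared by shape only) succeeds and the
// adds are hoisted in lockstep; if any branch differs in opcode or shape,
// the instructions are recorded as boundaries instead.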
+ if (WorthHoisting(front_instructions[0]) && + InstructionWithinBranchIdentical(front_instructions, + is_layout_sensitive)) { + for (int i = 0; i < branch_count; i++) { + visitors[i].AddInstructionOperands(front_instructions[i]); + visitors[i].AddInstructionToHoist(front_instructions[i]); + } + } else { + for (int i = 0; i < branch_count; i++) { + // If the ops are not identical, these ops and its users will + // be in the boundaries` of the conditional. These ops will be stayed + // within the conditional, but one its only user will be moved out + // of conditional. + visitors[i].AddInstructionToBoundary(front_instructions[i]); + } + } + } + + if (visitors[0].HoistInstructionSize() <= 1) { + return false; + } + + HloInstruction* old_root = + conditional->branch_computation(0)->root_instruction(); + HloComputation* conditional_parent = conditional->parent(); + // Maps instructions in the conditional body to instructions hoisted outside + // the conditional that compute the same value. + absl::flat_hash_map hoisted_instructions; + // Copy identical instructions out of the conditional. + CopyIdenticalInstructionsOutOfConditional(visitors[0].instructions_to_hoist(), + conditional_parent, + &hoisted_instructions); + // If there are instructions to hoist, the root of the conditional must be + // moved out. Change the users of the conditional to the hoisted instruction + // of the new root. + TF_RETURN_IF_ERROR( + ChangeConditionalUsers(conditional, old_root, hoisted_instructions)); + + // Create tuple element within each branch and set it as root. + for (int i = 0; i < branch_count; i++) { + HloInstruction* tuple = CreateNewRoot( + visitors[i].boundaries(), visitors[i].instructions_to_hoist_set(), + conditional->branch_computation(i)); + conditional->branch_computation(i)->set_root_instruction(tuple, true); + } + // Changes conditional instruction shape to the shape of the new root. + *conditional->mutable_shape() = + conditional->branch_computation(0)->root_instruction()->shape(); + + // Insert GetTupleElement before the instructions whose operands might still + // be within the conditional. + TF_RETURN_IF_ERROR(CreateGetTupleElementAfterConditional( + visitors[0].boundaries(), visitors[0].instructions_to_hoist_set(), + hoisted_instructions, conditional, conditional_parent)); + + // Remove hoist instructions from the branches. + for (int i = 0; i < branch_count; i++) { + TF_RETURN_IF_ERROR( + RemoveInstructionFromComputation(visitors[i].instructions_to_hoist(), + conditional->branch_computation(i))); + } + + return true; +} + +} // namespace + +StatusOr ConditionalCodeMotion::Run(HloModule* module) { + bool changed = false; + + // Gather all the conditional ops in our module. We do this ahead of time so + // we don't have to worry about mutating the lists of computations or + // instructions as we iterate. 
+ std::vector conditional_ops; + for (auto* comp : module->MakeComputationPostOrder()) { + for (auto* instr : comp->MakeInstructionPostOrder()) { + if (instr->opcode() == HloOpcode::kConditional) { + conditional_ops.push_back(instr); + } + } + } + + for (HloInstruction* conditional_op : conditional_ops) { + TF_ASSIGN_OR_RETURN(bool result, MergeIdenticalElements( + conditional_op, is_layout_sensitive_)); + changed |= result; + } + + if (changed) { + HloPassPipeline subpipeline("after_conditional_code_motion"); + subpipeline.AddPass(); + subpipeline.AddPass(); + TF_ASSIGN_OR_RETURN(bool cleanup_changed, subpipeline.Run(module)); + changed |= cleanup_changed; + } + + return changed; +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/service/conditional_code_motion.h b/tensorflow/compiler/xla/service/conditional_code_motion.h new file mode 100644 index 00000000000..1197a8b3620 --- /dev/null +++ b/tensorflow/compiler/xla/service/conditional_code_motion.h @@ -0,0 +1,49 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CONDITIONAL_CODE_MOTION_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_CONDITIONAL_CODE_MOTION_H_ + +#include "absl/strings/string_view.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_pass_interface.h" +#include "tensorflow/compiler/xla/statusor.h" + +namespace xla { + +// HLO pass that moves identical ops out of conditional. +// - The definition of identical are the shape of the operands are identical +// and their properties are identical. +// - Currently, only some types of instructions is supported. +// TODO(b/154283721): relax non-sharable operand constraint and avoid copies in +// the new root. +// - Only the identical ops that won't share operands with other ops will +// be moved out of conditional. +class ConditionalCodeMotion : public HloModulePass { + public: + // If is_layout_sensitive is true, then the hoist process preserves layout + // during identical comparison. Otherwise, layout is ignored. + explicit ConditionalCodeMotion(bool is_layout_sensitive = true) + : is_layout_sensitive_(is_layout_sensitive) {} + absl::string_view name() const override { return "conditional-code-motion"; } + StatusOr Run(HloModule* module) override; + + private: + const bool is_layout_sensitive_; +}; + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_CONDITIONAL_CODE_MOTION_H_ diff --git a/tensorflow/compiler/xla/service/conditional_code_motion_test.cc b/tensorflow/compiler/xla/service/conditional_code_motion_test.cc new file mode 100644 index 00000000000..4a52303a42a --- /dev/null +++ b/tensorflow/compiler/xla/service/conditional_code_motion_test.cc @@ -0,0 +1,413 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/conditional_code_motion.h" + +#include +#include + +#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_matchers.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/test.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/compiler/xla/types.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/types.h" + +namespace xla { +namespace { + +using ConditionalCodeMotionTest = HloTestBase; +namespace op = xla::testing::opcode_matchers; + +TEST_F(ConditionalCodeMotionTest, DoNotMoveConvertOut) { + absl::string_view hlo_string = + R"( +HloModule RemoveDotOpOut + +on_true { + %arg_tuple.1 = (f32[93184,4]{1,0}) parameter(0) + %get-tuple-element.1 = f32[93184,4]{1,0} get-tuple-element(%arg_tuple.1), index=0 + %reshape.8493 = f32[2,512,364]{2,1,0} reshape(f32[93184,4]{1,0} %get-tuple-element.1) + %convert.2894 = bf16[2,512,364]{2,1,0} convert(f32[2,512,364]{2,1,0} %reshape.8493) + ROOT %tuple.1 = ( bf16[2,512,364]{2,1,0}) tuple(%convert.2894) +} + +on_false { + %arg_tuple.2 = (f32[93184,4]{1,0}) parameter(0) + %get-tuple-element.3 = f32[93184,4]{1,0} get-tuple-element(%arg_tuple.2), index=0 + %reshape.9717 = f32[2,512,364]{2,1,0} reshape(f32[93184,4]{1,0} %get-tuple-element.3) + %convert.3604 = bf16[2,512,364]{2,1,0} convert(f32[2,512,364]{2,1,0} %reshape.9717), metadata={op_type="Cast" op_name="gradients/Cast_125_grad/Cast"} + ROOT %tuple.2 = (bf16[2,512,364]{2,1,0}) tuple(%convert.3604) +} + +ENTRY main { + pred.1 = pred[] parameter(0) + arg_tuple.11 = (f32[93184,4]{1,0}) parameter(1) + arg_tuple.22 = (f32[93184,4]{1,0}) parameter(2) + conditional = (bf16[2,512,364]{2,1,0}) conditional(pred.1, arg_tuple.11, arg_tuple.22), true_computation=on_true, false_computation=on_false + get-first-index = bf16[2,512,364]{2,1,0} get-tuple-element(conditional), index=0 + ROOT result = (bf16[2,512,364]{2,1,0}) tuple(get-first-index) +} +)"; + auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); + ConditionalCodeMotion pass; + ASSERT_FALSE(pass.Run(&*module).ValueOrDie()); +} + +TEST_F(ConditionalCodeMotionTest, UserShareOperandCannotBeMoved) { + absl::string_view hlo_string = + R"( +HloModule RemoveIdenticalInstruction + +on_true { + arg_tuple.1 = (f32[]) parameter(0) + get-tuple-element.1 = f32[] get-tuple-element(arg_tuple.1), index=0 + constant.1 = f32[] constant(1) + constant.2 = f32[] constant(2) + constant.3 = f32[] constant(3) + constant.4 = f32[] constant(4) + constant.5 = f32[] constant(5) + add.1 = f32[] 
add(get-tuple-element.1, constant.1) + add.2 = f32[] add(add.1, constant.2) + add.3 = f32[] add(add.1, constant.3) + add.4 = f32[] add(add.3, constant.5) + multiply.1 = f32[] multiply(add.2, constant.4) + ROOT tuple.6 = (f32[], f32[]) tuple(multiply.1, add.4) +} + +on_false { + arg_tuple.2 = (f32[]) parameter(0) + get-tuple-element.2 = f32[] get-tuple-element(arg_tuple.2), index=0 + constant.6 = f32[] constant(1) + constant.7 = f32[] constant(2) + constant.8 = f32[] constant(3) + constant.9 = f32[] constant(4) + constant.10 = f32[] constant(5) + add.4 = f32[] add(get-tuple-element.2, constant.6) + sub.1 = f32[] subtract(add.4, constant.7) + add.5 = f32[] add(add.4, constant.8) + add.6 = f32[] add(add.5, constant.10) + multiply.2 = f32[] multiply(sub.1, constant.9) + ROOT tuple.6 = (f32[], f32[]) tuple(multiply.2, add.6) +} + +ENTRY main { + pred.1 = pred[] parameter(0) + tuple.1 = (f32[]) parameter(1) + tuple.2 = (f32[]) parameter(2) + conditional = (f32[], f32[]) + conditional(pred.1, tuple.1, tuple.2), true_computation=on_true, + false_computation=on_false + get-first-index = f32[] get-tuple-element(conditional), index=0 + get-second-index = f32[] get-tuple-element(conditional), index=1 + ROOT result = (f32[], f32[]) tuple(get-first-index, get-second-index) +} +)"; + auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); + ConditionalCodeMotion pass; + ASSERT_TRUE(pass.Run(&*module).ValueOrDie()); + + const HloInstruction* conditional = + FindInstruction(module.get(), "conditional"); + const HloComputation* on_true = conditional->branch_computation(0); + ASSERT_EQ(on_true->instruction_count(), 9); + const HloComputation* on_false = conditional->branch_computation(1); + ASSERT_EQ(on_false->instruction_count(), 9); + + // Check only one add and multiply is moved out. 
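// (Illustration, not part of this change) add.2 and add.3 both read add.1,
// and sub.1 and add.5 both read add.4, so those users fail the single-user
// operand check in WorthHoisting and stay inside their branches; only the
// root tuple, the multiply, the final add and their constants are hoisted,
// which leaves the nine instructions counted above and the root matched
// below.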
+ HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT( + root, + AllOf(op::Tuple( + op::Multiply(op::GetTupleElement(op::Conditional()), op::Constant()), + op::Add(op::GetTupleElement(op::Conditional()), op::Constant())))); +} + +TEST_F(ConditionalCodeMotionTest, ConditionalRootElementChanged) { + absl::string_view hlo_string = + R"( +HloModule RemoveIdenticalInstruction + +on_true { + arg_tuple.1 = (f32[]) parameter(0) + get-tuple-element.1 = f32[] get-tuple-element(arg_tuple.1), index=0 + constant.1 = f32[] constant(1) + constant.2 = f32[] constant(2) + add.1 = f32[] add(get-tuple-element.1, constant.1) + add.2 = f32[] add(get-tuple-element.1, constant.2) + add.3 = f32[] add(add.1, add.2) + ROOT tuple.3 = (f32[]) tuple(add.3) +} + +on_false { + arg_tuple.2 = (f32[]) parameter(0) + get-tuple-element.2 = f32[] get-tuple-element(arg_tuple.2), index=0 + constant.3 = f32[] constant(1) + constant.4 = f32[] constant(2) + add.4 = f32[] add(get-tuple-element.2, constant.3) + add.5 = f32[] add(get-tuple-element.2, constant.4) + add.6 = f32[] add(add.4, add.5) + ROOT tuple.4 = (f32[]) tuple(add.6) +} + +ENTRY main { + pred.1 = pred[] parameter(0) + tuple.1 = (f32[]) parameter(1) + tuple.2 = (f32[]) parameter(2) + conditional = (f32[]) + conditional(pred.1, tuple.1, tuple.2), true_computation=on_true, + false_computation=on_false + get-first-index = f32[] get-tuple-element(conditional), index=0 + ROOT result = (f32[]) tuple(get-first-index) +} +)"; + auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); + ConditionalCodeMotion pass; + ASSERT_TRUE(pass.Run(&*module).ValueOrDie()); + const HloInstruction* conditional = + FindInstruction(module.get(), "conditional"); + const HloComputation* on_true = conditional->branch_computation(0); + ASSERT_EQ(on_true->instruction_count(), 7); + const HloComputation* on_false = conditional->branch_computation(1); + ASSERT_EQ(on_false->instruction_count(), 7); + + // add.3 in on_true will be moved out, add.1 and add.2 will be in condtional + // root. 
+ ASSERT_TRUE(ShapeUtil::Compatible( + conditional->shape(), + ShapeUtil::MakeTupleShape( + {ShapeUtil::MakeShape(F32, {}), ShapeUtil::MakeShape(F32, {})}))); +} + +TEST_F(ConditionalCodeMotionTest, ConditionalIsRootInstruction) { + absl::string_view hlo_string = + R"( +HloModule RemoveIdenticalInstruction + +on_true { + arg_tuple.1 = (f32[]) parameter(0) + get-tuple-element.1 = f32[] get-tuple-element(arg_tuple.1), index=0 + constant.1 = f32[] constant(1) + constant.2 = f32[] constant(2) + constant.3 = f32[] constant(3) + constant.4 = f32[] constant(4) + constant.5 = f32[] constant(5) + add.1 = f32[] add(get-tuple-element.1, constant.1) + add.2 = f32[] add(add.1, constant.2) + add.3 = f32[] add(add.1, constant.3) + add.4 = f32[] add(add.3, constant.5) + multiply.1 = f32[] multiply(add.2, constant.4) + ROOT tuple.6 = (f32[], f32[]) tuple(multiply.1, add.4) +} + +on_false { + arg_tuple.2 = (f32[]) parameter(0) + get-tuple-element.2 = f32[] get-tuple-element(arg_tuple.2), index=0 + constant.6 = f32[] constant(1) + constant.7 = f32[] constant(2) + constant.8 = f32[] constant(3) + constant.9 = f32[] constant(4) + constant.10 = f32[] constant(5) + add.4 = f32[] add(get-tuple-element.2, constant.6) + sub.1 = f32[] subtract(add.4, constant.7) + add.5 = f32[] add(add.4, constant.8) + add.6 = f32[] add(add.5, constant.10) + multiply.2 = f32[] multiply(sub.1, constant.9) + ROOT tuple.6 = (f32[], f32[]) tuple(multiply.2, add.6) +} + +ENTRY main { + pred.1 = pred[] parameter(0) + tuple.1 = (f32[]) parameter(1) + tuple.2 = (f32[]) parameter(2) + ROOT conditional = (f32[], f32[]) + conditional(pred.1, tuple.1, tuple.2), true_computation=on_true, + false_computation=on_false +} +)"; + auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); + ConditionalCodeMotion pass; + ASSERT_TRUE(pass.Run(&*module).ValueOrDie()); + + const HloInstruction* conditional = + FindInstruction(module.get(), "conditional"); + const HloComputation* on_true = conditional->branch_computation(0); + ASSERT_EQ(on_true->instruction_count(), 9); + const HloComputation* on_false = conditional->branch_computation(1); + ASSERT_EQ(on_false->instruction_count(), 9); + + // Check only one add and multiply is moved out. + // add.3 and add.5 can't be moved out because they share operands with + // other instructions. 
+ HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT( + root, + AllOf(op::Tuple( + op::Multiply(op::GetTupleElement(op::Conditional()), op::Constant()), + op::Add(op::GetTupleElement(op::Conditional()), op::Constant())))); +} + +TEST_F(ConditionalCodeMotionTest, LayoutMisMatchCannotMovedOut) { + absl::string_view hlo_string = + R"( +HloModule LayoutMisMatchCannotMovedOut + +%add.64 (x.139: bf16[], y.139: bf16[]) -> bf16[] { + %x.139 = bf16[]{:T(512)} parameter(0) + %y.139 = bf16[]{:T(512)} parameter(1) + ROOT %add.44073 = bf16[]{:T(512)} add(bf16[]{:T(512)} %x.139, bf16[]{:T(512)} %y.139) +} + +%add.181 (x.256: bf16[], y.256: bf16[]) -> bf16[] { + %x.256 = bf16[]{:T(512)} parameter(0) + %y.256 = bf16[]{:T(512)} parameter(1) + ROOT %add.44842 = bf16[]{:T(512)} add(bf16[]{:T(512)} %x.256, bf16[]{:T(512)} %y.256) +} + +on_true { + %arg_tuple.1 = (bf16[93184,4]{1,0}) parameter(0) + %get-tuple-element.1 = bf16[93184,4]{1,0} get-tuple-element(%arg_tuple.1), index=0 + %all-reduce.1 = bf16[93184,4]{1,0} + all-reduce(bf16[93184,4]{1,0} %get-tuple-element.1), + channel_id=188, replica_groups={{0,1}}, use_global_device_ids=true, + to_apply=%add.64 + %convert.2894 = f32[93184,4]{1,0} convert(bf16[93184, 4]{1,0} %all-reduce.1) + ROOT %tuple.1 = (f32[93184,4]{1,0}) tuple(%convert.2894) +} + +on_false { + %arg_tuple.2 = (bf16[93184,4]{1,0}) parameter(0) + %get-tuple-element.3 = bf16[93184,4]{1,0} get-tuple-element(%arg_tuple.2), index=0 + %copy.1 = bf16[93184,4]{0,1} copy(bf16[93184,4]{1,0} %get-tuple-element.3) + %all-reduce.2 = bf16[93184,4]{0, 1} + all-reduce(bf16[93184,4]{0, 1} %copy.1), + channel_id=188, replica_groups={{0,1}}, use_global_device_ids=true, + to_apply=%add.181 + %convert.3604 = f32[93184,4]{0,1} convert(bf16[93184,4]{0,1} %all-reduce.2) + ROOT %tuple.2 = (f32[93184,4]{0,1}) tuple(%convert.3604) +} + +ENTRY main { + pred.1 = pred[] parameter(0) + arg_tuple.11 = (bf16[93184,4]{1,0}) parameter(1) + arg_tuple.22 = (bf16[93184,4]{1,0}) parameter(2) + conditional = (f32[93184,4]{1,0}) conditional(pred.1, arg_tuple.11, arg_tuple.22), true_computation=on_true, false_computation=on_false + get-first-index = f32[93184,4]{1,0} get-tuple-element(conditional), index=0 + ROOT result = (f32[93184,4]{1,0}) tuple(get-first-index) +} +)"; + + auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); + ConditionalCodeMotion pass; + ASSERT_FALSE(pass.Run(&*module).ValueOrDie()); +} + +TEST_F(ConditionalCodeMotionTest, MoveCrossModuleAllReduceOut) { + absl::string_view hlo_string = + R"( +HloModule RemoveIdenticalInstruction + +%add.64 (x.139: bf16[], y.139: bf16[]) -> bf16[] { + %x.139 = bf16[]{:T(512)} parameter(0) + %y.139 = bf16[]{:T(512)} parameter(1) + ROOT %add.44073 = bf16[]{:T(512)} add(bf16[]{:T(512)} %x.139, bf16[]{:T(512)} %y.139) +} + +%add.181 (x.256: bf16[], y.256: bf16[]) -> bf16[] { + %x.256 = bf16[]{:T(512)} parameter(0) + %y.256 = bf16[]{:T(512)} parameter(1) + ROOT %add.44842 = bf16[]{:T(512)} add(bf16[]{:T(512)} %x.256, bf16[]{:T(512)} %y.256) +} + +on_true { + arg_tuple.1 = (bf16[2,54,168,128], bf16[2,52,168,128]) parameter(0) + get-tuple-element.11 = bf16[2,54,168,128] get-tuple-element(arg_tuple.1), index=0 + get-tuple-element.12 = bf16[2,52,168,128] get-tuple-element(arg_tuple.1), index=1 + convolution.1 = bf16[3,3,128,128] convolution(bf16[2,54,168,128] + get-tuple-element.11, bf16[2,52,168,128] + get-tuple-element.12), window={size=52x168 pad=0_0x1_1}, + dim_labels=f01b_i01o->01bf + all-reduce.1 = bf16[3,3,128,128] + 
all-reduce(bf16[3,3,128,128] %convolution.1), + channel_id=188, replica_groups={{0,1}}, use_global_device_ids=true, + to_apply=%add.64, metadata={op_type="Conv2DBackpropFilter" + op_name="gradients/resnet50/conv2d_22/Conv2D_grad/Conv2DBackpropFilter"} + convert.1 = f32[3,3,128,128] convert(bf16[3,3,128,128] %all-reduce.1), + metadata={op_type="Cast" op_name="Cast_15"} + ROOT tuple.1 = (f32[3,3,128,128]) tuple(convert.1) +} + +on_false { + arg_tuple.2 = (bf16[2,86,104,128], bf16[2,84,104,128]) parameter(0) + get-tuple-element.21 = bf16[2,86,104,128] + get-tuple-element(arg_tuple.2), index=0 + get-tuple-element.22 = bf16[2,84,104,128] + get-tuple-element(arg_tuple.2), index=1 + convolution.2 = bf16[3,3,128,128] + convolution(bf16[2,86,104,128] get-tuple-element.21, bf16[2,84,104,128] + get-tuple-element.22), window={size=84x104 pad=0_0x1_1}, + dim_labels=f01b_i01o->01bf + all-reduce.2 = bf16[3,3,128,128] + all-reduce(bf16[3,3,128,128] %convolution.2), + channel_id=485, replica_groups={{0,1}}, use_global_device_ids=true, + to_apply=%add.181, metadata={op_type="Conv2DBackpropFilter" + op_name="gradients/resnet50/conv2d_22/Conv2D_grad/Conv2DBackpropFilter"} + convert.2 = f32[3,3,128,128] + convert(bf16[3,3,128,128] %all-reduce.2), + metadata={op_type="Cast" op_name="Cast_15"} + ROOT tuple.2 = (f32[3,3,128,128]) tuple(convert.2) +} + +ENTRY main { + pred.1 = pred[] parameter(0) + arg_tuple.3 = (bf16[2,54,168,128], bf16[2,52,168,128]) parameter(1) + arg_tuple.4 = (bf16[2,86,104,128], bf16[2,84,104,128]) parameter(2) + conditional = (f32[3,3,128,128]) + conditional(pred.1, arg_tuple.3, arg_tuple.4), true_computation=on_true, + false_computation=on_false + get-first-index = f32[3,3,128,128] + get-tuple-element(conditional), index=0 + ROOT result = (f32[3,3,128,128]) tuple(get-first-index) +} +)"; + auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); + ConditionalCodeMotion pass; + ASSERT_TRUE(pass.Run(&*module).ValueOrDie()); + const HloInstruction* conditional = + FindInstruction(module.get(), "conditional"); + const HloComputation* on_true = conditional->branch_computation(0); + ASSERT_EQ(on_true->instruction_count(), 5); + const HloComputation* on_false = conditional->branch_computation(1); + ASSERT_EQ(on_false->instruction_count(), 5); + + // Checks if conditional shape has changed. 
+ ASSERT_TRUE(ShapeUtil::Compatible( + conditional->shape(), ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape( + BF16, {3, 3, 128, 128})}))); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Tuple(op::Convert(op::AllReduce( + op::GetTupleElement(op::Conditional())))))); +} + +} // namespace + +} // namespace xla diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD index 7e1b8a1e7ee..2f432cd9356 100644 --- a/tensorflow/compiler/xla/service/cpu/BUILD +++ b/tensorflow/compiler/xla/service/cpu/BUILD @@ -35,6 +35,7 @@ filegroup( srcs = [ "runtime_fp16.cc", "runtime_key_value_sort.cc", + "runtime_pow.cc", "runtime_single_threaded_conv2d.cc", "runtime_single_threaded_fft.cc", "runtime_single_threaded_matmul.cc", @@ -49,6 +50,7 @@ filegroup( "runtime_fft_impl.h", "runtime_fp16.h", "runtime_key_value_sort.h", + "runtime_pow.h", "runtime_single_threaded_conv2d.h", "runtime_single_threaded_fft.h", "runtime_single_threaded_matmul.h", @@ -144,6 +146,7 @@ cc_library( "//tensorflow/compiler/xla/service:conditional_simplifier", "//tensorflow/compiler/xla/service:convolution_group_converter", "//tensorflow/compiler/xla/service:dot_decomposer", + "//tensorflow/compiler/xla/service:dynamic_padder", "//tensorflow/compiler/xla/service:dynamic_index_splitter", "//tensorflow/compiler/xla/service:executable", "//tensorflow/compiler/xla/service:flatten_call_graph", @@ -204,6 +207,7 @@ cc_library( ":cpu_runtime", ":orc_jit_memory_mapper", ":runtime_fp16", + ":runtime_pow", ":runtime_conv2d", ":runtime_conv2d_mkl", ":runtime_fft", @@ -250,6 +254,21 @@ cc_library( ], ) +cc_library( + name = "runtime_pow", + srcs = [ + "runtime_pow.cc", + ], + hdrs = [ + "runtime_pow.h", + ], + copts = runtime_copts(), + deps = [ + "//tensorflow/core/platform:macros", + "//tensorflow/core/platform:types", + ], +) + cc_library( name = "cpu_executable", srcs = ["cpu_executable.cc"], @@ -357,6 +376,7 @@ cc_library( ], hdrs = ["target_machine_features.h"], deps = [ + "//tensorflow/compiler/xla:cpu_function_runtime", "//tensorflow/compiler/xla:shape_util", "//tensorflow/core:lib", "@com_google_absl//absl/container:flat_hash_map", diff --git a/tensorflow/compiler/xla/service/cpu/compiler_functor.cc b/tensorflow/compiler/xla/service/cpu/compiler_functor.cc index 5e536d362d9..a21ace0d8b2 100644 --- a/tensorflow/compiler/xla/service/cpu/compiler_functor.cc +++ b/tensorflow/compiler/xla/service/cpu/compiler_functor.cc @@ -198,11 +198,6 @@ void CompilerFunctor::AddTargetInfoPasses( target_library_info_impl->addVectorizableFunctions( VectorFunctionsForTargetLibraryInfoImpl()); - // TODO(b/136651482): Disable pow(f) so LLVM doesn't transform it into powi. - // It would be better to provide our own powi. - target_library_info_impl->setUnavailable(llvm::LibFunc_pow); - target_library_info_impl->setUnavailable(llvm::LibFunc_powf); - passes->add( new llvm::TargetLibraryInfoWrapperPass(*target_library_info_impl)); passes->add(createTargetTransformInfoWrapperPass( diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc index 53d0d14f598..fe769bbdd2a 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc @@ -72,6 +72,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/dot_decomposer.h" #include "tensorflow/compiler/xla/service/dump.h" #include "tensorflow/compiler/xla/service/dynamic_index_splitter.h" +#include "tensorflow/compiler/xla/service/dynamic_padder.h" #include "tensorflow/compiler/xla/service/flatten_call_graph.h" #include "tensorflow/compiler/xla/service/hlo.pb.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" @@ -239,7 +240,6 @@ Status CpuCompiler::RunHloPassesThroughLayoutAssn( HloPassPipeline pipeline("HLO passes through layout assignment"); pipeline.AddInvariantChecker(/*layout_sensitive=*/false, /*allow_mixed_precision=*/false); - // Expand random number generation. pipeline.AddPass(); pipeline.AddPass(RandomAlgorithm::RNG_PHILOX); @@ -273,6 +273,13 @@ Status CpuCompiler::RunHloPassesThroughLayoutAssn( pipeline.AddPass( cost_model, /*convert_batch_groups_only=*/false); + pipeline.AddPass(); + pipeline.AddPass( + /*rewrite_training_op=*/true, + /*rewrite_inference_op=*/true, + /*rewrite_grad_op=*/true); + pipeline.AddPass(); + pipeline.AddPass(); pipeline.AddPass(target_machine_features); { auto& pass = @@ -281,12 +288,6 @@ Status CpuCompiler::RunHloPassesThroughLayoutAssn( /*allow_mixed_precision=*/false); pass.AddPass(); - pass.AddPass(); - pass.AddPass( - /*rewrite_training_op=*/true, - /*rewrite_inference_op=*/true, - /*rewrite_grad_op=*/true); - pipeline.AddPass(); AlgebraicSimplifierOptions options; options.set_enable_dot_strength_reduction(false); pass.AddPass(options); @@ -402,8 +403,9 @@ Status CpuCompiler::RunHloPasses(HloModule* module, bool is_aot_compile, namespace { // Align buffers to 16-byte boundaries. -constexpr int64 kMemoryAlignment = 16; -auto memory_alignment = [](LogicalBuffer::Color) { return kMemoryAlignment; }; +int64 memory_alignment(LogicalBuffer::Color) { + return cpu_function_runtime::kMinAlign; +} llvm::TargetOptions CompilerTargetOptions( const HloModuleConfig& module_config) { @@ -521,6 +523,33 @@ StatusOr> CpuCompiler::RunHloPasses( return std::move(module); } +StatusOr< + std::tuple, std::unique_ptr>> +CpuCompiler::RunHloPassesAndBufferAssignement( + std::unique_ptr module, se::StreamExecutor* executor, + se::DeviceMemoryAllocator* device_allocator) { + TF_ASSIGN_OR_RETURN( + module, RunHloPasses(std::move(module), executor, device_allocator)); + + // Select an order for emitting the HLO instructions for each computation. + // Using this sequence enables tighter buffer liveness analysis and reduced + // memory usage (as compared to using DependencyHloOrdering). + TF_ASSIGN_OR_RETURN(HloSchedule schedule, + ScheduleModule(module.get(), BufferSizeBytesFunction(), + ComputationSchedulerToModuleScheduler( + DFSMemoryScheduler))); + + // Run buffer allocation on the HLO graph. + TF_ASSIGN_OR_RETURN( + std::unique_ptr assignment, + BufferAssigner::Run(module.get(), + absl::make_unique(schedule), + BufferSizeBytesFunction(), memory_alignment, + /*allocate_buffers_for_constants=*/true)); + + return std::make_tuple(std::move(module), std::move(assignment)); +} + namespace { // Post-compilation callback functor for use by SimpleOrcJIT. 
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.h b/tensorflow/compiler/xla/service/cpu/cpu_compiler.h index 537bf8b87c6..d28ccd985a3 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.h +++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.h @@ -136,6 +136,12 @@ class CpuCompiler : public LLVMCompiler { std::unique_ptr module, se::StreamExecutor* stream_exec, se::DeviceMemoryAllocator* device_allocator) override; + StatusOr< + std::tuple, std::unique_ptr>> + RunHloPassesAndBufferAssignement( + std::unique_ptr module, se::StreamExecutor* executor, + se::DeviceMemoryAllocator* device_allocator) override; + StatusOr> RunBackend( std::unique_ptr module, se::StreamExecutor* stream_exec, se::DeviceMemoryAllocator* device_allocator) override; diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc index 8c1ae0179c0..f031daecb1f 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc @@ -363,7 +363,12 @@ StatusOr CpuExecutable::ExecuteAsyncOnStream( if (shape.IsOpaque()) { return sizeof(void*); } - return ShapeUtil::ByteSizeOf(shape, sizeof(void*)); + if (shape.is_static() || shape.IsTuple()) { + return ShapeUtil::ByteSizeOf(shape, sizeof(void*)); + } + // Each dynamic dimension size is represented as a S32. + int64 metadata_size = sizeof(int32) * shape.dimensions_size(); + return ShapeUtil::ByteSizeOf(shape, sizeof(void*)) + metadata_size; } const InstructionValueSet& CpuExecutable::GetRootValueSet() const { diff --git a/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc b/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc index fae9670051a..e21ed7ad60e 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc @@ -154,7 +154,8 @@ CpuTransferManager::TransferBufferToInfeedInternal(se::StreamExecutor* executor, int64 size, const void* source) { if (size > std::numeric_limits::max()) { - return InvalidArgument("Infeed shape is too large: needs %d bytes", size); + return InvalidArgument("CPU infeed of %d bytes exceeds maximum of %d bytes", + size, std::numeric_limits::max()); } if (size <= 0) { diff --git a/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.h b/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.h index d933380442f..43d2e0a3cab 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.h +++ b/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.h @@ -28,6 +28,7 @@ limitations under the License. 
#include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" #include "tensorflow/core/platform/types.h" +#include "tensorflow/stream_executor/device_memory.h" namespace xla { @@ -50,6 +51,12 @@ class CpuTransferManager : public GenericTransferManager { return true; } + bool CanBufferBeAccessedNow( + se::StreamExecutor* executor, + const se::DeviceMemoryBase& device_buffer) const override { + return true; + } + private: Status TransferBufferToInfeed(se::StreamExecutor* executor, int64 size, const void* source); diff --git a/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc index e21ca01c803..05364a4492b 100644 --- a/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc +++ b/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc @@ -109,24 +109,6 @@ llvm_ir::ElementGenerator CpuElementalIrEmitter::MakeElementGenerator( const HloInstruction* hlo, const HloToElementGeneratorMap& operand_to_generator) { switch (hlo->opcode()) { - case HloOpcode::kMap: - return [this, hlo, &operand_to_generator]( - const IrArray::Index& index) -> StatusOr { - std::vector operands; - for (int i = 0; i < hlo->operand_count(); i++) { - TF_ASSIGN_OR_RETURN(llvm::Value * operand_value, - operand_to_generator.at(hlo->operand(i))(index)); - operands.push_back(operand_value); - } - return ir_emitter_->EmitElementalMap(*Cast(hlo), - operands, llvm_ir::IrName(hlo)); - }; - case HloOpcode::kReduceWindow: - return [this, hlo, &operand_to_generator](const IrArray::Index& index) { - return ir_emitter_->EmitElementalReduceWindow( - Cast(hlo), - operand_to_generator.at(hlo->operand(0)), index); - }; case HloOpcode::kConvolution: return [this, hlo, &operand_to_generator](const IrArray::Index& index) { return ir_emitter_->EmitElementalConvolution( @@ -134,22 +116,6 @@ llvm_ir::ElementGenerator CpuElementalIrEmitter::MakeElementGenerator( operand_to_generator.at(hlo->operand(0)), operand_to_generator.at(hlo->operand(1)), index); }; - case HloOpcode::kReduce: - return [this, hlo, &operand_to_generator](const IrArray::Index& index) { - auto reduce_instr = Cast(hlo); - std::vector input_generators; - for (const HloInstruction* instr : reduce_instr->inputs()) { - input_generators.push_back(operand_to_generator.at(instr)); - } - - std::vector initial_value_generators; - for (const HloInstruction* instr : reduce_instr->init_values()) { - initial_value_generators.push_back(operand_to_generator.at(instr)); - } - return ir_emitter_->EmitElementalReduce( - reduce_instr, std::move(input_generators), - std::move(initial_value_generators), index); - }; default: return ElementalIrEmitter::MakeElementGenerator(hlo, operand_to_generator); diff --git a/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.h b/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.h index e3fba9306b7..5c9f6677ab3 100644 --- a/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.h +++ b/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.h @@ -44,6 +44,12 @@ class CpuElementalIrEmitter : public ElementalIrEmitter { StatusOr EmitTanh(PrimitiveType prim_type, llvm::Value* value) override; + StatusOr> EmitThreadLocalCall( + const HloComputation& callee, absl::Span parameters, + absl::string_view name) override { + return ir_emitter_->EmitThreadLocalCall(callee, parameters, name); + } + IrEmitter* ir_emitter_; }; diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc 
index f4549ac9f3b..70dde919afb 100644 --- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc +++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc @@ -17,6 +17,7 @@ limitations under the License. #include #include + #include #include #include @@ -182,11 +183,8 @@ StatusOr IrEmitter::EmitComputation( arch_type_ == llvm::Triple::ArchType::x86_64; profiling_state_ = ProfilingState(use_rdtscp); - bool emit_tracing = - hlo_module_config_.hlo_profiling_enabled() && - hlo_module_config_.debug_options().xla_backend_extra_options().count( - "xla_hlo_trace"); - tracing_state_.set_enabled(emit_tracing); + tracing_state_.set_enabled( + computation->parent()->config().cpu_traceme_enabled()); TF_RETURN_IF_ERROR(computation->AcceptOrdered(this, instruction_order)); llvm::Function* ir_function = compute_function_->function(); @@ -573,25 +571,9 @@ Status IrEmitter::HandleSort(HloInstruction* hlo) { TF_RETURN_IF_ERROR(EmitTargetAddressForOp(sort)); Shape keys_shape = sort->keys()->shape(); PrimitiveType keys_type = keys_shape.element_type(); - switch (keys_type) { - case PRED: - case S8: - case U8: - case S16: - case U16: - case BF16: - case F16: - case S32: - case U32: - case F32: - case S64: - case U64: - case F64: - break; - default: - return Unimplemented( - "Element type %s not supported in the Sort op on CPU.", - PrimitiveType_Name(keys_type)); + if (!primitive_util::IsArrayType(keys_type)) { + return Unimplemented("Element type %s not supported in the Sort op on CPU.", + PrimitiveType_Name(keys_type)); } std::vector destination_addresses(sort->operand_count()); for (int64 i = 0; i < sort->operand_count(); ++i) { @@ -698,101 +680,6 @@ Status IrEmitter::HandleTuple(HloInstruction* tuple) { return Status::OK(); } -llvm::Value* IrEmitter::EmitElementalMap( - const HloMapInstruction& map_instr, - absl::Span elemental_operands, absl::string_view name) { - return EmitScalarReturningThreadLocalCall(*map_instr.to_apply(), - elemental_operands, name); -} - -StatusOr IrEmitter::EmitElementalReduceWindow( - const HloReduceWindowInstruction* reduce_window, - const llvm_ir::ElementGenerator& input_generator, - const llvm_ir::IrArray::Index& index) { - const HloInstruction* operand = reduce_window->operand(0); - const Window& window = reduce_window->window(); - - // We fold inputs into the accumulator and initialize it to - // the initial value on the reduce_window. 
- PrimitiveType operand_element_type = operand->shape().element_type(); - llvm::Value* accumulator_address = llvm_ir::EmitAllocaAtFunctionEntry( - llvm_ir::PrimitiveTypeToIrType(operand_element_type, module_), - "reduce_window_accumulator_address", &b_, - MinimumAlignmentForPrimitiveType(operand_element_type)); - Store(Load(GetEmittedValueFor(reduce_window->operand(1))), - accumulator_address); - - llvm_ir::ForLoopNest loops(IrName(reduce_window, "inner"), &b_); - std::vector window_size; - for (const auto& dim : window.dimensions()) { - window_size.push_back(dim.size()); - } - const llvm_ir::IrArray::Index window_index = loops.AddLoopsForShape( - ShapeUtil::MakeShape(operand_element_type, window_size), "window"); - CHECK_EQ(window_index.size(), index.size()); - - SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), &b_); - - std::vector input_multi_index(index.size()); - llvm::Value* in_bounds_condition = nullptr; - for (size_t i = 0; i < index.size(); ++i) { - llvm::Value* strided_index = - NSWMul(index[i], b_.getInt64(window.dimensions(i).stride())); - input_multi_index[i] = NSWSub( - NSWAdd(strided_index, - NSWMul(window_index[i], - b_.getInt64(window.dimensions(i).window_dilation()))), - b_.getInt64(window.dimensions(i).padding_low())); - - // We need to verify that we are not in the dilated base area. - llvm::Value* dilation_condition = - ICmpEQ(SRem(input_multi_index[i], - b_.getInt64(window.dimensions(i).base_dilation())), - b_.getInt64(0)); - if (in_bounds_condition == nullptr) { - in_bounds_condition = dilation_condition; - } else { - in_bounds_condition = And(in_bounds_condition, dilation_condition); - } - - // Apply base dilation to the index. - input_multi_index[i] = - SDiv(input_multi_index[i], - b_.getInt64(window.dimensions(i).base_dilation())); - - // We need to check if 0 <= input_multi_index[i] < bound, as otherwise we - // are in the padding so that we can skip the computation. That is - // equivalent to input_multi_index[i] < bound as an *unsigned* comparison, - // since a negative value will wrap to a large positive value. - llvm::Value* index_condition = - ICmpULT(input_multi_index[i], - b_.getInt64(ShapeUtil::GetDimension(operand->shape(), i))); - if (in_bounds_condition == nullptr) { - in_bounds_condition = index_condition; - } else { - in_bounds_condition = And(in_bounds_condition, index_condition); - } - } - CHECK(in_bounds_condition != nullptr); - - llvm_ir::LlvmIfData if_data = - llvm_ir::EmitIfThenElse(in_bounds_condition, "in-bounds", &b_); - SetToFirstInsertPoint(if_data.true_block, &b_); - - // We are not in the padding, so carry out the computation. 
- llvm_ir::IrArray::Index input_index(input_multi_index, operand->shape(), - b_.getInt64Ty()); - TF_ASSIGN_OR_RETURN(llvm::Value* const input_value, - input_generator(input_index)); - llvm::Value* result = EmitScalarReturningThreadLocalCall( - *reduce_window->to_apply(), {Load(accumulator_address), input_value}, - "reducer_function"); - Store(result, accumulator_address); - - SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), &b_); - return Load(accumulator_address); -} - Status IrEmitter::HandleReduceWindow(HloInstruction* reduce_window) { // Pseudo code for reduce window: // @@ -2102,108 +1989,6 @@ StatusOr IrEmitter::EmitVectorizedReduce( return true; } -StatusOr IrEmitter::EmitElementalReduce( - const HloReduceInstruction* reduce, - std::vector input_generators, - std::vector initial_value_generators, - const llvm_ir::IrArray::Index& index) { - const Shape& out_shape = reduce->shape(); - bool is_variadic = !out_shape.IsArray(); - int accumulators_count = 1; - if (is_variadic) { - CHECK(out_shape.IsTuple()); - accumulators_count = out_shape.tuple_shapes_size(); - } - - absl::Span reduced_dimensions(reduce->dimensions()); - - std::vector accumulator_addrs; - std::vector accumulator_types; - for (int i = 0; i < accumulators_count; i++) { - const Shape& element_shape = - is_variadic ? out_shape.tuple_shapes(i) : out_shape; - PrimitiveType accumulator_type = element_shape.element_type(); - llvm::Type* accumulator_llvm_type = - llvm_ir::PrimitiveTypeToIrType(accumulator_type, module_); - accumulator_types.push_back(accumulator_llvm_type); - - // Initialize an accumulator with init_value. - llvm::AllocaInst* accumulator_addr = llvm_ir::EmitAllocaAtFunctionEntry( - accumulator_llvm_type, "accumulator_" + std::to_string(i), &b_, - MinimumAlignmentForPrimitiveType(accumulator_type)); - TF_ASSIGN_OR_RETURN( - llvm::Value* const init_value, - initial_value_generators[i](llvm_ir::IrArray::Index(index.GetType()))); - Store(init_value, accumulator_addr); - accumulator_addrs.push_back(accumulator_addr); - } - - // The enclosing loops go over all the target elements. Now we have to compute - // the actual target element. For this, we build a new loop nest to iterate - // over all the reduction dimensions in the argument. - // AddLoopsForShapeOnDimensions will return an Index where induction Value*s - // are placed for each dimension in dimensions, and all the rest are nullptrs. - llvm_ir::ForLoopNest loops(IrName(reduce, "inner"), &b_); - const HloInstruction* arg = reduce->operand(0); - std::vector input_multi_index = - loops.AddLoopsForShapeOnDimensions(arg->shape(), reduced_dimensions, - "reduction_dim"); - - SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), &b_); - - // Build a full index for the input argument, using input_multi_index as the - // base. In input_multi_index only the reduction dimensions are filled in. We - // fill in the rest of the dimensions with induction Value*s taken from - // 'index' which iterates over the target array. See the high-level - // description in the XLA documentation for details. 
- llvm_ir::IrArray::Index::const_iterator it = index.begin(); - - for (auto& i : input_multi_index) { - if (i == nullptr) { - i = *it++; - } - } - CHECK(index.end() == it); - llvm_ir::IrArray::Index input_index(input_multi_index, arg->shape(), - b_.getInt64Ty()); - - std::vector reduction_operands; - for (llvm::Value* accum : accumulator_addrs) { - llvm::Value* accum_value = Load(accum); - reduction_operands.push_back(accum_value); - } - - for (int i = 0; i < accumulators_count; i++) { - TF_ASSIGN_OR_RETURN(llvm::Value* const input_element, - input_generators[i](input_index)); - reduction_operands.push_back(input_element); - } - - std::vector results = EmitThreadLocalCall( - *reduce->to_apply(), reduction_operands, "reduce_function"); - - CHECK(results.size() == accumulators_count); - for (int i = 0; i < accumulators_count; i++) { - Store(results[i], accumulator_addrs[i]); - } - SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), &b_); - - if (is_variadic) { - // Emit a structure, as that what the LoopEmitter expects. - llvm::Value* returned_structure = llvm::UndefValue::get( - llvm::StructType::get(b_.getContext(), accumulator_types)); - for (int i = 0; i < accumulators_count; i++) { - llvm::Value* accumulator_value = Load(accumulator_addrs[i]); - returned_structure = - b_.CreateInsertValue(returned_structure, accumulator_value, i); - } - return returned_structure; - } else { - CHECK_EQ(accumulator_addrs.size(), 1); - return Load(accumulator_addrs[0]); - } -} - Status IrEmitter::HandleReduce(HloInstruction* reduce) { auto arg = reduce->mutable_operand(0); auto init_value = reduce->mutable_operand(1); @@ -2557,7 +2342,95 @@ Status IrEmitter::HandleCall(HloInstruction* call) { return Status::OK(); } +Status IrEmitter::HandleSliceToDynamic(HloInstruction* hlo) { + // TODO(jackcao): Generalize this to generic llvm emitter. + TF_RET_CHECK(hlo->shape().rank() == 1); + TF_RETURN_IF_ERROR(EmitTargetAddressForOp(hlo)); + for (int64 i = 1; i < hlo->operand_count(); ++i) { + const int64 dim_index = i - 1; + llvm::Value* source_buffer = GetEmittedValueFor(hlo->operand(i)); + llvm::LoadInst* dim_size = b_.CreateLoad(source_buffer, "dim_size"); + llvm::Value* dest_buffer = GetEmittedValueFor(hlo); + llvm::Value* raw_buffer = + b_.CreateBitCast(dest_buffer, b_.getInt8Ty()->getPointerTo()); + + int32 raw_data_size = + ShapeUtil::ByteSizeOf(ShapeUtil::MakeStaticShape(hlo->shape())); + llvm::Value* metadata = b_.CreateConstInBoundsGEP1_32( + b_.getInt8Ty(), raw_buffer, raw_data_size + dim_index * sizeof(int32)); + b_.CreateStore(dim_size, + b_.CreateBitCast(metadata, b_.getInt32Ty()->getPointerTo())); + } + + return EmitTargetElementLoop(hlo, + [=](const llvm_ir::IrArray::Index& dest_index) { + // TODO(jackcao): Properly linearize dest_index + // and delinearize to source index. + return GetIrArrayFor(hlo->operand(0)) + .EmitReadArrayElement(dest_index, &b_); + }); +} + +Status IrEmitter::HandlePadToStatic(HloInstruction* hlo) { + // TODO(jackcao): Generalize this to generic llvm emitter. 
+ TF_RET_CHECK(hlo->operand(0)->shape().rank() == 1); + TF_RETURN_IF_ERROR(EmitTargetAddressForOp(hlo)); + + TF_ASSIGN_OR_RETURN(BufferAllocation::Slice data_slice, + assignment_.GetUniqueSlice(hlo, {0})); + const Shape& data_shape = ShapeUtil::GetSubshape(hlo->shape(), {0}); + llvm::Value* data_address = EmitBufferPointer(data_slice, data_shape); + llvm_ir::IrArray data_array(data_address, data_shape); + TF_RETURN_IF_ERROR(llvm_ir::LoopEmitter( + [=](const llvm_ir::IrArray::Index& dest_index) { + // TODO(jackcao): Properly linearize dest_index and + // delinearize to source index. + return GetIrArrayFor(hlo->operand(0)) + .EmitReadArrayElement(dest_index, &b_); + }, + llvm_ir::IrArray(data_address, data_shape), &b_) + .EmitLoop(IrName(hlo))); + std::vector tuple_operand_ptrs; + tuple_operand_ptrs.push_back(data_array.GetBasePointer()); + + // PadToStatic has a dynamic tensor as input and variadic size of outputs: + // (static_tensor, dynamic_dim_0, dynamic_dim_1, ... ) + // Dynamic dimension sizes starts from output index 1. + for (int64 i = 1; i < hlo->shape().tuple_shapes_size(); ++i) { + // Read from the metadata section of the dynamic input (operand 0). + const Shape& dim_shape = ShapeUtil::GetSubshape(hlo->shape(), {i}); + TF_RET_CHECK(Shape::Equal()(dim_shape, ShapeUtil::MakeScalarShape(S32))); + TF_ASSIGN_OR_RETURN(BufferAllocation::Slice dim_size_slice, + assignment_.GetUniqueSlice(hlo, {i})); + llvm::Value* dest_dim_size_address = + EmitBufferPointer(dim_size_slice, data_shape); + const int64 dim_index = i - 1; + llvm::Value* source_buffer = GetEmittedValueFor(hlo->operand(0)); + llvm::Value* raw_buffer = + b_.CreateBitCast(source_buffer, b_.getInt8Ty()->getPointerTo()); + int32 raw_data_size = ShapeUtil::ByteSizeOf( + ShapeUtil::MakeStaticShape(hlo->operand(0)->shape())); + llvm::Value* metadata = b_.CreateConstInBoundsGEP1_32( + b_.getInt8Ty(), raw_buffer, raw_data_size + dim_index * sizeof(int32)); + llvm::Value* dim_size = b_.CreateLoad( + b_.CreateBitCast(metadata, b_.getInt32Ty()->getPointerTo())); + b_.CreateStore(dim_size, b_.CreateBitCast(dest_dim_size_address, + b_.getInt32Ty()->getPointerTo())); + tuple_operand_ptrs.push_back(dest_dim_size_address); + } + + // Emit static tensor and dynamic sizes as one tuple. 
+ llvm_ir::EmitTuple(GetIrArrayFor(hlo), tuple_operand_ptrs, &b_); + return Status::OK(); +} + Status IrEmitter::HandleCustomCall(HloInstruction* custom_call) { + if (custom_call->custom_call_target() == "PadToStatic") { + return HandlePadToStatic(custom_call); + } + if (custom_call->custom_call_target() == "SliceToDynamic") { + return HandleSliceToDynamic(custom_call); + } absl::Span operands(custom_call->operands()); llvm::Type* i8_ptr_type = b_.getInt8PtrTy(); llvm::AllocaInst* operands_alloca = @@ -3002,9 +2875,8 @@ Status IrEmitter::HandleRngGetAndUpdateState(HloInstruction* rng_state) { old_state->getType()->getScalarType(), address->getType()->getPointerAddressSpace())); llvm::StoreInst* store = Store(old_state, address); - store->setAlignment( - llvm::MaybeAlign(IrEmitter::MinimumAlignmentForPrimitiveType( - rng_state->shape().element_type()))); + store->setAlignment(llvm::Align(IrEmitter::MinimumAlignmentForPrimitiveType( + rng_state->shape().element_type()))); return Status::OK(); } @@ -3126,7 +2998,8 @@ void IrEmitter::TracingState::EmitTracingStart(llvm::IRBuilder<>* b, } llvm::Type* int8_ptr_type = b->getInt8Ty()->getPointerTo(); - llvm::Type* void_ptr_type = b->getVoidTy()->getPointerTo(); + llvm::Type* void_ptr_type = + int8_ptr_type; // LLVM does not have a void*, we use an int8* instead. llvm::FunctionType* fn_type = llvm::FunctionType::get(b->getInt64Ty(), {void_ptr_type, int8_ptr_type}, /*isVarArg=*/false); @@ -3156,7 +3029,9 @@ void IrEmitter::TracingState::EmitTracingEnd(llvm::IRBuilder<>* b, return; } - llvm::Type* void_ptr_type = b->getVoidTy()->getPointerTo(); + llvm::Type* void_ptr_type = + b->getInt8Ty()->getPointerTo(); // LLVM does not have a void*, we use an + // int8* instead. llvm::FunctionType* fn_type = llvm::FunctionType::get(b->getVoidTy(), {void_ptr_type, b->getInt64Ty()}, /*isVarArg=*/false); diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.h b/tensorflow/compiler/xla/service/cpu/ir_emitter.h index cc5aa3f37fc..9b0d11e9f3f 100644 --- a/tensorflow/compiler/xla/service/cpu/ir_emitter.h +++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.h @@ -58,6 +58,8 @@ namespace cpu { // functions. class IrEmitter : public DfsHloVisitorWithDefault, public IrBuilderMixin { + friend class CpuElementalIrEmitter; + public: using GeneratorForOperandIrArrays = std::function()>; @@ -113,28 +115,12 @@ class IrEmitter : public DfsHloVisitorWithDefault, // Emit an LLVM global variable for every constant buffer allocation. Status EmitConstantGlobals(); - // Emit code to map one element according to `map_instr`. - llvm::Value* EmitElementalMap( - const HloMapInstruction& map_instr, - absl::Span elemental_operands, - absl::string_view name); - // Emit code to emit the element at `index` for a reduce window instruction. - StatusOr EmitElementalReduceWindow( - const HloReduceWindowInstruction* reduce_window, - const llvm_ir::ElementGenerator& input_generator, - const llvm_ir::IrArray::Index& index); // Emit code to emit the element at `index` for a convolution instruction. StatusOr EmitElementalConvolution( const HloConvolutionInstruction* convolution, const llvm_ir::ElementGenerator& input_generator, const llvm_ir::ElementGenerator& kernel_generator, const llvm_ir::IrArray::Index& index); - // Emit code to emit the element at `index` for a reduce instruction. 
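// A minimal standalone sketch of the buffer layout used by the PadToStatic
// and SliceToDynamic custom calls above: a dynamic-shaped buffer is the fully
// padded (static) data followed by one int32 per dimension holding the
// runtime sizes. The helpers below are illustrative, using memcpy instead of
// the LLVM IR GEP/load/store that the emitter generates.
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

// Write the size of dimension `dim_index` into the metadata section that
// starts right after `raw_data_size` bytes of padded data.
void WriteDynamicDimSize(std::vector<std::uint8_t>& buffer,
                         std::int64_t raw_data_size, std::int64_t dim_index,
                         std::int32_t dim_size) {
  std::memcpy(buffer.data() + raw_data_size + dim_index * sizeof(std::int32_t),
              &dim_size, sizeof(std::int32_t));
}

// Read it back, as the PadToStatic lowering does for its extra outputs.
std::int32_t ReadDynamicDimSize(const std::vector<std::uint8_t>& buffer,
                                std::int64_t raw_data_size,
                                std::int64_t dim_index) {
  std::int32_t dim_size = 0;
  std::memcpy(&dim_size,
              buffer.data() + raw_data_size + dim_index * sizeof(std::int32_t),
              sizeof(std::int32_t));
  return dim_size;
}

int main() {
  // e.g. f32[<=6] with runtime size 4: 6*4 = 24 data bytes + 1*4 metadata.
  const std::int64_t raw_data_size = 24;
  std::vector<std::uint8_t> buffer(raw_data_size + sizeof(std::int32_t), 0);
  WriteDynamicDimSize(buffer, raw_data_size, /*dim_index=*/0, /*dim_size=*/4);
  std::printf("dim 0 size = %d\n",
              ReadDynamicDimSize(buffer, raw_data_size, 0));
  return 0;
}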
- StatusOr EmitElementalReduce( - const HloReduceInstruction* reduce, - std::vector input_generators, - std::vector initial_value_generator, - const llvm_ir::IrArray::Index& index); protected: // @@ -197,6 +183,8 @@ class IrEmitter : public DfsHloVisitorWithDefault, } private: + Status HandleSliceToDynamic(HloInstruction* hlo); + Status HandlePadToStatic(HloInstruction* hlo); Status HandleAllReduceSingleReplica(HloInstruction* crs); Status HandleAllReduceMultipleReplica(HloInstruction* crs); diff --git a/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc b/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc index 8af9b9657c0..f62769cc615 100644 --- a/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc +++ b/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc @@ -121,7 +121,8 @@ void RewriteCalls( } // Generate the vectorized code. - CHECK_EQ(vector_width, input->getType()->getVectorNumElements()); + CHECK_EQ(vector_width, + llvm::cast(input->getType())->getNumElements()); llvm::Value* result = fn_body_generator(&b, input, vector_width); // Downcast result to scalar type if necessary. @@ -142,8 +143,8 @@ void RewriteCalls( } for (auto* call_to_inline : calls_to_inline) { llvm::InlineFunctionInfo inline_function_info; - CHECK( - llvm::InlineFunction(call_to_inline, inline_function_info).isSuccess()); + CHECK(llvm::InlineFunction(*call_to_inline, inline_function_info) + .isSuccess()); } // LLVM's InjectTLIMappings adds functions that might be used for // vectorization to 'llvm.compiler.used'. Remove it before deleting the diff --git a/tensorflow/compiler/xla/service/cpu/runtime_pow.cc b/tensorflow/compiler/xla/service/cpu/runtime_pow.cc new file mode 100644 index 00000000000..08308b4ce57 --- /dev/null +++ b/tensorflow/compiler/xla/service/cpu/runtime_pow.cc @@ -0,0 +1,39 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/cpu/runtime_pow.h" + +#include "tensorflow/core/platform/macros.h" + +template +static T Powi(T a, tensorflow::int32 b) { + const bool recip = b < 0; + T r = 1; + while (true) { + if (b & 1) r *= a; + b /= 2; + if (b == 0) break; + a *= a; + } + return recip ? 1 / r : r; +} + +float TF_ATTRIBUTE_WEAK __powisf2(float a, tensorflow::int32 b) { + return Powi(a, b); +} + +double TF_ATTRIBUTE_WEAK __powidf2(double a, tensorflow::int32 b) { + return Powi(a, b); +} diff --git a/tensorflow/compiler/xla/service/cpu/runtime_pow.h b/tensorflow/compiler/xla/service/cpu/runtime_pow.h new file mode 100644 index 00000000000..53f8094256d --- /dev/null +++ b/tensorflow/compiler/xla/service/cpu/runtime_pow.h @@ -0,0 +1,27 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_POW_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_POW_H_ + +#include "tensorflow/core/platform/types.h" + +// Raises F32 value a to the power of b. +extern "C" float __powisf2(float a, tensorflow::int32 b); + +// Raises F64 value a to the power of b. +extern "C" double __powidf2(double a, tensorflow::int32 b); + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_POW_H_ diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc index 153bd572eba..395eb31c13f 100644 --- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc +++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc @@ -39,6 +39,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.h" #include "tensorflow/compiler/xla/service/cpu/runtime_matmul.h" #include "tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.h" +#include "tensorflow/compiler/xla/service/cpu/runtime_pow.h" #include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_conv2d.h" #include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.h" #include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.h" @@ -56,9 +57,8 @@ llvm::SmallVector DetectMachineAttributes() { llvm::StringMap host_features; if (llvm::sys::getHostCPUFeatures(host_features)) { for (auto& feature : host_features) { - if (feature.second) { - result.push_back(std::string(feature.first())); - } + result.push_back((feature.second ? '+' : '-') + + std::string(feature.first())); } } return result; @@ -271,6 +271,8 @@ bool RegisterKnownJITSymbols() { "Host"); registry->Register("__truncdfhf2", reinterpret_cast(__truncdfhf2), "Host"); + registry->Register("__powisf2", reinterpret_cast(__powisf2), "Host"); + registry->Register("__powidf2", reinterpret_cast(__powidf2), "Host"); #undef REGISTER_CPU_RUNTIME_SYMBOL diff --git a/tensorflow/compiler/xla/service/cpu/target_machine_features.cc b/tensorflow/compiler/xla/service/cpu/target_machine_features.cc index 5cdac203af2..518684e38c5 100644 --- a/tensorflow/compiler/xla/service/cpu/target_machine_features.cc +++ b/tensorflow/compiler/xla/service/cpu/target_machine_features.cc @@ -14,6 +14,8 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/compiler/xla/service/cpu/target_machine_features.h" + +#include "tensorflow/compiler/xla/cpu_function_runtime.h" #include "tensorflow/core/platform/logging.h" namespace xla { @@ -34,27 +36,17 @@ llvm::TargetTransformInfo* LLVMTargetMachineFeatures::GetTargetTransformInfoFor( int64 LLVMTargetMachineFeatures::minimum_alignment_for_allocation( int64 size_bytes) const { - // GLibc malloc returns a pointer with alignment 8 on 32-bit platforms and 16 - // on 64-bit platforms. 
TCMalloc returns a pointer with alignment 8 for - // allocations smaller than kMallocAlignmentThreshold bytes and at least - // alignment 16 for allocations greater than or equal to - // kMallocAlignmentThreshold bytes. N.B. We could improve on this lower bound - // by explicitly allocating the memory with posix_memalign. This is - // complicated by our desire to allow parameter buffers created by clients to - // be consumed directly by the JIT. + // Assume that all pointers are aligned to at least + // xla::cpu_function_runtime::kMinAlign. if (size_bytes == 0) { // No need to align empty buffers. return 1; } - const int64 kMallocAlignmentThreshold = 512; - - int pointer_size = target_machine_->getPointerSize(0); - int buffer_alignment = - size_bytes >= kMallocAlignmentThreshold ? 2 * pointer_size : pointer_size; - DCHECK_GT(buffer_alignment, 0); - - return buffer_alignment; + // Allow small buffers to be underaligned, there is no vectorization benefit + // anyways. + return std::min(llvm::PowerOf2Ceil(size_bytes), + cpu_function_runtime::kMinAlign); } } // namespace cpu diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_external_constants_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_external_constants_test.cc index f4da6856940..c698afbdc6a 100644 --- a/tensorflow/compiler/xla/service/cpu/tests/cpu_external_constants_test.cc +++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_external_constants_test.cc @@ -65,8 +65,8 @@ TEST_F(CpuExternalConstantsTest, BasicNegative) { // The constant array in this test case is small enough that there is no need // to externalize it. TestWithArray(/*rows=*/4, /*cols=*/4, R"( -CHECK-NOT: @constant_global_0 = external unnamed_addr constant [16 x float], align 8 -CHECK: @0 = private unnamed_addr constant [64 x i8] {{.*}}, align 8 +CHECK-NOT: @constant_global_0 = external unnamed_addr constant [16 x float] +CHECK: @0 = private unnamed_addr constant [64 x i8] {{.*}}, align 16 )"); } } // namespace diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_intrinsic_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_intrinsic_test.cc index b6d6de28bc5..efeab3bd31a 100644 --- a/tensorflow/compiler/xla/service/cpu/tests/cpu_intrinsic_test.cc +++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_intrinsic_test.cc @@ -70,6 +70,13 @@ class CpuUnaryIntrinsicTest return absl::StrCat(opcode, "_On_", triple, (features.empty() ? "" : "_With"), features); } + + private: + DebugOptions GetDebugOptionsForTest() override { + DebugOptions debug_options = HloTestBase::GetDebugOptionsForTest(); + HloTestBase::SetAotFastMathDebugOptions(&debug_options); + return debug_options; + } }; // Creates a module with a call to the unary op, and tests if the diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_vectorization_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_vectorization_test.cc index 8a72eb15487..757d878e224 100644 --- a/tensorflow/compiler/xla/service/cpu/tests/cpu_vectorization_test.cc +++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_vectorization_test.cc @@ -69,6 +69,13 @@ class CpuVectorizationTest return absl::StrCat(opcode, "_On_", triple, (features.empty() ? 
"" : "_With"), features); } + + private: + DebugOptions GetDebugOptionsForTest() override { + DebugOptions debug_options = HloTestBase::GetDebugOptionsForTest(); + HloTestBase::SetAotFastMathDebugOptions(&debug_options); + return debug_options; + } }; TEST_P(CpuVectorizationTest, DoIt) { diff --git a/tensorflow/compiler/xla/service/cpu/vectorized_reduce_with_no_vector_registers_test.cc b/tensorflow/compiler/xla/service/cpu/vectorized_reduce_with_no_vector_registers_test.cc index 2918c886f08..754885d8744 100644 --- a/tensorflow/compiler/xla/service/cpu/vectorized_reduce_with_no_vector_registers_test.cc +++ b/tensorflow/compiler/xla/service/cpu/vectorized_reduce_with_no_vector_registers_test.cc @@ -37,10 +37,10 @@ StatusOr GetTargetVectorRegisterByteSize(std::string triple) { } llvm::LLVMContext context; - std::unique_ptr function = - absl::WrapUnique(llvm::Function::Create( - llvm::FunctionType::get(llvm::Type::getVoidTy(context), {}), - llvm::GlobalValue::ExternalLinkage, "test")); + llvm::Module module("test", context); + llvm::Function* function = llvm::Function::Create( + llvm::FunctionType::get(llvm::Type::getVoidTy(context), {}), + llvm::GlobalValue::ExternalLinkage, "test", &module); std::unique_ptr target_machine = absl::WrapUnique(target->createTargetMachine( diff --git a/tensorflow/compiler/xla/service/depthwise_convolution_converter.cc b/tensorflow/compiler/xla/service/depthwise_convolution_converter.cc old mode 100755 new mode 100644 diff --git a/tensorflow/compiler/xla/service/depthwise_convolution_converter.h b/tensorflow/compiler/xla/service/depthwise_convolution_converter.h old mode 100755 new mode 100644 diff --git a/tensorflow/compiler/xla/service/depthwise_convolution_converter_test.cc b/tensorflow/compiler/xla/service/depthwise_convolution_converter_test.cc old mode 100755 new mode 100644 diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h index e4676141f65..caea9d9095a 100644 --- a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h +++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h @@ -109,10 +109,14 @@ class DfsHloVisitorBase { virtual Status HandleRsqrt(HloInstructionPtr hlo) { return HandleElementwiseUnary(hlo); } + virtual Status HandleCbrt(HloInstructionPtr hlo) { + return HandleElementwiseUnary(hlo); + } virtual Status HandleConvolution(HloInstructionPtr hlo) = 0; virtual Status HandleFft(HloInstructionPtr fft) = 0; virtual Status HandleTriangularSolve(HloInstructionPtr hlo) = 0; virtual Status HandleCholesky(HloInstructionPtr hlo) = 0; + virtual Status HandleAllGather(HloInstructionPtr hlo) = 0; virtual Status HandleAllReduce(HloInstructionPtr hlo) = 0; virtual Status HandleAllToAll(HloInstructionPtr hlo) = 0; virtual Status HandleCollectivePermute(HloInstructionPtr hlo) = 0; diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h index baa9240fb56..9cd220245ba 100644 --- a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h +++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h @@ -98,6 +98,9 @@ class DfsHloVisitorWithDefaultBase Status HandleCholesky(HloInstructionPtr hlo) override { return DefaultAction(hlo); } + Status HandleAllGather(HloInstructionPtr crs) override { + return DefaultAction(crs); + } Status HandleAllReduce(HloInstructionPtr crs) override { return DefaultAction(crs); } diff --git a/tensorflow/compiler/xla/service/dump.cc b/tensorflow/compiler/xla/service/dump.cc index 
ca6fadc2e23..0afcc4cd961 100644 --- a/tensorflow/compiler/xla/service/dump.cc +++ b/tensorflow/compiler/xla/service/dump.cc @@ -85,7 +85,7 @@ struct CanonicalDebugOptions { // resort to this hack. string pattern = opts.xla_dump_hlo_module_re(); should_dump_module = [pattern](string_view module_name) { - return RE2::PartialMatch(string(module_name), pattern); + return RE2::PartialMatch(module_name, pattern); }; } else if (!opts.xla_dump_hlo_pass_re().empty() || !opts.xla_dump_to().empty() || output_format_specified) { @@ -99,7 +99,7 @@ struct CanonicalDebugOptions { if (!opts.xla_dump_hlo_pass_re().empty()) { string pattern = opts.xla_dump_hlo_pass_re(); should_dump_pass = [pattern](string_view pass_name) { - return RE2::PartialMatch(string(pass_name), pattern); + return RE2::PartialMatch(pass_name, pattern); }; } else { should_dump_pass = [](string_view) { return false; }; diff --git a/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc b/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc index a103b555df6..e193df6d9bd 100644 --- a/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc +++ b/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc @@ -1369,77 +1369,27 @@ Status DynamicDimensionInferenceVisitor::HandleScatter(HloInstruction* hlo) { } Status DynamicDimensionInferenceVisitor::HandleWhile(HloInstruction* hlo) { - // While loop is handled by passing dynamic size hlos as parameters into the - // hlo while loop. This is done by replacing the original while with a new - // one. - // - // Before: - // - // op1 = ... - // op2 = ... - // op1_x = ... // dynamic dimension size of op1 - // while = while(op1, op2) - // - // - // After: - // - // op1 = ... - // op2 = ... - // op1_x = ... // dynamic dimension size of op1 - // while = while(op1, op2, op1_x) - // - // In the above graph, op_x is the bound of the dynamic dimension size of op1 - // and is wired into the while loop as new parameter. - // - // TODO(b/119843103): Once we implement dynamic bounds in XLA backend, dynamic - // bound can be propagated through native xla values instead of relying on - // additional parameter. - - // dynamic_size_to_operand_id_index_map keeps track of dynamic size operations - // to their operand ids in the new while loop. - absl::flat_hash_map - dynamic_size_to_operand_id_index_map; - - // operands_to_add collects dynamic sizes that need to be added to the while - // loop as parameters. Note that a dynamic size is ignored if it is already - // part of the parameter. i.e.: - // - // We don't do: - // - // op1 = ... - // op2 = ... - // op_x = ... // dynamic dimension size of both op1 and op2 - // while = while(op1, op2, op_x, op_x) // 4 parameters - // - // But we do: - // - // op1 = ... - // op2 = ... - // op_x = ... // dynamic dimension size of both op1 and op2 - // while = while(op1, op2, op_x) - // - // An alternative is to do this in a while loop CSE pass. - // + // If the output of the conditional contains dynamic dimension. We send + // dynamic dimension size out by adding additional root element. A mapping + // from the root instruction's dynamic dimension index (represented by a shape + // index as output index and a int64 dimension number) to output index + // (represented by an int64) is tracked for the conditional instruction (all + // branches should have the same mapping). 
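// A minimal standalone sketch of the bookkeeping described in the comment
// above, assuming a flat (non-nested) while tuple: every dynamic dimension of
// the original output gets one extra S32 tuple element appended, and the
// mapping records which appended element carries which (tuple index, dim).
// The container choice below is illustrative, standing in for the ShapeTree.
#include <cstdint>
#include <cstdio>
#include <map>
#include <utility>
#include <vector>

int main() {
  // Original while result: (f32[<=10], s32[]) -- element 0, dim 0 is dynamic.
  std::int64_t operand_count = 2;  // original_tuple_count
  std::map<std::pair<std::int64_t, std::int64_t>, std::int64_t>
      dynamic_output_mapping;  // (tuple index, dim) -> appended element index
  const std::vector<std::pair<std::int64_t, std::int64_t>> dynamic_dims = {
      {0, 0}};
  for (const auto& dim : dynamic_dims) {
    dynamic_output_mapping[dim] = operand_count++;
  }
  // The rewritten while carries 3 elements; the runtime size of element 0,
  // dim 0 travels through the appended element and is re-emitted by the new
  // body root so it can change across iterations.
  std::printf("tuple size = %lld, size of {0,0} lives at index %lld\n",
              static_cast<long long>(operand_count),
              static_cast<long long>(dynamic_output_mapping[{0, 0}]));
  return 0;
}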
+ ShapeTree> dynamic_output_mapping( + hlo->shape()); std::vector operands_to_add; - int64 operand_count = hlo->shape().tuple_shapes_size(); + const int64 original_tuple_count = hlo->shape().tuple_shapes_size(); + int64 operand_count = original_tuple_count; TF_RETURN_IF_ERROR(ForEachOperandDynamicDimension( - hlo, [&](HloInstruction*, ShapeIndex, int64, int64, + hlo, [&](HloInstruction*, ShapeIndex index, int64 dim, int64, HloInstruction* dynamic_size, DimensionConstraint constraint) { - const HloInstruction* tuple_operand = hlo->operand(0); - for (int64 i = 0; i < tuple_operand->operand_count(); ++i) { - if (dynamic_size == tuple_operand->operand(i)) { - dynamic_size_to_operand_id_index_map[dynamic_size] = i; - return Status::OK(); - } - } - auto iter = dynamic_size_to_operand_id_index_map.find(dynamic_size); - if (iter == dynamic_size_to_operand_id_index_map.end()) { - operands_to_add.push_back(dynamic_size); - dynamic_size_to_operand_id_index_map[dynamic_size] = operand_count++; - } + operands_to_add.push_back(dynamic_size); + dynamic_output_mapping.mutable_element(index)->emplace(dim, + operand_count++); return Status::OK(); })); + DynamicParameterBinding binding_for_while; if (!operands_to_add.empty()) { // Only replace the while loop if there are new parameters to add. HloInstruction* old_tuple_operand = hlo->mutable_operand(0); @@ -1453,37 +1403,78 @@ Status DynamicDimensionInferenceVisitor::HandleWhile(HloInstruction* hlo) { parent_->CopyMapping(/*from=*/old_tuple_operand, /*to=*/new_tuple_operand); hlo = result.new_while_instr; + // We have replaced the while loop, now set the dynamic dimensions for the + // newly created while loop so that the hlos that consumes the while loop + // can see the dynamic dimensions. Also sets the dynamic parameter binding + // for running inference in the while loop. + TF_RETURN_IF_ERROR(ForEachOperandDynamicDimension( + hlo, + [&](HloInstruction*, ShapeIndex index, int64 dimension, + int64 operand_index, HloInstruction* dynamic_size, + DimensionConstraint constraint) -> Status { + TF_RET_CHECK(!operands_to_add.empty()); + const int64 output_dynamic_size_index = + dynamic_output_mapping.element(index).at(dimension); + DynamicParameterBinding::DynamicParameter dynamic_parameter{ + operand_index, {output_dynamic_size_index}}; + DynamicParameterBinding::DynamicDimension dynamic_dimension{ + operand_index, index, dimension}; + TF_RETURN_IF_ERROR( + binding_for_while.Bind(dynamic_parameter, dynamic_dimension)); + // This is the updated output dynamic size coming out of hlo while + // loop. + HloInstruction* output_dynamic_size = hlo->parent()->AddInstruction( + HloInstruction::CreateGetTupleElement( + ShapeUtil::MakeScalarShape(S32), hlo, + output_dynamic_size_index)); + parent_->SetDynamicSize(result.replacement_instr, index, dimension, + output_dynamic_size, constraint); + return Status::OK(); + })); + // Set the replacement instruction as visited to avoid visiting it again. + SetVisited(*result.replacement_instr); } - // We have replaced the while loop, now set the dynamic dimensions for the - // newly created while loop so that the hlos that consumes the while loop can - // see the dynamic dimensions. Also sets the dynamic parameter binding for - // running inference in the while loop. 
- DynamicParameterBinding binding_for_while; - TF_RETURN_IF_ERROR(ForEachOperandDynamicDimension( - hlo, [&](HloInstruction*, ShapeIndex index, int64 dimension, - int64 operand_index, HloInstruction* dynamic_size, - DimensionConstraint constraint) { - DynamicParameterBinding::DynamicParameter dynamic_parameter{ - operand_index, - {dynamic_size_to_operand_id_index_map[dynamic_size]}}; - DynamicParameterBinding::DynamicDimension dynamic_dimension{ - operand_index, index, dimension}; - TF_RETURN_IF_ERROR( - binding_for_while.Bind(dynamic_parameter, dynamic_dimension)); - parent_->SetDynamicSize(hlo, index, dimension, dynamic_size, - constraint); - return Status::OK(); - })); - // Run inference in while body and condition. TF_RETURN_IF_ERROR(DynamicDimensionInferenceVisitor::Run( hlo->while_body(), binding_for_while, parent_)); TF_RETURN_IF_ERROR(DynamicDimensionInferenceVisitor::Run( hlo->while_condition(), binding_for_while, parent_)); - // Set the replacement while loop as visited to avoid visiting it again. - SetVisited(*hlo); + if (operands_to_add.empty()) { + // No dynamic dimension in the inputs and outputs. + return Status::OK(); + } + + // The dynamic dimension size could have been changed in the loop body (e.g, A + // loop that inserts items in a stack, the stack size increases with each + // iteration). Rewrite the dynamic dimension size at the root. + HloInstruction* body_root = hlo->while_body()->root_instruction(); + std::vector new_root_operands(body_root->operand_count(), + nullptr); + + // Original non-dynamic-dim operands of root are pass-through. + for (int64 i = 0; i < original_tuple_count; ++i) { + new_root_operands[i] = + hlo->while_body()->AddInstruction(HloInstruction::CreateGetTupleElement( + body_root->shape().tuple_shapes(i), body_root, i)); + } + // Add dynamic dimension size as new parameters. 
+ TF_RETURN_IF_ERROR(ForEachDynamicDimension( + hlo->while_body()->root_instruction(), + [&](ShapeIndex index, int64 dim, HloInstruction* dynamic_size, + DimensionConstraint) -> Status { + const int64 output_index = + dynamic_output_mapping.element(index).at(dim); + new_root_operands[output_index] = dynamic_size; + return Status::OK(); + })); + for (auto operand : new_root_operands) { + TF_RET_CHECK(operand != nullptr); + } + HloInstruction* new_body_root = hlo->while_body()->AddInstruction( + HloInstruction::CreateTuple(new_root_operands)); + hlo->while_body()->set_root_instruction(new_body_root); return Status::OK(); } @@ -1629,6 +1620,24 @@ Status DynamicDimensionInference::ForwardDynamicSize(HloInstruction* inst, return Status::OK(); } +bool DynamicDimensionInference::HasDynamicDimension( + HloInstruction* inst) const { + bool has_dynamic_dim = false; + ShapeUtil::ForEachSubshape( + inst->shape(), [&](const Shape& subshape, const ShapeIndex& index) { + if (subshape.IsTuple()) { + return; + } + for (int64 i = 0; i < subshape.dimensions_size(); ++i) { + HloInstruction* operand_dynamic_size = GetDynamicSize(inst, index, i); + if (operand_dynamic_size != nullptr) { + has_dynamic_dim = true; + } + } + }); + return has_dynamic_dim; +} + HloInstruction* DynamicDimensionInference::GetDynamicSize( HloInstruction* inst, const ShapeIndex& index, int64 dim) const { auto iter = dynamic_mapping_.find(DynamicDimension{inst, index, dim}); diff --git a/tensorflow/compiler/xla/service/dynamic_dimension_inference.h b/tensorflow/compiler/xla/service/dynamic_dimension_inference.h index 6e3b9e26feb..417f0289143 100644 --- a/tensorflow/compiler/xla/service/dynamic_dimension_inference.h +++ b/tensorflow/compiler/xla/service/dynamic_dimension_inference.h @@ -51,6 +51,10 @@ class DynamicDimensionInference { HloInstruction* GetDynamicSize(HloInstruction* inst, const ShapeIndex& index, int64 dim) const; + // Returns if current instruction contains any dynamic dimension. Recursively + // go into tuples. + bool HasDynamicDimension(HloInstruction* inst) const; + // Forward dynamic dimension size at `dim` and its constraint from `inst` to // `new_inst`. Status ForwardDynamicSize(HloInstruction* inst, HloInstruction* new_inst, diff --git a/tensorflow/compiler/xla/service/dynamic_dimension_inference_test.cc b/tensorflow/compiler/xla/service/dynamic_dimension_inference_test.cc index dc295669fa9..b5a17619edf 100644 --- a/tensorflow/compiler/xla/service/dynamic_dimension_inference_test.cc +++ b/tensorflow/compiler/xla/service/dynamic_dimension_inference_test.cc @@ -767,7 +767,7 @@ TEST_F(DynamicDimensionInferenceTest, WhileTest) { // While auto* a_param = builder.AddInstruction(HloInstruction::CreateParameter( /*parameter_number=*/0, tuple_shape, "A")); - auto* size_param = builder.AddInstruction(HloInstruction::CreateParameter( + builder.AddInstruction(HloInstruction::CreateParameter( /*parameter_number=*/1, scalar_shape_, "size_param")); builder.AddInstruction( HloInstruction::CreateWhile(tuple_shape, condition, body, a_param)); @@ -782,37 +782,32 @@ TEST_F(DynamicDimensionInferenceTest, WhileTest) { DynamicParameterBinding::DynamicParameter{1, {}}, DynamicParameterBinding::DynamicDimension{0, {1}, 0})); - // Test that dynamic dimension inference does the right thing. A lambda is - // used here since we want to test twice by running inference again - // (idempotency). - auto test_dynamic_dimension = [&]() { - HloInstruction* while_hlo = nullptr; - // The while hlo has been replaced, find the new one. 
- for (HloInstruction* inst : module_->entry_computation()->instructions()) { - if (inst->opcode() == HloOpcode::kWhile) { - while_hlo = inst; - } - } - ASSERT_NE(while_hlo, nullptr); - // The original while shape has 2 parameters. With dynamic size passed in - // as an extra parameter, the tuple should have 3 elements. - EXPECT_EQ(while_hlo->shape().tuple_shapes_size(), 3); - HloInstruction* add = nullptr; - for (HloInstruction* inst : while_hlo->while_body()->instructions()) { - if (inst->opcode() == HloOpcode::kAdd) { - add = inst; - } - } - EXPECT_NE(add, nullptr); - EXPECT_NE(inference_->GetDynamicSize(add, {}, 0), nullptr); - EXPECT_EQ(inference_->GetDynamicSize(while_hlo, {0}, 0), size_param); - EXPECT_EQ(inference_->GetDynamicSize(while_hlo, {1}, 0), size_param); - }; - TF_ASSERT_OK(RunInference()); - test_dynamic_dimension(); - TF_ASSERT_OK(RunInference()); - test_dynamic_dimension(); + HloInstruction* while_hlo = nullptr; + // The while hlo has been replaced, find the new one. + for (HloInstruction* inst : module_->entry_computation()->instructions()) { + if (inst->opcode() == HloOpcode::kWhile) { + while_hlo = inst; + } + } + ASSERT_NE(while_hlo, nullptr); + // The original while shape has 2 parameters. With dynamic size, the tuple + // should have 4 elements (We don't deduplicate the arguments). + EXPECT_EQ(while_hlo->shape().tuple_shapes_size(), 4); + HloInstruction* add_inst = nullptr; + for (HloInstruction* inst : while_hlo->while_body()->instructions()) { + if (inst->opcode() == HloOpcode::kAdd) { + add_inst = inst; + } + } + EXPECT_NE(add_inst, nullptr); + EXPECT_NE(inference_->GetDynamicSize(add_inst, {}, 0), nullptr); + EXPECT_NE(inference_->GetDynamicSize( + module_->entry_computation()->root_instruction(), {0}, 0), + nullptr); + EXPECT_NE(inference_->GetDynamicSize( + module_->entry_computation()->root_instruction(), {1}, 0), + nullptr); } TEST_F(DynamicDimensionInferenceTest, ConditionalInputTest) { diff --git a/tensorflow/compiler/xla/service/dynamic_padder.cc b/tensorflow/compiler/xla/service/dynamic_padder.cc index e0fe9c08d0a..44fdda0f411 100644 --- a/tensorflow/compiler/xla/service/dynamic_padder.cc +++ b/tensorflow/compiler/xla/service/dynamic_padder.cc @@ -37,6 +37,7 @@ limitations under the License. #include "tensorflow/compiler/xla/util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/errors.h" namespace xla { @@ -244,8 +245,9 @@ HloInstruction* PadWithScalar(HloInstruction* inst, int64 dim, Status RewriteDynamicReshapeSplitInput( HloInstruction* reshape, int64 input_dim, absl::Span output_dims, + absl::Span output_dynamic_dims, DynamicDimensionInference* dynamic_dimension_inference) { - VLOG(1) << "Reshaping input dim " << input_dim << "to " + VLOG(2) << "Reshaping input dim " << input_dim << "to " << VectorString(output_dims); const Shape operand_shape = reshape->operand(0)->shape(); TF_RET_CHECK(output_dims.size() > 1); @@ -280,8 +282,7 @@ Status RewriteDynamicReshapeSplitInput( // dimension. 
for (int64 i = 1; i < output_dims.size(); ++i) { const int64 output_dim = output_dims[i]; - HloInstruction* dynamic_size = - dynamic_dimension_inference->GetDynamicSize(reshape, {}, output_dim); + HloInstruction* dynamic_size = output_dynamic_dims[output_dim]; if (dynamic_size == nullptr) { continue; } @@ -331,10 +332,7 @@ Status RewriteDynamicReshapeSplitInput( mask_input_shape, HloOpcode::kSubtract, cumsum, broadcast_ones)); GatherDimensionNumbers gather_dim_numbers; - // We use gather to rearrange the input dim dimension. However the current - // semantic of gather doesn't allow us to collapse dimension in this case so - // we keep it, which make the gather from shape [..., input_dim, ...] to - // [..., 1, input_dim, ...] + // Use gather to rearrange the input dim dimension. for (int64 i = 0; i < operand_shape.dimensions_size(); ++i) { // Offset dim is every dimension including newly added size 1 dim, except // for input_dim, which acts as a batch_dim. @@ -396,177 +394,255 @@ Status RewriteDynamicReshapeSplitInput( return Status::OK(); } +// RewriteDynamicReshapeCombineInput is similar to +// RewriteDynamicReshapeSplitInput, in a reshape if multiple dimensions are +// combined into one dimension, we need to rewrite the output. +// +// The reason for this is that a continuous input may not be evenly reshaped +// into output. Image we have [2, <=3] where second dimension has size 2 and +// padding(P) data has size 1: +// [[a,b,P] +// [c,d,P]] +// +// And we have a reshape that combines this two input dimensions. +// +// [2, <=3] +// | +// Reshape +// | +// [6] +// +// This should produce the same result as if the data has no padding: +// +// [2, 2] // [[a, b], [c, d]] +// | +// Reshape +// | +// [4] // [a,b,c,d] +// +// Without rewriting, the result would be: +// +// [a,b,P,c,d,P], which is incorrect. +// +// We need to rewrite the reshape such that it produces: +// [a,b,c,d,P,P] +// +// The way we do this is by a 5-steps sort-gather algorithm: +// +// 1.First we use the input shape to generate a binary 0-1 masking, which masks +// out the padded area of the output: +// [[0,0,1] +// [0,0,1]] +// +// 2.Then we do an reshape to reshape the mask from input shape to output +// shape [2,3]->[6]: +// [0,0,1,0,0,1] +// +// 3.We then generate an iota mask using the output shape: +// [0,1,2,3,4,5] +// +// 4.Stable sort the iota mask using the binary mask as key: +// key [0,0,1,0,0,1] +// value[0,1,2,3,4,5] +// | Sort by key +// v +// key [0,0,0,0,1,1] +// value[0,1,3,4,2,5] +// +// 5.Gather the original output [a,b,P,c,d,P] using the sorted iota mask: +// original output gather indices +// [a,b,P,c,d,P] [0,1,3,4,2,5] +// | | +// Gather ----------------+ +// | +// [a,b,c,d,P,P] +// Status RewriteDynamicReshapeCombineInput( - HloInstruction* reshape, int64 input_dim, int64 output_dim, - HloInstruction* dynamic_size, + HloInstruction* reshape, absl::Span input_dims, + int64 output_dim, absl::Span input_dynamic_dims, DynamicDimensionInference* dynamic_dimension_inference) { // Rewrite dynamic reshape into reshape followed by a sort, all padded // data will be moved to the end. 
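// A minimal standalone sketch of the 5-step mask/iota/stable-sort/gather
// rewrite described above, run on plain vectors with the example from the
// comment: input [2, <=3] with runtime size 2 in the second dimension, i.e.
// data {{a,b,P},{c,d,P}} where P marks padding.
#include <algorithm>
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
  const int rows = 2, cols = 3, dynamic_cols = 2;
  const std::vector<char> reshaped = {'a', 'b', 'P', 'c', 'd', 'P'};

  // Step 1: binary mask over the input shape, 1 in the padded area.
  // Step 2: reshape (flatten) the mask into the output shape.
  std::vector<int> mask;
  for (int r = 0; r < rows; ++r)
    for (int c = 0; c < cols; ++c) mask.push_back(c >= dynamic_cols ? 1 : 0);

  // Step 3: iota over the output shape.
  std::vector<int> iota(rows * cols);
  std::iota(iota.begin(), iota.end(), 0);

  // Step 4: stable sort the iota using the mask as key, so padded positions
  // move to the end while the relative order of real elements is kept.
  std::stable_sort(iota.begin(), iota.end(),
                   [&](int lhs, int rhs) { return mask[lhs] < mask[rhs]; });

  // Step 5: gather the naively reshaped data with the sorted iota as indices.
  std::vector<char> rewritten;
  for (int index : iota) rewritten.push_back(reshaped[index]);

  for (char value : rewritten) std::printf("%c ", value);  // a b c d P P
  std::printf("\n");
  return 0;
}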
- const HloInstruction* operand = reshape->operand(0); HloComputation* comp = reshape->parent(); HloInstruction* zero = comp->AddInstruction( HloInstruction::CreateConstant(LiteralUtil::Zero(S32))); HloInstruction* one = comp->AddInstruction( HloInstruction::CreateConstant(LiteralUtil::One(S32))); - const Shape mask_shape = - ShapeUtil::ChangeElementType(operand->shape(), xla::S32); - const Shape mask_reshaped_shape = - ShapeUtil::ChangeElementType(reshape->shape(), xla::S32); - HloInstruction* broadcasted_zero = comp->AddInstruction( - HloInstruction::CreateBroadcast(mask_shape, zero, {})); - // Pad masking area with 1s, rest with 0s. - HloInstruction* padding_mask = - PadWithScalar(broadcasted_zero, input_dim, dynamic_size, one); - HloInstruction* mask_reshaped = comp->AddInstruction( - HloInstruction::CreateReshape(mask_reshaped_shape, padding_mask)); + const Shape output_shape = reshape->shape(); + const Shape input_shape = reshape->operand(0)->shape(); + const Shape mask_output_shape = + ShapeUtil::MakeShape(xla::S32, {output_shape.dimensions(output_dim)}); + std::vector input_dim_sizes; + for (int64 input_dim : input_dims) { + input_dim_sizes.push_back(input_shape.dimensions(input_dim)); + } - // Build computation for reshape, key is the mask shape, value is reshape's - // original data. + const Shape mask_input_shape = + ShapeUtil::MakeShape(xla::S32, input_dim_sizes); + + // Step 1 -- generate binary mask. + // Mask starts with all zero, each dynamic dimension sets that dimension of + // the mask to partially ones in the end. + HloInstruction* binary_mask = comp->AddInstruction( + HloInstruction::CreateBroadcast(mask_input_shape, zero, {})); + + bool need_rewrite = false; + + // Pad the effective dimension with 1. + // + // Index starts from 1 since there is no need to rewrite a major output + // dimension. + for (int64 i = 1; i < input_dims.size(); ++i) { + const int64 input_dim = input_dims[i]; + HloInstruction* dynamic_size = input_dynamic_dims[input_dim]; + if (dynamic_size == nullptr) { + continue; + } + // If there is a dynamic dimension in the input, need to rewrite the output. + need_rewrite = true; + + binary_mask = PadWithScalar(binary_mask, i, dynamic_size, one); + } + if (!need_rewrite) { + VLOG(2) << "No need to rewrite"; + return Status::OK(); + } + + // Step 2. + // Do a reshape to flatten the binary mask into output_shape + HloInstruction* output_shape_binary_mask = comp->AddInstruction( + HloInstruction::CreateReshape(mask_output_shape, binary_mask)); + + // Step 3. + // Generate an iota with output shape. + HloInstruction* iota = + comp->AddInstruction(HloInstruction::CreateIota(mask_output_shape, 0)); + + // Step 4. + // Stable sort the iota mask using the binary mask as key and iota as value: + + // Build computation for sort, key is the mask, value is the iota. 
HloComputation::Builder comp_builder("compare"); HloInstruction* lhs_key = comp_builder.AddInstruction(HloInstruction::CreateParameter( - 0, ShapeUtil::MakeShape(S32, {}), "lhs_key")); + 0, ShapeUtil::MakeScalarShape(S32), "lhs_key")); HloInstruction* rhs_key = comp_builder.AddInstruction(HloInstruction::CreateParameter( - 1, ShapeUtil::MakeShape(S32, {}), "rhs_key")); + 1, ShapeUtil::MakeScalarShape(S32), "rhs_key")); // Values for lhs and rhs comp_builder.AddInstruction(HloInstruction::CreateParameter( - 2, ShapeUtil::MakeShape(operand->shape().element_type(), {}), - "lhs_value")); + 2, ShapeUtil::MakeScalarShape(S32), "lhs_value")); comp_builder.AddInstruction(HloInstruction::CreateParameter( - 3, ShapeUtil::MakeShape(operand->shape().element_type(), {}), - "rhs_value")); + 3, ShapeUtil::MakeScalarShape(S32), "rhs_value")); comp_builder.AddInstruction( HloInstruction::CreateCompare(ShapeUtil::MakeShape(PRED, {}), lhs_key, rhs_key, ComparisonDirection::kLt)); HloComputation* compare = comp->parent()->AddEmbeddedComputation(comp_builder.Build()); + // Use mask_reshaped as key, sort reshaped data as value. + HloInstruction* sort = comp->AddInstruction(HloInstruction::CreateSort( + ShapeUtil::MakeTupleShape({mask_output_shape, mask_output_shape}), 0, + {output_shape_binary_mask, iota}, compare, + /*is_stable=*/true)); + + HloInstruction* gather_indices = comp->AddInstruction( + HloInstruction::CreateGetTupleElement(mask_output_shape, sort, 1)); + + // Step 5.Gather the original output using the sorted iota mask: + + GatherDimensionNumbers gather_dim_numbers; + // Use gather to rearrange the output dim dimension. + for (int64 i = 0; i < output_shape.dimensions_size(); ++i) { + // Offset dim is every dimension including newly added size 1 dim, except + // for input_dim, which acts as a batch_dim. + if (i != output_dim) { + gather_dim_numbers.add_offset_dims(i); + } + } + // The dimension to rewrite is the index dim. + gather_dim_numbers.add_start_index_map(output_dim); + gather_dim_numbers.set_index_vector_dim(1); + gather_dim_numbers.add_collapsed_slice_dims(output_dim); + HloInstruction* static_dim_size = comp->AddInstruction( HloInstruction::CreateConstant(LiteralUtil::CreateR0( reshape->shape().dimensions(output_dim)))); // Temporarily removes dynamic dimension of the reshape before we send it to - // the sort -- we want padded area to also participate in the sort. + // the sort -- we want padded area to also participate in the gather. HloInstruction* reshape_static = comp->AddInstruction(HloInstruction::CreateSetDimensionSize( reshape->shape(), reshape, static_dim_size, output_dim)); + std::vector gather_slice_sizes(output_shape.dimensions().begin(), + output_shape.dimensions().end()); + gather_slice_sizes[output_dim] = 1; + HloInstruction* gather = comp->AddInstruction(HloInstruction::CreateGather( + output_shape, reshape_static, gather_indices, gather_dim_numbers, + gather_slice_sizes, true)); - // Use mask_reshaped as key, sort reshaped data as value. - HloInstruction* sort = comp->AddInstruction(HloInstruction::CreateSort( - ShapeUtil::MakeTupleShape({mask_reshaped_shape, reshape->shape()}), - output_dim, {mask_reshaped, reshape_static}, compare, - /*is_stable=*/true)); - HloInstruction* dynamic_reshape = comp->AddInstruction( - HloInstruction::CreateGetTupleElement(reshape->shape(), sort, 1)); - // Forward dynamic size to the newly created reshape. + // Forward dynamic size to the newly created gather. 
HloInstruction* output_dynamic_size = dynamic_dimension_inference->GetDynamicSize(reshape, {}, output_dim); TF_RET_CHECK(output_dynamic_size != nullptr); - dynamic_reshape = comp->AddInstruction(HloInstruction::CreateSetDimensionSize( - dynamic_reshape->shape(), dynamic_reshape, output_dynamic_size, - output_dim)); + gather = comp->AddInstruction(HloInstruction::CreateSetDimensionSize( + gather->shape(), gather, output_dynamic_size, output_dim)); auto users = reshape->users(); for (auto* user : users) { // Avoid cycles by not replacing the staic reshape and get_dimension_size. if (user != reshape_static && user != output_dynamic_size) { - TF_RETURN_IF_ERROR(reshape->ReplaceUseWith(user, dynamic_reshape)); + TF_RETURN_IF_ERROR(reshape->ReplaceUseWith(user, gather)); } } if (reshape == comp->root_instruction()) { - comp->set_root_instruction(dynamic_reshape); + comp->set_root_instruction(gather); } - TF_RETURN_IF_ERROR(dynamic_dimension_inference->ForwardDynamicSize( - reshape, dynamic_reshape, {})); + TF_RETURN_IF_ERROR( + dynamic_dimension_inference->ForwardDynamicSize(reshape, gather, {})); return Status::OK(); } -Status RewriteDynamicReshapeSingleDim( - HloInstruction* reshape, int64 input_dim, HloInstruction* dynamic_size, +Status RewriteDynamicReshapeSingleGroup( + HloInstruction* reshape, absl::Span input_dims, + absl::Span output_dims, + absl::Span input_dynamic_dims, + absl::Span output_dynamic_dims, DynamicDimensionInference* dynamic_dimension_inference) { VLOG(2) << "Rewriting dynamic reshape " << reshape->ToString() - << " input dim: " << input_dim; + << " input dims: " << VectorString(input_dims) + << " output dims: " << VectorString(output_dims); + const Shape operand_shape = reshape->operand(0)->shape(); const Shape output_shape = reshape->shape(); - const int64 static_input_dim_size = operand_shape.dimensions()[input_dim]; - - // Don't need to rewrite size 1 input dims. - if (static_input_dim_size == 1) { - return Status::OK(); - } - - auto common_factors = - CommonFactors(operand_shape.dimensions(), output_shape.dimensions()); - // If there are multiple input dims combining into one output dim, - // input_dim_start and input_dim_end represent the input dimension range. - int64 input_dim_start = -1; - int64 input_dim_end = -1; - // Similarly when one input dim is splitted into multiple outputs, we use - // output_dim_start and output_dim_start to represent the output dimension - // range. - int64 output_dim_start = -1; - int64 output_dim_end = -1; - // Find common_factors that the input belong to. - for (int64 i = 0; i < common_factors.size() - 1; ++i) { - auto start = common_factors[i]; - auto end = common_factors[i + 1]; - if (input_dim >= start.first && input_dim < end.first) { - // Found the common_factor group that the input_dim belongs to. - input_dim_start = start.first; - input_dim_end = end.first; - output_dim_start = start.second; - output_dim_end = end.second; + if (input_dims.size() == 1) { + int64 input_dim = input_dims[0]; + // Size 1 dimension doesn't need a rewrite. + if (operand_shape.dimensions()[input_dim] == 1) { + return Status::OK(); } - } - - TF_RET_CHECK(output_dim_end - output_dim_start > 0); - - std::vector output_dims; - for (int64 i = output_dim_start; i < output_dim_end; ++i) { - output_dims.push_back(i); - } - - const int64 first_output_dim = output_dims[0]; - - if (reshape->shape().dimensions(first_output_dim) < static_input_dim_size) { // One input dimension is splitted into multiple output dimensions. 
return RewriteDynamicReshapeSplitInput(reshape, input_dim, output_dims, + output_dynamic_dims, dynamic_dimension_inference); } - if (reshape->shape().dimensions(first_output_dim) == static_input_dim_size) { - // Unchanged dynamic dimension doesn't need a rewrite. - return Status::OK(); - } - - // Multiple dimensions got combined into one output. - if (input_dim != input_dim_start) { - // If 'input_dim' is not the first dimension that got combined into the - // output. A reshape rewrite on the output is needed: - // - // Need a write (d is dynamic): - // 1, 2, d - // | - // Reshape - // | - // 2d - // - // Don't need rewrite: - // d, 2 - // | - // Reshape - // | - // 2d - // - return RewriteDynamicReshapeCombineInput(reshape, input_dim, - first_output_dim, dynamic_size, + if (output_dims.size() == 1) { + int64 output_dim = output_dims[0]; + if (output_shape.dimensions()[output_dim] == 1) { + return Status::OK(); + } + // One input dimension is splitted into multiple output dimensions. + return RewriteDynamicReshapeCombineInput(reshape, input_dims, output_dim, + input_dynamic_dims, dynamic_dimension_inference); } + // Shouldn't get here; + TF_RET_CHECK(false); return Status::OK(); } @@ -718,23 +794,85 @@ StatusOr RewriteDynamicReshape( DynamicDimensionInference* dynamic_dimension_inference) { bool changed = false; HloInstruction* operand = reshape->mutable_operand(0); + std::vector input_dynamic_dims; + for (int64 dim = 0; dim < operand->shape().dimensions_size(); ++dim) { + input_dynamic_dims.push_back( + dynamic_dimension_inference->GetDynamicSize(operand, {}, dim)); + } - // We append sort instructions after reshape if there is a dynamic input, and - // the order of sort matters. Rewrite minor dimensions first in case multiple - // inputs have dynamic dimensions to ensure correct order of sort. - for (int64 input_dim = operand->shape().rank() - 1; input_dim >= 0; - --input_dim) { - HloInstruction* operand_dynamic_size = - dynamic_dimension_inference->GetDynamicSize(operand, {}, input_dim); + std::vector output_dynamic_dims; + for (int64 dim = 0; dim < reshape->shape().dimensions_size(); ++dim) { + output_dynamic_dims.push_back( + dynamic_dimension_inference->GetDynamicSize(reshape, {}, dim)); + } - if (operand_dynamic_size == nullptr) { + auto common_factors = CommonFactors(operand->shape().dimensions(), + reshape->shape().dimensions()); + // Find common_factors that the input belongs to. + for (int64 i = 0; i < common_factors.size() - 1; ++i) { + auto start = common_factors[i]; + auto end = common_factors[i + 1]; + std::vector input_dims; + std::vector output_dims; + for (int64 dim = start.first; dim < end.first; ++dim) { + input_dims.push_back(dim); + } + for (int64 dim = start.second; dim < end.second; ++dim) { + output_dims.push_back(dim); + } + + VLOG(2) << "input_dims: " << VectorString(input_dims); + VLOG(2) << "output_dims: " << VectorString(output_dims); + + if (input_dims.empty() || output_dims.empty()) { continue; } - TF_RETURN_IF_ERROR(RewriteDynamicReshapeSingleDim( - reshape, input_dim, operand_dynamic_size, dynamic_dimension_inference)); + bool has_dynamic_dimension = absl::c_any_of(output_dims, [&](int64 dim) { + HloInstruction* operand_dynamic_size = + dynamic_dimension_inference->GetDynamicSize(reshape, {}, dim); - changed = true; + return operand_dynamic_size != nullptr || + reshape->shape().is_dynamic_dimension(dim); + }); + + if (!has_dynamic_dimension) { + // Don't need to rewrite any group without dynamic dimensions. 
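Editor's note: CommonFactors is the existing XLA shape utility used above; as a reference, the simplified host-side sketch below (assuming static bounds and equal element counts) produces the boundaries that delimit each (input_dims, output_dims) group, e.g. {2,3,4} vs {6,4} gives groups {0,1} -> {0} and {2} -> {1}:

#include <cstdint>
#include <utility>
#include <vector>

// Boundaries (i, j) where the product of the first i input dims equals the
// product of the first j output dims; consecutive boundaries delimit groups.
std::vector<std::pair<size_t, size_t>> CommonFactorBoundaries(
    const std::vector<int64_t>& in, const std::vector<int64_t>& out) {
  std::vector<std::pair<size_t, size_t>> bounds = {{0, 0}};
  size_t i = 0, j = 0;
  int64_t prod_in = 1, prod_out = 1;
  while (i < in.size() || j < out.size()) {
    if (j == out.size() || (i < in.size() && prod_in <= prod_out)) {
      prod_in *= in[i++];
    } else {
      prod_out *= out[j++];
    }
    if (prod_in == prod_out) bounds.push_back({i, j});
  }
  return bounds;  // {2,3,4} vs {6,4} -> {(0,0), (2,1), (3,2)}
}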
+ VLOG(2) << "All dimensions are static in this common factor group"; + continue; + } + + if (input_dims.size() == 1 && output_dims.size() == 1) { + // The dimension is unchanged. No rewrite needed. + continue; + } + if (input_dims.size() > 1 && output_dims.size() > 1) { + // We don't support the case when a dynamic dimension is both combined + // with and splitted into other dimensions: + // + // [x, yz] + // | Reshape + // [xy, z] + // + // TODO(yunxing): This can be supported by canonicalizing + // the offending reshape into two reshapes: + // + // [x,yz] + // | Reshape + // [x, y, z] + // | Reshape + // [xy, z] + // + return Unimplemented( + "Dynamic input dimension to reshape that is both splitted and " + "combined is not supported %s", + reshape->ToString()); + } + + TF_RETURN_IF_ERROR(RewriteDynamicReshapeSingleGroup( + reshape, input_dims, output_dims, absl::MakeSpan(input_dynamic_dims), + absl::MakeSpan(output_dynamic_dims), dynamic_dimension_inference)); } + return changed; } @@ -806,106 +944,6 @@ Status InsertPadToStaticAfterModuleInputs(HloModule* module) { return Status::OK(); } -// For all dynamic outputs that live out of the computation, add -// slice-to-dynamic operations. -Status InsertSliceToDynamicBeforeModuleOutputs( - const DynamicDimensionInference& dynamic_dimension_inference, - HloModule* module) { - auto root = module->entry_computation()->root_instruction(); - absl::flat_hash_set dynamic_outputs; - ShapeUtil::ForEachSubshape( - root->shape(), [&](const Shape& subshape, const ShapeIndex& index) { - if (subshape.IsArray()) { - bool has_dynamic_output = false; - for (int64 dim = 0; dim < subshape.rank(); ++dim) { - if (dynamic_dimension_inference.GetDynamicSize(root, index, dim) != - nullptr) { - CHECK_LE(index.size(), 1) << "XLA doesn't support nested output " - "dimension that has dynamic size"; - has_dynamic_output = true; - } - } - if (has_dynamic_output) { - dynamic_outputs.insert(index); - } - } - }); - if (!dynamic_outputs.empty()) { - if (root->shape().IsTuple()) { - std::vector new_root_operands; - ShapeUtil::ForEachSubshape(root->shape(), [&](const Shape& subshape, - const ShapeIndex& index) { - if (!subshape.IsArray()) { - return; - } - - auto gte = module->entry_computation()->AddInstruction( - HloInstruction::CreateGetTupleElement( - ShapeUtil::MakeShapeWithStaticDimensions(subshape), root, - index[0])); - - if (dynamic_outputs.contains(index)) { - CHECK_EQ(index.size(), 1) - << "XLA only support 1 layer nested output tuple"; - // For dynamic outputs, creates an slice operation. - std::vector slice_operands; - // First operand is the original input. Rest are dimension values. - slice_operands.push_back(gte); - // Keep a dynamic version of the subshape as we are removing the - // dynamic dimension in the original root and gte. - Shape dynamic_subshape = subshape; - for (int64 dim = 0; dim < subshape.rank(); ++dim) { - HloInstruction* dynamic_size = - dynamic_dimension_inference.GetDynamicSize(root, index, dim); - if (dynamic_size != nullptr) { - slice_operands.push_back(dynamic_size); - } else { - auto const_size = HloInstruction::CreateConstant( - LiteralUtil::CreateR0(subshape.dimensions(dim))); - slice_operands.push_back( - module->entry_computation()->AddInstruction( - std::move(const_size))); - } - } - // This is a dynamic output, add slice operation. 
- auto slice = HloInstruction::CreateCustomCall( - dynamic_subshape, slice_operands, "SliceToDynamic"); - new_root_operands.push_back( - module->entry_computation()->AddInstruction(std::move(slice))); - } else { - new_root_operands.push_back(gte); - } - }); - - auto new_root = module->entry_computation()->AddInstruction( - HloInstruction::CreateTuple(new_root_operands)); - module->entry_computation()->set_root_instruction(new_root); - } else { - std::vector slice_operands; - // First operand is the original input. Rest are dimension values. - slice_operands.push_back(root); - for (int64 dim = 0; dim < root->shape().rank(); ++dim) { - HloInstruction* dynamic_size = - dynamic_dimension_inference.GetDynamicSize(root, {}, dim); - if (dynamic_size != nullptr) { - slice_operands.push_back(dynamic_size); - } else { - auto const_size = HloInstruction::CreateConstant( - LiteralUtil::CreateR0(root->shape().dimensions(dim))); - slice_operands.push_back(module->entry_computation()->AddInstruction( - std::move(const_size))); - } - // This is a dynamic output, add slice operation. - auto slice = module->entry_computation()->AddInstruction( - HloInstruction::CreateCustomCall(root->shape(), slice_operands, - "SliceToDynamic", "0-0")); - module->entry_computation()->set_root_instruction(slice); - } - } - } - return Status::OK(); -} - // Remove all dynamic shapes between pad-to-static and slice-to-dynamic. // // After this visitor the entry computation then looks like: @@ -922,46 +960,217 @@ Status InsertSliceToDynamicBeforeModuleOutputs( // ROOT tuple (dynamic) class DynamicShapeRemovingVisitor : public DfsHloVisitorWithDefault { public: + explicit DynamicShapeRemovingVisitor( + const DynamicPadder::OpSupportsDynamismHandler& + op_supports_dynamism_handler, + const DynamicDimensionInference& dynamic_dimension_inference) + : op_supports_dynamism_handler_(op_supports_dynamism_handler), + dynamic_dimension_inference_(dynamic_dimension_inference) {} + Status DefaultAction(HloInstruction* hlo) override; Status HandleCustomCall(HloInstruction* hlo) override; + Status HandleTuple(HloInstruction* hlo) override; + Status HandleGetTupleElement(HloInstruction* hlo) override; + Status HandleParameter(HloInstruction* hlo) override; - static Status Run(HloComputation* computation) { - DynamicShapeRemovingVisitor visitor; - return computation->Accept(&visitor); + static Status Run(HloComputation* computation, + const DynamicPadder::OpSupportsDynamismHandler& + op_supports_dynamism_handler, + const DynamicDimensionInference& dynamic_shape_inference, + bool require_dynamic_output) { + DynamicShapeRemovingVisitor visitor(op_supports_dynamism_handler, + dynamic_shape_inference); + TF_RETURN_IF_ERROR(computation->Accept(&visitor)); + // If the outputs is required to be dynamic form, insert static to dynamic + // conversion as root. + if (require_dynamic_output) { + HloInstruction* root = computation->root_instruction(); + if (dynamic_shape_inference.HasDynamicDimension(root)) { + HloInstruction* new_root = visitor.ConvertToDynamic(root); + computation->set_root_instruction(new_root); + } + } + return Status::OK(); } + + private: + // If a tensor produced by `inst` is in dynamic form, convert it to static and + // returns the new instruction. + HloInstruction* ConvertToStatic(HloInstruction* inst); + + // If a tensor produced by `inst` is in static form, convert it to dynamic and + // returns the new instruction. 
+ HloInstruction* ConvertToDynamic(HloInstruction* inst); + + const DynamicPadder::OpSupportsDynamismHandler& op_supports_dynamism_handler_; + + const DynamicDimensionInference& dynamic_dimension_inference_; }; +HloInstruction* DynamicShapeRemovingVisitor::ConvertToDynamic( + HloInstruction* inst) { + auto* comp = inst->parent(); + const Shape& shape = inst->shape(); + if (shape.IsTuple()) { + std::vector dynamic_operands; + for (int64 i = 0; i < shape.tuple_shapes_size(); ++i) { + auto operand = inst->mutable_operand(i); + if (dynamic_dimension_inference_.HasDynamicDimension(operand)) { + // Recurse. + dynamic_operands.push_back(ConvertToDynamic(operand)); + } else { + dynamic_operands.push_back(operand); + } + } + return comp->AddInstruction(HloInstruction::CreateTuple(dynamic_operands)); + } else { + // Collect the data input, as well as dimension sizes, and feed them to + // slice to dynamic to create a dynamic tensor. + Shape output_shape = shape; // 0th element. + CHECK(output_shape.is_static()); + std::vector slice_operand; + slice_operand.push_back(inst); + for (int64 i = 0; i < output_shape.dimensions_size(); ++i) { + auto dimension_size = + dynamic_dimension_inference_.GetDynamicSize(inst, {}, i); + if (dimension_size == nullptr) { + dimension_size = comp->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR0(output_shape.dimensions(i)))); + } else { + output_shape.set_dynamic_dimension(i, true); + } + slice_operand.push_back(dimension_size); + } + return comp->AddInstruction(HloInstruction::CreateCustomCall( + output_shape, slice_operand, "SliceToDynamic")); + } +} + +HloInstruction* DynamicShapeRemovingVisitor::ConvertToStatic( + HloInstruction* inst) { + auto* comp = inst->parent(); + const Shape& shape = inst->shape(); + CHECK(shape.is_dynamic()); + if (shape.IsTuple()) { + std::vector static_operands; + for (int64 i = 0; i < shape.tuple_shapes_size(); ++i) { + auto operand = inst->mutable_operand(i); + if (shape.tuple_shapes(i).is_dynamic()) { + static_operands.push_back(ConvertToStatic(operand)); + } else { + static_operands.push_back(operand); + } + } + return comp->AddInstruction(HloInstruction::CreateTuple(static_operands)); + } else { + // The output shape of pad static is a tuple. The 0th element is the data + // output, which is the same as input shape, but without dynamic dimensions. + // i-th element is the dynamic dimension size for i-1th input dimension. + Shape data_output_shape = shape; // 0th element. + data_output_shape.clear_dynamic_dimensions(); + Shape output_shape = ShapeUtil::MakeTupleShape({data_output_shape}); + for (int64 i = 0; i < shape.rank(); ++i) { + ShapeUtil::AppendShapeToTuple(ShapeUtil::MakeScalarShape(S32), + &output_shape); + } + HloInstruction* pad_to_static = + comp->AddInstruction(HloInstruction::CreateCustomCall( + output_shape, {inst}, "PadToStatic", "")); + HloInstruction* data_output = + comp->AddInstruction(HloInstruction::CreateGetTupleElement( + data_output_shape, pad_to_static, 0)); + return data_output; + } +} + Status DynamicShapeRemovingVisitor::DefaultAction(HloInstruction* hlo) { - // Default rule: If input to an op is static, remove dynamism in output. 
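Editor's note: as a rough mental model only (the real custom calls operate on device buffers and handle tuples recursively), a dynamic rank-1 tensor behaves like a padded buffer paired with a logical size; ConvertToStatic/ConvertToDynamic shuttle between the two representations via PadToStatic and SliceToDynamic. Everything below is illustrative, not the runtime representation:

#include <cstdint>
#include <utility>
#include <vector>

// Illustrative only: a dynamic rank-1 tensor as (padded buffer, logical size).
struct DynamicR1 {
  std::vector<int32_t> padded;  // length == static bound
  int64_t dynamic_size;         // number of valid leading elements
};

// "PadToStatic": drop the dynamic bound, return static data plus a size scalar.
std::pair<std::vector<int32_t>, int64_t> PadToStatic(const DynamicR1& t) {
  return {t.padded, t.dynamic_size};
}

// "SliceToDynamic": reattach an explicit size to a static buffer.
DynamicR1 SliceToDynamic(std::vector<int32_t> data, int64_t size) {
  return DynamicR1{std::move(data), size};
}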
- bool input_is_dynamic = false; - // Default rule: - for (int64 i = 0; i < hlo->operand_count(); ++i) { - if (!hlo->operand(i)->shape().is_static()) { - input_is_dynamic = true; + const bool input_is_dynamic = absl::c_any_of( + hlo->operands(), + [](const HloInstruction* hlo) { return hlo->shape().is_dynamic(); }); + + // By default, ops don't support dynamic lowering. + OpDynamismSupport op_support = OpDynamismSupport::kNoSupport; + if (op_supports_dynamism_handler_) { + op_support = op_supports_dynamism_handler_(hlo); + } + if (op_support == OpDynamismSupport::kNoSupport) { + for (auto* sub_computation : hlo->called_computations()) { + for (auto* param : sub_computation->parameter_instructions()) { + param->mutable_shape()->clear_dynamic_dimensions(); + } } } - - if (!input_is_dynamic) { + // If the input to an op is static and the op doesn't support + // dynamic output, remove dynamism in output -- dynamic_padder should have + // rewritten it to support static shapes. + if (!input_is_dynamic && op_support == OpDynamismSupport::kNoSupport) { hlo->mutable_shape()->clear_dynamic_dimensions(); + return Status::OK(); } + + // Op doesn't support dynamic tensor: For each operand rewrite dynamic input + // into static input using pad_to_static. + if (input_is_dynamic && op_support == OpDynamismSupport::kNoSupport) { + VLOG(1) << "op doesn't support dynamic tensor: " << hlo->ToString(); + for (int64 i = 0; i < hlo->operand_count(); ++i) { + if (hlo->operand(i)->shape().is_dynamic()) { + auto static_operand = ConvertToStatic(hlo->mutable_operand(i)); + TF_RETURN_IF_ERROR(hlo->ReplaceOperandWith(i, static_operand)); + } + } + // This op doesn't support dynamic lowering so the op has to be static. + hlo->mutable_shape()->clear_dynamic_dimensions(); + return Status::OK(); + } + + // If the op requires dynamic tensor and input is static -- construct a + // dynamic tensor from the static tensor to feed it. + if (!input_is_dynamic && op_support == OpDynamismSupport::kRequired) { + VLOG(1) << "op doesn't support static tensor: " << hlo->ToString(); + for (int64 i = 0; i < hlo->operand_count(); ++i) { + auto operand = hlo->mutable_operand(i); + if (dynamic_dimension_inference_.HasDynamicDimension(operand)) { + auto dynamic_operand = ConvertToDynamic(hlo->mutable_operand(i)); + TF_RETURN_IF_ERROR(hlo->ReplaceOperandWith(i, dynamic_operand)); + } + } + return Status::OK(); + } + return Status::OK(); } -Status DynamicShapeRemovingVisitor::HandleCustomCall(HloInstruction* hlo) { - if (hlo->custom_call_target() == "SliceToDynamic") { - // Don't remove slice-to-dynamic instruction. - return Status::OK(); +Status DynamicShapeRemovingVisitor::HandleGetTupleElement(HloInstruction* hlo) { + *hlo->mutable_shape() = + hlo->operand(0)->shape().tuple_shapes(hlo->tuple_index()); + return Status::OK(); +} + +Status DynamicShapeRemovingVisitor::HandleTuple(HloInstruction* hlo) { + for (int64 i = 0; i < hlo->operand_count(); ++i) { + *hlo->mutable_shape()->mutable_tuple_shapes(i) = hlo->operand(i)->shape(); } - return DefaultAction(hlo); + return Status::OK(); } Status DynamicShapeRemovingVisitor::HandleParameter(HloInstruction* hlo) { return Status::OK(); } +Status DynamicShapeRemovingVisitor::HandleCustomCall(HloInstruction* hlo) { + if (hlo->custom_call_target() == "SliceToDynamic" || + hlo->custom_call_target() == "PadToStatic") { + // Those ops support are created to handle dynamic tensors so by their + // nature they support dynamic lowering. 
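Editor's note: the branches above amount to a small decision table over (operand dynamism, op support); a condensed sketch with simplified, illustrative types:

enum class Support { kNoSupport, kOptional, kRequired };
enum class Action {
  kNone,                    // leave the op alone
  kClearDynamicDims,        // static inputs, op needs static: drop dynamic dims
  kPadOperandsToStatic,     // dynamic inputs, op needs static: insert PadToStatic
  kSliceOperandsToDynamic,  // static inputs, op needs dynamic: insert SliceToDynamic
};

// Mirrors DefaultAction: unsupported ops get static inputs (and a static
// result shape), dynamic-only ops get dynamic inputs, optional ops pass through.
Action Decide(bool input_is_dynamic, Support op_support) {
  if (op_support == Support::kNoSupport) {
    return input_is_dynamic ? Action::kPadOperandsToStatic
                            : Action::kClearDynamicDims;
  }
  if (op_support == Support::kRequired && !input_is_dynamic) {
    return Action::kSliceOperandsToDynamic;
  }
  return Action::kNone;
}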
+ return Status::OK(); + } + + return DefaultAction(hlo); +} + } // namespace StatusOr DynamicPadder::Run(HloModule* module) { @@ -1000,11 +1209,20 @@ StatusOr DynamicPadder::Run(HloModule* module) { })); TF_RETURN_IF_ERROR(InsertPadToStaticAfterModuleInputs(module)); - TF_ASSIGN_OR_RETURN(DynamicDimensionInference dynamic_dimension_inference, - DynamicDimensionInference::Run(module)); + TF_ASSIGN_OR_RETURN( + DynamicDimensionInference dynamic_dimension_inference, + DynamicDimensionInference::Run(module, custom_call_handler_)); for (HloComputation* computation : module->computations()) { for (HloInstruction* inst : computation->MakeInstructionPostOrder()) { + OpDynamismSupport has_dynamism_support = OpDynamismSupport::kNoSupport; + if (op_supports_dynamism_handler_ != nullptr) { + has_dynamism_support = op_supports_dynamism_handler_(inst); + } + // This op support dynamic lowering, no padding is required. + if (has_dynamism_support != OpDynamismSupport::kNoSupport) { + continue; + } if (inst->opcode() == HloOpcode::kConcatenate) { TF_ASSIGN_OR_RETURN( changed, RewriteDynamicConcat(inst, &dynamic_dimension_inference)); @@ -1015,6 +1233,11 @@ StatusOr DynamicPadder::Run(HloModule* module) { changed, RewriteDynamicSort(inst, &dynamic_dimension_inference)); continue; } + if (inst->opcode() == HloOpcode::kReshape) { + TF_ASSIGN_OR_RETURN( + changed, RewriteDynamicReshape(inst, &dynamic_dimension_inference)); + continue; + } for (int64 operand_num = 0; operand_num < inst->operand_count(); ++operand_num) { HloInstruction* original_operand = inst->mutable_operand(operand_num); @@ -1023,11 +1246,6 @@ StatusOr DynamicPadder::Run(HloModule* module) { continue; } - if (inst->opcode() == HloOpcode::kReshape) { - TF_ASSIGN_OR_RETURN(changed, RewriteDynamicReshape( - inst, &dynamic_dimension_inference)); - continue; - } for (int64 input_dim = 0; input_dim < operand->shape().rank(); ++input_dim) { HloInstruction* operand_dynamic_size = @@ -1058,37 +1276,28 @@ StatusOr DynamicPadder::Run(HloModule* module) { } } } - if (slice_dynamic_output_) { - TF_RETURN_IF_ERROR(InsertSliceToDynamicBeforeModuleOutputs( - dynamic_dimension_inference, module)); - } - // Remove all dynamic dimensions after entry parameter and root instruction -- - // Dynamic padder will produce an equivalent static shaped graph. - for (HloComputation* computation : module->computations()) { - if (computation == module->entry_computation()) { - TF_RETURN_IF_ERROR(DynamicShapeRemovingVisitor::Run(computation)); - } else { - for (HloInstruction* inst : computation->MakeInstructionPostOrder()) { - bool operand_is_dynamic = false; - for (auto* operand : inst->operands()) { - if (!operand->shape().is_static()) { - operand_is_dynamic = true; - } - } - if (!operand_is_dynamic) { - inst->mutable_shape()->clear_dynamic_dimensions(); - } - } - } + // There are ops that only support dynamic lowering and ops that only support + // static lowering, add dynamic<->static tensor conversion around the boundary + // between those ops, as well as the root instruction. + auto computations = module->MakeComputationPostOrder(); + // Reverse postorder so that if caller doesn't support dynamic tensor (while, + // etc), change their called computation to only take static tensors. + for (auto it = computations.rbegin(); it != computations.rend(); ++it) { + HloComputation* computation = *it; + // if slice_dynamic_output_ is set and this is entry computation, we need + // the output tensor to be in dynamic form. 
+ bool require_dynamic_output = + slice_dynamic_output_ && computation == module->entry_computation(); + TF_RETURN_IF_ERROR(DynamicShapeRemovingVisitor::Run( + computation, op_supports_dynamism_handler_, dynamic_dimension_inference, + /*require_dynamic_output=*/require_dynamic_output)); } HloDCE dce; TF_ASSIGN_OR_RETURN(changed, dce.Run(module)); - VLOG(2) << "Post DynamicPadder HLO:"; XLA_VLOG_LINES(2, module->ToString()); - return changed; } diff --git a/tensorflow/compiler/xla/service/dynamic_padder.h b/tensorflow/compiler/xla/service/dynamic_padder.h index f0f3eed0a26..ca2513eaa5c 100644 --- a/tensorflow/compiler/xla/service/dynamic_padder.h +++ b/tensorflow/compiler/xla/service/dynamic_padder.h @@ -36,12 +36,38 @@ namespace xla { // Dynamic_padder removes dynamic shapes from the entry computation, and inserts // custom calls (with dynamic shapes), which are lowered by specialized // emitters: PadToStatic and SliceToDynamic. + +// Each instruction can have one of the three modes in supporting dynamic +// lowering. +enum OpDynamismSupport { + // There is no support for dynamic lowering -- dynamic padder will make sure + // the input to that op has static bound by rewriting the op (e.g, extra space + // in reduce_sum will be padded with 0). + kNoSupport = 0, + // The op can take either dynamic input or static input. + kOptional, + // The op only has a dynamic lowering, dynamic padder will make sure the input + // to this op is in dynamic form. + kRequired, +}; + class DynamicPadder : public HloModulePass { public: + // Returns true if given instruction supports native dynamic lowering. If so, + // dynamic padder will not attempt to pad it. + using OpSupportsDynamismHandler = + std::function; + // If `slice_dynamic_output` is true, insert 'slice_to_dynamic' ops to all // outputs that are inferred to be dynamic. - explicit DynamicPadder(bool slice_dynamic_output = true) - : slice_dynamic_output_(slice_dynamic_output) {} + explicit DynamicPadder( + bool slice_dynamic_output = true, + DynamicDimensionInference::CustomCallInferenceHandler + custom_call_handler = nullptr, + OpSupportsDynamismHandler op_supports_dynamism_handler = nullptr) + : slice_dynamic_output_(slice_dynamic_output), + custom_call_handler_(custom_call_handler), + op_supports_dynamism_handler_(op_supports_dynamism_handler) {} absl::string_view name() const override { return "dynamic_padder"; } @@ -51,6 +77,13 @@ class DynamicPadder : public HloModulePass { // Insert 'slice_to_dynamic' ops to all outputs that are inferred to be // dynamic. bool slice_dynamic_output_; + + // A handler for dynamic dimension inference of custom calls. + DynamicDimensionInference::CustomCallInferenceHandler custom_call_handler_; + + // A handler to indicate if a given hlo instruction support native dynamism + // lowering. 
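Editor's note: a minimal usage sketch of the extended constructor. The handler body and the choice of kSort are hypothetical, and `module` is assumed to be a std::unique_ptr<HloModule> in an enclosing Status-returning function; the test changes below wire up a similar handler for a custom call.

// Hypothetical: tell the padder this backend lowers kSort natively on dynamic
// shapes, so the pass should skip padding it.
DynamicPadder padder(
    /*slice_dynamic_output=*/true,
    /*custom_call_handler=*/nullptr,
    /*op_supports_dynamism_handler=*/[](HloInstruction* hlo) {
      return hlo->opcode() == HloOpcode::kSort ? OpDynamismSupport::kRequired
                                               : OpDynamismSupport::kNoSupport;
    });
TF_ASSIGN_OR_RETURN(bool changed, padder.Run(module.get()));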
+ OpSupportsDynamismHandler op_supports_dynamism_handler_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/dynamic_padder_test.cc b/tensorflow/compiler/xla/service/dynamic_padder_test.cc index c937bf2c723..e4c70317f2b 100644 --- a/tensorflow/compiler/xla/service/dynamic_padder_test.cc +++ b/tensorflow/compiler/xla/service/dynamic_padder_test.cc @@ -44,12 +44,49 @@ namespace op = xla::testing::opcode_matchers; namespace xla { namespace { +OpDynamismSupport OpHasDynamismSupport(HloInstruction* hlo) { + if (hlo->opcode() != HloOpcode::kCustomCall) { + return OpDynamismSupport::kNoSupport; + } + if (hlo->custom_call_target() == "OpWithDynamicLowering") { + return OpDynamismSupport::kRequired; + } + return OpDynamismSupport::kNoSupport; +} + +Status CustomCallDynamicDimensionInference( + HloInstruction* hlo, DynamicDimensionInference* inferencer) { + if (hlo->custom_call_target() == "OpWithDynamicLowering") { + if (hlo->shape().IsTuple()) { + // Use the operand's dynamic size as output dynamic size. + HloInstruction* dynamic_size = + inferencer->GetDynamicSize(hlo->mutable_operand(0), {1}, 0); + inferencer->SetDynamicSize(hlo, {1}, 0, dynamic_size); + } else { + // Use the operand's dynamic size as output dynamic size. + HloInstruction* dynamic_size = + inferencer->GetDynamicSize(hlo->mutable_operand(0), {}, 0); + inferencer->SetDynamicSize(hlo, {}, 0, dynamic_size); + } + } + + return Status::OK(); +} + class DynamicPadderTest : public HloTestBase { protected: DynamicPadderTest() : HloTestBase() { module_ = CreateNewVerifiedModule(); } + std::unique_ptr GetHloModule(const string& hlo_text) { + std::unique_ptr module = + ParseAndReturnVerifiedModule(hlo_text).ValueOrDie(); + return module; + } + StatusOr RunPadder() { - DynamicPadder padder; + DynamicPadder padder(/*slice_dynamic_output=*/true, + CustomCallDynamicDimensionInference, + OpHasDynamismSupport); return padder.Run(module_.get()); } @@ -105,6 +142,120 @@ TEST_F(DynamicPadderTest, ReduceTest) { ExpectPadded(reduce->operand(0)); } +TEST_F(DynamicPadderTest, DynamicLoweringTest) { + const string hlo_text = R"( +HloModule DynamicLowering + +ENTRY main { + param = s32[5] parameter(0) + const = s32[] constant(3) + param_padded = s32[<=5] set-dimension-size(param, const), + dimensions={0} + custom-call.1 = s32[<=5] custom-call(param_padded), + custom_call_target="OpWithDynamicLowering" + custom-call.2 = s32[<=5] custom-call(custom-call.1), + custom_call_target="OpWithDynamicLowering" + // Negate doesn't support dynamic lowering. + ROOT negate = s32[<=5] negate(custom-call.2) +} +)"; + + module_ = GetHloModule(hlo_text); + + TF_ASSERT_OK(RunPadder().status()); + // After rewrite, we should have : + // + // param + // | + // SliceToDynamic + // | + // OpWithDynamicLowering (custom_call_1) + // | + // OpWithDynamicLowering (custom_call_2) + // | + // PadToStatic + // | + // Negate + // | + // SliceToDynamic // Root require dynamic form tensor. 
+ auto custom_call_1 = + module_->entry_computation()->GetInstructionWithName("custom-call.1"); + auto custom_call_2 = + module_->entry_computation()->GetInstructionWithName("custom-call.2"); + // Test that the input to custom call + HloInstruction* slice_to_dynamic = custom_call_1->mutable_operand(0); + ASSERT_THAT(slice_to_dynamic->opcode(), HloOpcode::kCustomCall); + ASSERT_THAT(slice_to_dynamic->custom_call_target(), "SliceToDynamic"); + ASSERT_EQ(custom_call_2->user_count(), 1); + HloInstruction* pad_to_static = custom_call_2->users()[0]; + ASSERT_THAT(pad_to_static->opcode(), HloOpcode::kCustomCall); + ASSERT_THAT(pad_to_static->custom_call_target(), "PadToStatic"); + slice_to_dynamic = module_->entry_computation()->root_instruction(); + ASSERT_THAT(slice_to_dynamic->opcode(), HloOpcode::kCustomCall); + ASSERT_THAT(slice_to_dynamic->custom_call_target(), "SliceToDynamic"); +} + +TEST_F(DynamicPadderTest, DynamicLoweringTestTupleInput) { + const string hlo_text = R"( +HloModule DynamicLowering + +ENTRY main { + param = s32[5] parameter(0) + const = s32[] constant(3) + param_padded = s32[<=5] set-dimension-size(param, const), + dimensions={0} + // Create a tuple with static and dynamic componenet. + tuple_arg = (s32[], s32[<=5]) tuple(const, param_padded) + custom-call.1 = (s32[], s32[<=5]) custom-call(tuple_arg), + custom_call_target="OpWithDynamicLowering" + custom-call.2 = (s32[], s32[<=5]) custom-call(custom-call.1), + custom_call_target="OpWithDynamicLowering" + data = s32[<=5]{0} get-tuple-element(custom-call.2), index=1 + // Negate doesn't support dynamic lowering. + ROOT negate = s32[<=5] negate(data) +} +)"; + + module_ = GetHloModule(hlo_text); + + TF_ASSERT_OK(RunPadder().status()); + // After rewrite, we should have : + // + // param + // | + // SliceToDynamic + // | + // Tuple + // | + // OpWithDynamicLowering (custom_call_1) + // | + // OpWithDynamicLowering (custom_call_2) + // | + // GTE + // | + // PadToStatic + // | + // Negate + // | + // SliceToDynamic // Root require dynamic form tensor. + + auto* root = module_->entry_computation()->root_instruction(); + EXPECT_THAT(root, + op::CustomCall("SliceToDynamic", op::Negate(), op::Constant())); + HloInstruction* negate = root->mutable_operand(0); + EXPECT_THAT( + negate, + op::Negate(op::GetTupleElement(op::CustomCall( + "PadToStatic", op::GetTupleElement(op::CustomCall( + "OpWithDynamicLowering", ::testing::_)))))); + auto custom_call_1 = + module_->entry_computation()->GetInstructionWithName("custom-call.1"); + EXPECT_THAT(custom_call_1, + op::CustomCall( + "OpWithDynamicLowering", + op::Tuple(op::Constant(), op::CustomCall("SliceToDynamic")))); +} + TEST_F(DynamicPadderTest, ConvolutionTest) { auto builder = HloComputation::Builder(TestName()); constexpr int xdim = 3; @@ -844,6 +995,149 @@ ENTRY main { EXPECT_EQ(result, expected); } +XLA_TEST_F(ExecutionTest, ReshapeSplitCombineSameTime) { + // [<=4, 2, <=2] + // | + // Reshape + // | + // [2, <=2, <=4] + // + // Split one input dynamic dim to multiple output dims while combining two + // dimensions together. 
+ // + const string hlo_text = R"( +HloModule TensorFlowScatterV1 + +update_s32 (lhs: s32[], rhs: s32[]) -> s32[] { + lhs = s32[] parameter(0) + rhs = s32[] parameter(1) + ROOT add = s32[] add(lhs, rhs) +} + +ENTRY main { + param = s32[4, 2, 2] parameter(0) + two = s32[] constant(2) + one = s32[] constant(1) + param_padded_partial = s32[<=4, 2, 2] set-dimension-size(param, two), + dimensions={0} + + param_padded_dynamic = s32[<=4, 2, <=2] set-dimension-size(param_padded_partial, + one), + dimensions={2} + reshaped = s32[2, <=2, <=4] reshape(param_padded_dynamic), + inferred_dimension=1 + init = s32[] constant(0) + ROOT reduce = s32[] reduce(reshaped, init), + dimensions={0, 1, 2}, + to_apply=update_s32 +} +)"; + + // First and last dims are dynamic. Padded data are expressed as -1. + Literal operand = LiteralUtil::CreateR3({{{0, -1}, {1, -1}}, + {{2, -1}, {3, -1}}, + {{-1, -1}, {-1, -1}}, + {{-1, -1}, {-1, -1}}}); + auto module = GetHloModule(hlo_text); + + Literal result = PadAndExecute(std::move(module), {&operand}); + + // Reshaping (with correct reshape rewriting) produces: + // [[[0, 1, -1, -1], [-1, -1, -1, -1]], [[2, 3, -1, -1], [-1, -1, -1, -1]]] + // + // Dynamic padder auto pads -1 with 0. + // + // Reducing it produces 0 + 1 + 2 + 3 = 6 + + Literal expected = LiteralUtil::CreateR0(6); + + EXPECT_EQ(result, expected); +} + +XLA_TEST_F(ExecutionTest, WhileLoopStack) { + // Push into a dynamic sized stack with iteration number: + // init: + // [[P, P], + // [P, P], + // [P, P], + // [P, P]] + // First iteration i = 0: + // [[0, 0], + // [P, P], + // [P, P], + // [P, P]] + // Second iteration i = 1: + // [[0, 0], + // [1, 1], + // [P, P], + // [P, P]] + // Third iteration i = 2: + // [[0, 0], + // [1, 1], + // [2, 2], + // [P, P]] + + const string hlo_text = R"( +HloModule module + +update_s32 (lhs: s32[], rhs: s32[]) -> s32[] { + lhs = s32[] parameter(0) + rhs = s32[] parameter(1) + ROOT add = s32[] add(lhs, rhs) +} + +body { + stack = (s32[<=4,2]) parameter(0) + stack_buffer = s32[<=4, 2] get-tuple-element(stack), index=0 + stack_size = s32[] get-dimension-size(stack_buffer), dimensions={0} + zero = s32[] constant(0) + one = s32[] constant(1) + // content of the stack is the stack index broadcasted. 
+ new_data = s32[1, 2] broadcast(s32[] stack_size), dimensions={} + new_stack_buffer = s32[<=4, 2] dynamic-update-slice(stack_buffer, new_data, stack_size, zero) + new_stack_size = s32[] add(stack_size, one) + new_stack_buffer_dynamic = s32[<=4, 2]set-dimension-size(new_stack_buffer, new_stack_size), dimensions={0} + ROOT new_stack = (s32[<=4,2]) tuple(new_stack_buffer_dynamic) +} + +condition { + stack = (s32[<=4,2]) parameter(0) + stack_buffer = s32[<=4, 2] get-tuple-element(stack), index=0 + stack_size = s32[] get-dimension-size(stack_buffer), dimensions={0} + three = s32[] constant(3) + ROOT less-than = pred[] compare(s32[] stack_size, s32[] three), direction=LT +} + +ENTRY entry { + zero = s32[] constant(0) + pad = s32[] constant(-1) + stack_buffer_input = s32[4, 2] broadcast(s32[] pad), dimensions={} + stack_buffer_input_dynamic = s32[<=4, 2] set-dimension-size(stack_buffer_input, zero), dimensions={0} + input_tuple = (s32[<=4 ,2]) tuple(stack_buffer_input_dynamic) + while = (s32[<=4, 2]) while(input_tuple), body=body, condition=condition + stack_buffer = s32[<=4, 2] get-tuple-element(while), index=0 + ROOT reduce = s32[2] reduce(stack_buffer, zero), + dimensions={0}, + to_apply=update_s32 +} +)"; + + auto module = GetHloModule(hlo_text); + + Literal result = PadAndExecute(std::move(module), {}); + + // Stack has three valid items in it: + // [[0, 0], + // [1, 1], + // [2, 2], + // [P, P]] + // + // Reducing along major dimension gives us [3, 3] + Literal expected = LiteralUtil::CreateR1({{3, 3}}); + + EXPECT_EQ(result, expected); +} + XLA_TEST_F(ExecutionTest, DoubleDynamicDimension) { const string hlo_text = R"( HloModule TensorFlowScatterV1 diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc index 3eb6dab3129..8cb660de46c 100644 --- a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc +++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc @@ -461,6 +461,8 @@ StatusOr ElementalIrEmitter::EmitFloatUnaryOp( return EmitSqrt(op->shape().element_type(), operand_value); case HloOpcode::kRsqrt: return EmitRsqrt(op->shape().element_type(), operand_value); + case HloOpcode::kCbrt: + return EmitCbrt(op->shape().element_type(), operand_value); case HloOpcode::kFloor: return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::floor, {operand_value}, @@ -787,6 +789,9 @@ StatusOr ElementalIrEmitter::EmitComplexUnaryOp( case HloOpcode::kRsqrt: { return EmitComplexRsqrt(op, component_type, operand_value); } + case HloOpcode::kCbrt: { + return EmitComplexCbrt(op, component_type, operand_value); + } case HloOpcode::kNegate: return EmitComposeComplex(op, FNeg(EmitExtractReal(operand_value)), FNeg(EmitExtractImag(operand_value))); @@ -1081,6 +1086,19 @@ StatusOr ElementalIrEmitter::EmitComplexRsqrt( return EmitComposeComplex(op, real_part, imag_part); } +// +// Using EmitComplexPower with c=1.0/3.0 and d=0 +StatusOr ElementalIrEmitter::EmitComplexCbrt( + const HloInstruction* op, PrimitiveType prim_type, + llvm::Value* operand_value) { + auto type = llvm_ir::PrimitiveTypeToIrType(prim_type, module_); + auto third = llvm::ConstantFP::get(type, 1.0 / 3.0); + auto zero = llvm::ConstantFP::get(type, 0); + llvm::Value* a = EmitExtractReal(operand_value); + llvm::Value* b = EmitExtractImag(operand_value); + return EmitComplexPower(op, a, b, third, zero); +} + // (a+bi)^(c+di) = // (a*a+b*b)^(0.5c) * exp(-d*atan2(b,a)) * (cos(q) + i*sin(q)), // where q = c*atan2(b,a)+0.5d*ln(a*a+b*b) @@ -1392,6 +1410,19 @@ StatusOr 
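Editor's note: numerically, EmitComplexCbrt above computes the principal cube root (a power with exponent 1/3), which for a negative real input differs from the real-valued cbrt; a standard-library check of that behaviour:

#include <complex>
#include <cstdio>

int main() {
  // Principal cube root of -8: 2*exp(i*pi/3) = 1 + 1.732i, not -2.
  std::complex<double> z =
      std::pow(std::complex<double>(-8.0, 0.0), 1.0 / 3.0);
  std::printf("%f %+fi\n", z.real(), z.imag());  // ~1.000000 +1.732051i
  return 0;
}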
ElementalIrEmitter::EmitPow(PrimitiveType prim_type, {lhs->getType()}, b_); } +StatusOr ElementalIrEmitter::EmitCbrt(PrimitiveType prim_type, + llvm::Value* value) { + auto type = llvm_ir::PrimitiveTypeToIrType(prim_type, module_); + auto third = llvm::ConstantFP::get(type, 1.0 / 3.0); + auto abs_value = + llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::fabs, {value}, {type}, b_); + TF_ASSIGN_OR_RETURN(llvm::Value * abs_res, + EmitPow(prim_type, abs_value, third)); + auto signed_res = llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::copysign, + {abs_res, value}, {type}, b_); + return signed_res; +} + StatusOr ElementalIrEmitter::EmitAtan2(PrimitiveType prim_type, llvm::Value* lhs, llvm::Value* rhs) { @@ -2181,6 +2212,7 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator( case HloOpcode::kSign: case HloOpcode::kSin: case HloOpcode::kSqrt: + case HloOpcode::kCbrt: case HloOpcode::kTanh: return [this, hlo, &operand_to_generator]( const IrArray::Index& index) -> StatusOr { @@ -2390,6 +2422,43 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator( -> StatusOr { return EmitElementalDot(hlo, operand_to_generator, dot_result_index); }; + case HloOpcode::kMap: + return [this, hlo, &operand_to_generator]( + const IrArray::Index& index) -> StatusOr { + std::vector operands; + for (int i = 0; i < hlo->operand_count(); i++) { + TF_ASSIGN_OR_RETURN(llvm::Value * operand_value, + operand_to_generator.at(hlo->operand(i))(index)); + operands.push_back(operand_value); + } + std::vector input_generators; + for (const HloInstruction* instr : hlo->operands()) { + input_generators.push_back(operand_to_generator.at(instr)); + } + return EmitElementalMap(Cast(hlo), operands); + }; + case HloOpcode::kReduceWindow: + return [this, hlo, &operand_to_generator](const IrArray::Index& index) { + return EmitElementalReduceWindow( + Cast(hlo), + operand_to_generator.at(hlo->operand(0)), + operand_to_generator.at(hlo->operand(1)), index); + }; + case HloOpcode::kReduce: + return [this, hlo, &operand_to_generator](const IrArray::Index& index) { + auto reduce_instr = Cast(hlo); + std::vector input_generators; + for (const HloInstruction* instr : reduce_instr->inputs()) { + input_generators.push_back(operand_to_generator.at(instr)); + } + + std::vector initial_value_generators; + for (const HloInstruction* instr : reduce_instr->init_values()) { + initial_value_generators.push_back(operand_to_generator.at(instr)); + } + return EmitElementalReduce(reduce_instr, std::move(input_generators), + std::move(initial_value_generators), index); + }; default: return [hlo](const IrArray::Index& index) { return Unimplemented("Unhandled opcode for elemental IR emission: %s", @@ -2419,4 +2488,215 @@ llvm::Value* ElementalIrEmitter::EmitComposeComplex(const HloInstruction* op, return complex; } +StatusOr ElementalIrEmitter::EmitElementalMap( + const HloMapInstruction* map_instr, + absl::Span elemental_operands) { + TF_ASSIGN_OR_RETURN( + std::vector values, + EmitThreadLocalCall(*map_instr->to_apply(), elemental_operands, + llvm_ir::IrName(map_instr))); + CHECK_EQ(values.size(), 1); + return values[0]; +} + +StatusOr ElementalIrEmitter::EmitElementalReduceWindow( + const HloReduceWindowInstruction* reduce_window, + const llvm_ir::ElementGenerator& input_generator, + const llvm_ir::ElementGenerator& initial_value_generator, + const llvm_ir::IrArray::Index& index) { + // Pseudocode: + // for each index I in output + // value = init_value + // for each index W in window + // for each dimension i from 0 to rank - 1 + 
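Editor's note: for real types, EmitCbrt above uses pow on the absolute value plus copysign because pow(x, 1/3) is NaN for negative x; a host-side equivalent of the same trick:

#include <cmath>
#include <cstdio>

// cbrt(x) = copysign(|x|^(1/3), x), matching the intrinsic sequence emitted above.
double CbrtViaPow(double x) {
  return std::copysign(std::pow(std::fabs(x), 1.0 / 3.0), x);
}

int main() {
  std::printf("%f\n", CbrtViaPow(-27.0));           // -3.000000
  std::printf("%f\n", std::pow(-27.0, 1.0 / 3.0));  // nan: why the trick is needed
  return 0;
}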
// (input index I)[i] = O[i] * stride[i] + W[i] - pad_low[i] + // if I in bounds of input + // value = function(value, input[I]) + // output[O] = value + const HloInstruction* operand = reduce_window->operand(0); + const Window& window = reduce_window->window(); + + PrimitiveType operand_element_type = operand->shape().element_type(); + llvm::Value* accum_ptr = llvm_ir::EmitAllocaAtFunctionEntry( + llvm_ir::PrimitiveTypeToIrType(operand_element_type, module_), + "reduce_window_accum_ptr", b_); + { + TF_ASSIGN_OR_RETURN( + llvm::Value* const init_value, + initial_value_generator(llvm_ir::IrArray::Index(index.GetType()))); + Store(init_value, accum_ptr); + } + + llvm::Type* index_type = index.GetType(); + auto index_typed_const = [&](uint64 c) -> llvm::Constant* { + return index.GetConstantWithIndexType(c); + }; + + llvm_ir::ForLoopNest loops(IrName(reduce_window), b_, index_type); + std::vector window_size; + for (const auto& dim : window.dimensions()) { + window_size.push_back(dim.size()); + } + const IrArray::Index window_index = loops.AddLoopsForShape( + ShapeUtil::MakeShape(operand_element_type, window_size), "window"); + CHECK_EQ(window_index.size(), index.size()); + + SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), b_); + + std::vector input_multi_index(index.size()); + llvm::Value* in_bounds = b_->getInt1(true); + for (size_t i = 0; i < index.size(); ++i) { + llvm::Value* stridden_index = + NSWMul(index[i], index_typed_const(window.dimensions(i).stride())); + input_multi_index[i] = NSWSub( + NSWAdd( + stridden_index, + NSWMul(window_index[i], + index_typed_const(window.dimensions(i).window_dilation()))), + index_typed_const(window.dimensions(i).padding_low())); + + // We need to verify that we are not in the dilated base area. + llvm::Value* dilation_condition = + ICmpEQ(SRem(input_multi_index[i], + index_typed_const(window.dimensions(i).base_dilation())), + index_typed_const(0)); + in_bounds = And(in_bounds, dilation_condition); + + // Apply base dilation to the index. + input_multi_index[i] = + SDiv(input_multi_index[i], + index_typed_const(window.dimensions(i).base_dilation())); + + // We must check whether 0 <= input_multi_index[i] < bound, as + // otherwise we are in the pad and so can skip the computation. This + // comparison is equivalent to the unsigned comparison + // input_multi_index[i] < bound, as a negative value wraps to a large + // positive value. + in_bounds = And(in_bounds, + ICmpULT(input_multi_index[i], + index_typed_const(operand->shape().dimensions(i)))); + } + + llvm_ir::LlvmIfData if_data = + llvm_ir::EmitIfThenElse(in_bounds, "in_bounds", b_); + SetToFirstInsertPoint(if_data.true_block, b_); + + // We are not in pad, so do the computation. 
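Editor's note: a scalar, 1-D sketch of the window index arithmetic above, with addition standing in for the reducer computation; names and the simplified bounds handling are illustrative only:

#include <cstdint>
#include <vector>

// out[o] accumulates in[idx] for every window position w where
//   dilated = o * stride + w * window_dilation - pad_low
// lands on a base-dilation grid point inside the operand.
double ReduceWindow1D(const std::vector<double>& in, int64_t o,
                      int64_t window_size, int64_t stride, int64_t pad_low,
                      int64_t window_dilation, int64_t base_dilation,
                      double init) {
  double acc = init;
  for (int64_t w = 0; w < window_size; ++w) {
    int64_t dilated = o * stride + w * window_dilation - pad_low;
    if (dilated % base_dilation != 0) continue;  // inside a dilation gap
    int64_t idx = dilated / base_dilation;
    if (idx < 0 || idx >= static_cast<int64_t>(in.size())) continue;  // padding
    acc += in[idx];  // the reducer; the emitter calls to_apply() here instead
  }
  return acc;
}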
+ IrArray::Index input_index(input_multi_index, operand->shape(), index_type); + TF_ASSIGN_OR_RETURN(llvm::Value * input_value, input_generator(input_index)); + TF_ASSIGN_OR_RETURN( + std::vector accum_values, + EmitThreadLocalCall(*reduce_window->to_apply(), + {Load(accum_ptr), input_value}, "reducer_function")); + CHECK_EQ(accum_values.size(), 1); + Store(accum_values[0], accum_ptr); + + SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), b_); + return Load(accum_ptr); +} + +StatusOr ElementalIrEmitter::EmitElementalReduce( + const HloReduceInstruction* reduce, + std::vector input_generators, + std::vector initial_value_generators, + const llvm_ir::IrArray::Index& index) { + const Shape& out_shape = reduce->shape(); + bool is_variadic = !out_shape.IsArray(); + int accumulators_count = 1; + if (is_variadic) { + CHECK(out_shape.IsTuple()); + accumulators_count = out_shape.tuple_shapes_size(); + } + + absl::Span reduced_dimensions(reduce->dimensions()); + + std::vector accumulator_addrs; + std::vector accumulator_types; + llvm::Type* index_type = index.GetType(); + for (int i = 0; i < accumulators_count; i++) { + const Shape& element_shape = + is_variadic ? out_shape.tuple_shapes(i) : out_shape; + PrimitiveType accumulator_type = element_shape.element_type(); + llvm::Type* accumulator_llvm_type = + llvm_ir::PrimitiveTypeToIrType(accumulator_type, module_); + accumulator_types.push_back(accumulator_llvm_type); + + // Initialize an accumulator with init_value. + llvm::AllocaInst* accumulator_addr = llvm_ir::EmitAllocaAtFunctionEntry( + accumulator_llvm_type, "accumulator_" + std::to_string(i), b()); + TF_ASSIGN_OR_RETURN( + llvm::Value* const init_value, + initial_value_generators[i](llvm_ir::IrArray::Index(index_type))); + Store(init_value, accumulator_addr); + accumulator_addrs.push_back(accumulator_addr); + } + + // The enclosing loops go over all the target elements. Now we have to compute + // the actual target element. For this, we build a new loop nest to iterate + // over all the reduction dimensions in the argument. + // AddLoopsForShapeOnDimensions will return an Index where induction Value*s + // are placed for each dimension in dimensions, and all the rest are nullptrs. + llvm_ir::ForLoopNest loops(IrName(reduce, "inner"), b(), index_type); + const HloInstruction* arg = reduce->operand(0); + std::vector input_multi_index = + loops.AddLoopsForShapeOnDimensions(arg->shape(), reduced_dimensions, + "reduction_dim"); + + SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), b()); + + // Build a full index for the input argument, using input_multi_index as the + // base. In input_multi_index only the reduction dimensions are filled in. We + // fill in the rest of the dimensions with induction Value*s taken from + // 'index' which iterates over the target array. See the high-level + // description in the XLA documentation for details. 
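Editor's note: the index bookkeeping above (reduced dimensions come from the inner loop nest, the remaining ones from the output index) is easiest to picture as a plain nested loop; a 2-D sum-over-rows sketch with illustrative names:

#include <cstdint>
#include <vector>

// Reduce a row-major [rows, cols] array over dimension 0 (the "reduction_dim"
// loop); the surviving dimension's index comes from the output index, exactly
// like merging input_multi_index with `index` above.
std::vector<int32_t> ReduceOverDim0(const std::vector<int32_t>& data,
                                    int64_t rows, int64_t cols) {
  std::vector<int32_t> out(cols, 0);           // init_value = 0 per accumulator
  for (int64_t col = 0; col < cols; ++col) {   // output index
    for (int64_t row = 0; row < rows; ++row) { // reduction dimension
      out[col] += data[row * cols + col];      // accumulator update
    }
  }
  return out;
}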
+ auto it = index.begin(); + + for (auto& i : input_multi_index) { + if (i == nullptr) { + i = *it++; + } + } + CHECK(index.end() == it); + llvm_ir::IrArray::Index input_index(input_multi_index, arg->shape(), + index_type); + + std::vector reduction_operands; + for (llvm::Value* accum : accumulator_addrs) { + llvm::Value* accum_value = Load(accum); + reduction_operands.push_back(accum_value); + } + + for (int i = 0; i < accumulators_count; i++) { + TF_ASSIGN_OR_RETURN(llvm::Value* const input_element, + input_generators[i](input_index)); + reduction_operands.push_back(input_element); + } + + TF_ASSIGN_OR_RETURN( + std::vector results, + EmitThreadLocalCall(*reduce->to_apply(), reduction_operands, + "reduce_function")); + + CHECK(results.size() == accumulators_count); + for (int i = 0; i < accumulators_count; i++) { + Store(results[i], accumulator_addrs[i]); + } + SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), b()); + + if (is_variadic) { + // Emit a structure, as that what the LoopEmitter expects. + llvm::Value* returned_structure = llvm::UndefValue::get( + llvm::StructType::get(b()->getContext(), accumulator_types)); + for (int i = 0; i < accumulators_count; i++) { + llvm::Value* accumulator_value = Load(accumulator_addrs[i]); + returned_structure = + b()->CreateInsertValue(returned_structure, accumulator_value, i); + } + return returned_structure; + } else { + CHECK_EQ(accumulator_addrs.size(), 1); + return Load(accumulator_addrs[0]); + } +} + } // namespace xla diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.h b/tensorflow/compiler/xla/service/elemental_ir_emitter.h index 99833a5525f..06a9d7b194c 100644 --- a/tensorflow/compiler/xla/service/elemental_ir_emitter.h +++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.h @@ -17,12 +17,17 @@ limitations under the License. 
#define TENSORFLOW_COMPILER_XLA_SERVICE_ELEMENTAL_IR_EMITTER_H_ #include +#include +#include "absl/strings/string_view.h" +#include "absl/types/span.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Module.h" #include "llvm/IR/Value.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_instructions.h" #include "tensorflow/compiler/xla/service/hlo_module_config.h" +#include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h" #include "tensorflow/compiler/xla/service/llvm_ir/ir_builder_mixin.h" #include "tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h" #include "tensorflow/compiler/xla/statusor.h" @@ -116,6 +121,9 @@ class ElementalIrEmitter : public IrBuilderMixin { virtual StatusOr EmitSqrt(PrimitiveType prim_type, llvm::Value* value); + virtual StatusOr EmitCbrt(PrimitiveType prim_type, + llvm::Value* value); + virtual StatusOr EmitRsqrt(PrimitiveType prim_type, llvm::Value* value); @@ -159,6 +167,10 @@ class ElementalIrEmitter : public IrBuilderMixin { PrimitiveType prim_type, llvm::Value* operand_value); + virtual StatusOr EmitComplexCbrt(const HloInstruction* op, + PrimitiveType prim_type, + llvm::Value* operand_value); + virtual StatusOr EmitComplexRsqrt(const HloInstruction* op, PrimitiveType prim_type, llvm::Value* operand_value); @@ -213,6 +225,26 @@ class ElementalIrEmitter : public IrBuilderMixin { const HloToElementGeneratorMap& operand_to_generator, const llvm_ir::IrArray::Index& dot_result_index); + virtual StatusOr> EmitThreadLocalCall( + const HloComputation& callee, absl::Span parameters, + absl::string_view name) = 0; + + StatusOr EmitElementalMap( + const HloMapInstruction* map_instr, + absl::Span elemental_operands); + + StatusOr EmitElementalReduceWindow( + const HloReduceWindowInstruction* reduce_window, + const llvm_ir::ElementGenerator& input_generator, + const llvm_ir::ElementGenerator& initial_value_generator, + const llvm_ir::IrArray::Index& index); + + StatusOr EmitElementalReduce( + const HloReduceInstruction* reduce, + std::vector input_generators, + std::vector initial_value_generators, + const llvm_ir::IrArray::Index& index); + llvm::IRBuilder<>* const b_; llvm::Module* module_; diff --git a/tensorflow/compiler/xla/service/executable.h b/tensorflow/compiler/xla/service/executable.h index 8a9a96ce363..f1ac1fef451 100644 --- a/tensorflow/compiler/xla/service/executable.h +++ b/tensorflow/compiler/xla/service/executable.h @@ -149,9 +149,6 @@ class ExecutionOutput { to_be_released_.push_back(std::move(mem)); } - void SetOutputShapeTable(se::OwningDeviceMemory output_shape_table) { - output_shape_table_ = std::move(output_shape_table); - } // Should be called once it is known that the execute operation succeeded, // before returning the ExecutionOutput to the caller. 
@@ -164,19 +161,11 @@ class ExecutionOutput { ScopedShapedBuffer* MutableResult() { return &result_; } - const se::OwningDeviceMemory& ShapeTable() const { - return output_shape_table_; - } - ScopedShapedBuffer ConsumeResult() { aliased_indices_.clear(); return std::move(result_); } - se::OwningDeviceMemory ConsumeShapeTable() { - return std::move(output_shape_table_); - } - const std::vector& ToBeReleased() const { return to_be_released_; } diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD index 61bc41283e1..0f6b2cb72e6 100644 --- a/tensorflow/compiler/xla/service/gpu/BUILD +++ b/tensorflow/compiler/xla/service/gpu/BUILD @@ -684,7 +684,7 @@ cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:hlo_pass", - "//tensorflow/core:autotuning_proto_cc", + "//tensorflow/core/protobuf:autotuning_proto_cc", "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", "//tensorflow/core/util/proto:proto_utils", @@ -720,7 +720,7 @@ cc_library( "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:hlo_casting_utils", "//tensorflow/compiler/xla/service:hlo_pass", - "//tensorflow/core:autotuning_proto_cc", + "//tensorflow/core/protobuf:autotuning_proto_cc", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:stream_executor_no_cuda", @@ -1674,7 +1674,7 @@ tf_proto_library_cc( protodeps = [ "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/service:hlo_proto", - "//tensorflow/core:autotuning_proto", + "//tensorflow/core/protobuf:autotuning_proto", ], ) @@ -1685,8 +1685,8 @@ cc_library( deps = [ ":gpu_autotuning_proto_cc", "//tensorflow/compiler/xla:debug_options_flags", - "//tensorflow/core:autotuning_proto_cc", "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/core/protobuf:autotuning_proto_cc", "@com_google_absl//absl/container:flat_hash_map", ], ) diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_rewriter.cc b/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_rewriter.cc old mode 100755 new mode 100644 diff --git a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc index c6df786fb51..1be0b1b4e7b 100644 --- a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc +++ b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc @@ -305,168 +305,5 @@ llvm::Value* GpuElementalIrEmitter::EmitThreadId() { return NSWAdd(NSWMul(block_id, threads_per_block), thread_id_in_block); } -llvm_ir::ElementGenerator GpuElementalIrEmitter::MakeElementGenerator( - const HloInstruction* hlo, - const HloToElementGeneratorMap& operand_to_generator) { - switch (hlo->opcode()) { - case HloOpcode::kMap: - return [=, &operand_to_generator]( - const IrArray::Index& index) -> StatusOr { - TF_RET_CHECK(!hlo->operands().empty()) - << "Zero operand map not implemented in GPU backend."; - TF_RET_CHECK(hlo->to_apply()->num_parameters() > 0); - std::vector operand_elements; - for (HloInstruction* operand : hlo->operands()) { - TF_ASSIGN_OR_RETURN(llvm::Value * value, - operand_to_generator.at(operand)(index)); - operand_elements.push_back(value); - } - return compute_nested_(*hlo->to_apply(), operand_elements); - }; - case HloOpcode::kReduceWindow: - // Pseudocode: - // for each index I in output - // value = init_value - // for each index W in window - // for each dimension i from 0 to rank - 1 - // (input index I)[i] = O[i] * stride[i] + 
W[i] - pad_low[i] - // if I in bounds of input - // value = function(value, input[I]) - // output[O] = value - return [=, &operand_to_generator]( - const IrArray::Index& index) -> StatusOr { - const HloInstruction* operand = hlo->operand(0); - const Window& window = hlo->window(); - - PrimitiveType operand_element_type = operand->shape().element_type(); - llvm::Value* accum_ptr = llvm_ir::EmitAllocaAtFunctionEntry( - llvm_ir::PrimitiveTypeToIrType(operand_element_type, module_), - "reduce_window_accum_ptr", b_); - { - TF_ASSIGN_OR_RETURN(llvm::Value * init_value, - operand_to_generator.at(hlo->operand(1))( - IrArray::Index(index.GetType()))); - Store(init_value, accum_ptr); - } - - llvm::Type* index_type = index.GetType(); - auto index_typed_const = [&](uint64 c) -> llvm::Constant* { - return index.GetConstantWithIndexType(c); - }; - - llvm_ir::ForLoopNest loops(IrName(hlo), b_, index_type); - std::vector window_size; - for (const auto& dim : window.dimensions()) { - window_size.push_back(dim.size()); - } - const IrArray::Index window_index = loops.AddLoopsForShape( - ShapeUtil::MakeShape(operand_element_type, window_size), "window"); - CHECK_EQ(window_index.size(), index.size()); - - SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), b_); - - std::vector input_multi_index(index.size()); - llvm::Value* in_bounds = b_->getInt1(true); - for (size_t i = 0; i < index.size(); ++i) { - llvm::Value* stridden_index = NSWMul( - index[i], index_typed_const(window.dimensions(i).stride())); - input_multi_index[i] = NSWSub( - NSWAdd(stridden_index, - NSWMul(window_index[i], - index_typed_const( - window.dimensions(i).window_dilation()))), - index_typed_const(window.dimensions(i).padding_low())); - - // We need to verify that we are not in the dilated base area. - llvm::Value* dilation_condition = ICmpEQ( - SRem(input_multi_index[i], - index_typed_const(window.dimensions(i).base_dilation())), - index_typed_const(0)); - in_bounds = And(in_bounds, dilation_condition); - - // Apply base dilation to the index. - input_multi_index[i] = - SDiv(input_multi_index[i], - index_typed_const(window.dimensions(i).base_dilation())); - - // We must check whether 0 <= input_multi_index[i] < bound, as - // otherwise we are in the pad and so can skip the computation. This - // comparison is equivalent to the unsigned comparison - // input_multi_index[i] < bound, as a negative value wraps to a large - // positive value. - in_bounds = - And(in_bounds, - ICmpULT(input_multi_index[i], - index_typed_const(operand->shape().dimensions(i)))); - } - - llvm_ir::LlvmIfData if_data = - llvm_ir::EmitIfThenElse(in_bounds, "in_bounds", b_); - SetToFirstInsertPoint(if_data.true_block, b_); - - // We are not in pad, so do the computation. - IrArray::Index input_index(input_multi_index, operand->shape(), - index_type); - TF_ASSIGN_OR_RETURN(llvm::Value * input_value, - operand_to_generator.at(operand)(input_index)); - TF_ASSIGN_OR_RETURN( - llvm::Value * accum_value, - compute_nested_(*hlo->to_apply(), {Load(accum_ptr), input_value})); - Store(accum_value, accum_ptr); - - SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), b_); - return Load(accum_ptr); - }; - case HloOpcode::kReduce: - // TODO(b/118332391): This should be supported. 
- CHECK_EQ(hlo->operand_count(), 2) << "Did not expect variadic reduce"; - return [=, &operand_to_generator]( - const IrArray::Index& output_index) -> StatusOr { - const HloInstruction* operand = hlo->operand(0); - llvm::Value* accum_ptr = - b()->CreateAlloca(llvm_ir::PrimitiveTypeToIrType( - hlo->shape().element_type(), module_)); - llvm::Type* index_type = output_index.GetType(); - TF_ASSIGN_OR_RETURN(llvm::Value * init_value, - operand_to_generator.at(hlo->operand(1))( - IrArray::Index(index_type))); - b()->CreateStore(init_value, accum_ptr); - - llvm_ir::ForLoopNest loops(IrName(hlo), b_, index_type); - std::vector input_multi_index = - loops.AddLoopsForShapeOnDimensions( - operand->shape(), hlo->dimensions(), "reduction_dim"); - if (!ShapeUtil::IsScalar(hlo->shape())) { - // Here only input_multi_index[hlo->dimensions()] are non-null, so we - // must set the rest. - size_t j = 0; - for (auto& i : input_multi_index) { - if (i == nullptr) { - i = output_index[j++]; - } - } - CHECK_EQ(output_index.size(), j); - } - llvm_ir::IrArray::Index input_index( - input_multi_index, hlo->operand(0)->shape(), index_type); - - SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), b()); - TF_ASSIGN_OR_RETURN( - llvm::Value * input_value, - operand_to_generator.at(hlo->operand(0))(input_index)); - TF_ASSIGN_OR_RETURN( - llvm::Value * accum_value, - compute_nested_(*hlo->to_apply(), - {b()->CreateLoad(accum_ptr), input_value})); - b()->CreateStore(accum_value, accum_ptr); - SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), b()); - return b()->CreateLoad(accum_ptr); - }; - default: - return ElementalIrEmitter::MakeElementGenerator(hlo, - operand_to_generator); - } -} - } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h index c8a58a21980..3c4e9f7c1e6 100644 --- a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h +++ b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h @@ -47,10 +47,6 @@ class GpuElementalIrEmitter : public ElementalIrEmitter { llvm::Module* module, llvm::IRBuilder<>* b, NestedComputer compute_nested); - llvm_ir::ElementGenerator MakeElementGenerator( - const HloInstruction* hlo, - const HloToElementGeneratorMap& operand_to_generator) override; - protected: StatusOr EmitFloatBinaryOp(const HloInstruction* op, llvm::Value* lhs_value, @@ -92,6 +88,17 @@ class GpuElementalIrEmitter : public ElementalIrEmitter { StatusOr EmitComplexAbs(PrimitiveType prim_type, llvm::Value* value) override; + StatusOr> EmitThreadLocalCall( + const HloComputation& callee, absl::Span parameters, + absl::string_view) override { + // TODO(b/118332391): Supported variadic return values. + auto result = compute_nested_(callee, parameters); + if (!result.ok()) { + return result.status(); + } + return std::vector{result.ValueOrDie()}; + } + llvm::Value* EmitThreadId() override; private: diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc index 767c34b3a99..5f6dfd7d3a5 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc @@ -216,6 +216,7 @@ Status GpuCompiler::OptimizeHloModule( // bitcast. This leads to having to linearize and then delinearize the // index. options.set_replace_transpose_with_bitcast(false); + options.set_enable_conv_operand_swap(false); pass.AddPass(options); // AlgebraicSimplifier may add contracting dimensions to a dot. 
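The EmitThreadLocalCall override added above simply forwards to compute_nested_ and wraps its single result in a vector until variadic returns are supported. A minimal sketch of that adapter shape, using absl::StatusOr and placeholder names rather than the XLA interfaces:

#include <vector>

#include "absl/status/statusor.h"

// Stand-in for compute_nested_: a nested call that produces one value.
absl::StatusOr<int> ComputeNested(int x) { return x * 2; }

// Expose the single-result call through the multi-result interface and
// propagate errors unchanged.
absl::StatusOr<std::vector<int>> CallThreadLocal(int x) {
  auto result = ComputeNested(x);
  if (!result.ok()) {
    return result.status();
  }
  return std::vector<int>{*result};
}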
pass.AddPass(); @@ -321,6 +322,7 @@ Status GpuCompiler::OptimizeHloModule( HloPassPipeline pipeline("final_algebraic_simplifier"); AlgebraicSimplifierOptions options; options.set_is_layout_sensitive(true); + options.set_enable_conv_operand_swap(false); pipeline.AddPass(options); TF_RETURN_IF_ERROR(pipeline.Run(hlo_module).status()); } @@ -399,6 +401,7 @@ Status GpuCompiler::OptimizeHloPostLayoutAssignment( // bitcast. This leads to having to linearize and then delinearize the // index. options.set_replace_transpose_with_bitcast(false); + options.set_enable_conv_operand_swap(false); pipeline.AddPass>(options); if (RequireDeterminism() || @@ -406,6 +409,16 @@ Status GpuCompiler::OptimizeHloPostLayoutAssignment( pipeline.AddPass>(); } + // GemmRewriter assumes that all transposes are folded into gemms, but, + // since commit 7d529df, this is not always true at this point. + // Therefore, rerun transpose folding. + pipeline.AddPass( + [](const HloInstruction& dot, + const TransposeFolding::OperandIndices& candidate_operands) { + return IsMatrixMultiplication(dot) ? candidate_operands + : TransposeFolding::OperandIndices{}; + }, + TransposeFolding::NeverFoldTranspose); // Rewrite GEMMs into custom calls. pipeline.AddPass(); diff --git a/tensorflow/compiler/xla/service/gpu/gpu_conv_rewriter.cc b/tensorflow/compiler/xla/service/gpu/gpu_conv_rewriter.cc old mode 100755 new mode 100644 index 5936ed6c166..4a4448f668c --- a/tensorflow/compiler/xla/service/gpu/gpu_conv_rewriter.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_conv_rewriter.cc @@ -217,7 +217,7 @@ MatchBackwardFilter(HloInstruction* conv) { } } if (dim->padding_high() < 0) { - LOG(ERROR) + LOG(WARNING) << "Fusing this pattern to backward filter convolution would cause " "negative padding (" << dim->padding_high() @@ -428,7 +428,7 @@ MatchBackwardInput(HloInstruction* conv) { auto backward_padding_low = kernel_size - 1 - old_window.dimensions(i).padding_low(); if (backward_padding_low < 0) { - LOG(ERROR) + LOG(WARNING) << "The low padding of the backward convolution would be negative (" << backward_padding_low << "), which isn't supported by GpuConvPaddingLegalization " @@ -496,13 +496,13 @@ MatchBackwardInput(HloInstruction* conv) { // ABCD = BackwardInputConv(abc, xy, padding_low=1, padding_high=-1) // with positive padding low but negative padding high. if (dim->padding_high() < 0) { - LOG(ERROR) << "Fusing this pattern to backward convolution would cause " - "negative padding (" - << dim->padding_high() - << ") on right/bottom of the activations, which is not " - "supported by GpuConvPaddingLegalization (b/32744257). " - "Falling back to unfused convolution for instruction: " - << conv->ToString(); + LOG(WARNING) << "Fusing this pattern to backward convolution would cause " + "negative padding (" + << dim->padding_high() + << ") on right/bottom of the activations, which is not " + "supported by GpuConvPaddingLegalization (b/32744257). 
" + "Falling back to unfused convolution for instruction: " + << conv->ToString(); return no_match_result; } } diff --git a/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc b/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc index 1316e8ad1aa..bb4184ff76f 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc @@ -351,6 +351,9 @@ bool FusionWouldBeTooLarge(const HloInstruction& instr1, const HloInstruction& instr2) { if (SharedMemoryUsage(instr1) + SharedMemoryUsage(instr2) > kSharedMemoryBudgetInBytes) { + VLOG(5) << "Shared memory usage of fusion of " << instr1.ToString() + << " and " << instr2.ToString() << " would be over the budget of " + << kSharedMemoryBudgetInBytes << "B"; return true; } @@ -383,6 +386,14 @@ bool FusionWouldBeTooLarge(const HloInstruction& instr1, num_output_buffers <= kMaxOperandsAndOutputsPerFusion) { return false; + } else { + VLOG(5) << "Operand count of " + << "(" << instr1.ToString() << " ) = " << instr1.operand_count() + << " and ( " << instr2.ToString() + << " ) = " << instr2.operand_count() + << " and num_output_buffers = " << num_output_buffers + << " is bigger than the bound of " + << kMaxOperandsAndOutputsPerFusion; } // Compute the precise number of operands to the new fusion. diff --git a/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc b/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc index 05fa798dc39..cb22b4d9042 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc @@ -96,7 +96,8 @@ Status GpuTransferManager::EnqueueBuffersToInfeed( StatusOr GpuTransferManager::TransferBufferToInfeedInternal( se::StreamExecutor* executor, int64 size, const void* source) { if (size > std::numeric_limits::max()) { - return InvalidArgument("Infeed shape is too large: needs %d bytes", size); + return InvalidArgument("GPU infeed of %d bytes exceeds maximum of %d bytes", + size, std::numeric_limits::max()); } if (size == 0) { diff --git a/tensorflow/compiler/xla/service/gpu/horizontal_fusion.cc b/tensorflow/compiler/xla/service/gpu/horizontal_fusion.cc index 5e7593a82a6..6d663c66b50 100644 --- a/tensorflow/compiler/xla/service/gpu/horizontal_fusion.cc +++ b/tensorflow/compiler/xla/service/gpu/horizontal_fusion.cc @@ -192,6 +192,14 @@ bool IsProfitableFusionCandidate(const HloInstruction& instr) { return false; } + // We can emit DUS in-place, horizontally fusing it makes the emitter no + // longer recognize that it can be done in-place. This creates much slower + // code. This restriction could be lifted if buffer assignment would recognize + // that the DUS can be done in-place even inside of a horizontal fusion. 
+ if (root->opcode() == HloOpcode::kDynamicUpdateSlice) { + return false; + } + return true; } diff --git a/tensorflow/compiler/xla/service/gpu/horizontal_fusion_test.cc b/tensorflow/compiler/xla/service/gpu/horizontal_fusion_test.cc index e1024f6017c..bad589964ff 100644 --- a/tensorflow/compiler/xla/service/gpu/horizontal_fusion_test.cc +++ b/tensorflow/compiler/xla/service/gpu/horizontal_fusion_test.cc @@ -364,6 +364,45 @@ TEST_F(HorizontalFusionTest, RMSPropLike) { EXPECT_TRUE(RunAndCompare(std::move(module), ErrorSpec{1.0e-5, 1.0e-5})); } +TEST_F(HorizontalFusionTest, NegativeTestForDynamicUpdateSlice) { + auto module = ParseAndReturnVerifiedModule(R"( + HloModule NegativeTestForDynamicUpdateSlice + + fusion.1 { + p.0 = f16[5,9,10]{2,1,0} parameter(0) + p.1 = s32[1]{0} parameter(1) + p.2 = f16[1,9,10]{2,1,0} parameter(2) + c.0 = s32[] constant(0) + pad = s32[3]{0} pad(p.1, c.0), padding=0_2 + ROOT %dynamic-update-slice = f16[5,9,10]{2,1,0} dynamic-update-slice(p.0, p.2, pad) + } + + fusion.2 { + p.0 = f16[5,9,10]{2,1,0} parameter(0) + p.1 = s32[1]{0} parameter(1) + p.2 = f16[1,9,10]{2,1,0} parameter(2) + c.0 = s32[] constant(0) + pad = s32[3]{0} pad(p.1, c.0), padding=0_2 + ROOT %dynamic-update-slice = f16[5,9,10]{2,1,0} dynamic-update-slice(p.0, p.2, pad) + } + + ENTRY entry { + p.00 = f16[5,9,10]{2,1,0} parameter(0) + p.01 = f16[5,9,10]{2,1,0} parameter(1) + p.10 = s32[1]{0} parameter(2) + p.11 = s32[1]{0} parameter(3) + p.20 = f16[1,9,10]{2,1,0} parameter(4) + p.21 = f16[1,9,10]{2,1,0} parameter(5) + + f1 = f16[5,9,10] fusion(p.00, p.10, p.20), kind=kLoop, calls=fusion.1 + f2 = f16[5,9,10] fusion(p.01, p.11, p.21), kind=kLoop, calls=fusion.2 + ROOT tuple = (f16[5,9,10],f16[5,9,10]) tuple(f1, f2) + })") + .ValueOrDie(); + + EXPECT_FALSE(GpuHorizontalFusion().Run(module.get()).ValueOrDie()); +} + } // namespace } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc b/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc index fc1c1bb4ab1..a0580e2ab04 100644 --- a/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc +++ b/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc @@ -65,12 +65,16 @@ bool GpuInstructionFusion::ShouldFuseInexpensiveChecks(HloInstruction* consumer, bool GpuInstructionFusion::ShouldFuse(HloInstruction* consumer, int64 operand_index) { if (!ShouldFuseInexpensiveChecks(consumer, operand_index)) { + VLOG(5) << "Not fusing inexpensive checks of operand " << operand_index + << " of " << consumer->ToString(); return false; } auto producer = consumer->operand(operand_index); // The following checks are potentially expensive. if (FusionWouldBeTooLarge(*consumer, *producer)) { + VLOG(5) << "Fusion of (" << producer->ToString() << ") into (" + << consumer->ToString() << ") would be too large"; return false; } if (consumer->opcode() != HloOpcode::kFusion) { diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc index 011eb07d3bd..744cd7b56bf 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc @@ -222,7 +222,7 @@ bool IrEmitter::MaybeEmitDirectAtomicOperation( // Derive a minimum alignment from the type. The optimizer can increase it // later. 
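The ir_emitter.cc change below switches the store alignment from llvm::MaybeAlign to llvm::Align, i.e. a known, non-optional power-of-two alignment, matching the newer LLVM setAlignment signature. A self-contained sketch of setting an explicit store alignment through that API (assumes a recent LLVM development setup, not the TF build targets; the byte size is hard-coded for illustration):

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"

int main() {
  llvm::LLVMContext context;
  llvm::Module module("align_demo", context);
  llvm::IRBuilder<> b(context);

  auto* fn_type = llvm::FunctionType::get(b.getVoidTy(), /*isVarArg=*/false);
  auto* fn = llvm::Function::Create(fn_type, llvm::Function::ExternalLinkage,
                                    "store_demo", &module);
  b.SetInsertPoint(llvm::BasicBlock::Create(context, "entry", fn));

  llvm::Value* slot = b.CreateAlloca(b.getFloatTy());
  llvm::StoreInst* store =
      b.CreateStore(llvm::ConstantFP::get(b.getFloatTy(), 1.0), slot);
  // Minimum alignment derived from the element size (4 bytes for f32 here);
  // the optimizer may raise it later.
  store->setAlignment(llvm::Align(4));

  module.print(llvm::outs(), nullptr);
  return 0;
}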
store->setAlignment( - llvm::MaybeAlign(ShapeUtil::ByteSizeOfPrimitiveType(element_type))); + llvm::Align(ShapeUtil::ByteSizeOfPrimitiveType(element_type))); return true; } diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index b8154b0e157..a78ffc8dd1a 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -106,6 +106,11 @@ const auto kDimY = KernelMappingScheme::DimY; const auto kDimZ = KernelMappingScheme::DimZ; const auto kDimTot = KernelMappingScheme::DimTot; +const auto kLinearIndexingX = KernelMappingScheme::LinearIndexingX; +const auto kStridedIndexingX = KernelMappingScheme::StridedIndexingX; +const auto kStridedLinearIndexingX = + KernelMappingScheme::StridedLinearIndexingX; + // If a dimensions is smaller than this, untiled transposition may be more // efficient. const int64 kMinDimensionToTransposeTiled = 16; @@ -533,13 +538,11 @@ Status IrEmitterUnnested::EmitExtraOutputsForReduce( absl::Span> extra_output_gens) { for (int i = 0; i != extra_output_gens.size(); ++i) { - llvm::Value* extra_output_address = - GetIrArray(*unnested_hlo, *unnested_hlo, extra_output_gens[i].second) - .EmitArrayElementAddress(index, &b_, "extra_output_element_address", - use_linear_index); TF_ASSIGN_OR_RETURN(llvm::Value* const extra_output_ir_value, extra_output_gens[i].first(index)); - Store(extra_output_ir_value, extra_output_address); + GetIrArray(*unnested_hlo, *unnested_hlo, extra_output_gens[i].second) + .EmitWriteArrayElement(index, extra_output_ir_value, &b_, + use_linear_index); } return Status::OK(); } @@ -1865,7 +1868,6 @@ bool MayPreventVectorization(const HloInstruction& hlo) { return absl::c_any_of(hlo.fused_instructions_computation()->instructions(), [](const HloInstruction* instr) { switch (instr->opcode()) { - case HloOpcode::kReduce: case HloOpcode::kReduceWindow: case HloOpcode::kSort: case HloOpcode::kDot: @@ -1892,6 +1894,10 @@ bool MayPreventVectorization(const HloInstruction& hlo) { default: return false; } + } else if (hlo.opcode() == HloOpcode::kReduce && hlo.shape().IsArray()) { + // TODO: check if the to_apply() attribute contains instruction + // that break LLVM vectorization. + return false; } return true; } @@ -1920,13 +1926,59 @@ static llvm::Value* GetStartOffsetX(const KernelMappingScheme& mapping_scheme, llvm::Value* thread_id_x, llvm::Type* index_ty, llvm::IRBuilder<>* b) { - if (mapping_scheme.DilatedX()) { + auto constant = [&](int64 val) { + return llvm::ConstantInt::get(index_ty, val); + }; + if (mapping_scheme.GetIndexingOrder() == kStridedIndexingX) { return thread_id_x; + } else if (mapping_scheme.GetIndexingOrder() == kStridedLinearIndexingX) { + return b->CreateMul(thread_id_x, constant(mapping_scheme.GetVectorSize())); } + CHECK_EQ(mapping_scheme.GetIndexingOrder(), kLinearIndexingX); int64 x_num_steps = mapping_scheme.GetTileSizeX() / mapping_scheme.GetNumThreadsX(); - return b->CreateMul(thread_id_x, - llvm::ConstantInt::get(index_ty, x_num_steps)); + return b->CreateMul(thread_id_x, constant(x_num_steps)); +} + +// Calls `emit_elem_function()` `x_num_steps` times. If +// `vector_size`==1, then each element index passed to +// `emit_elem_function()` will be separated by `step_x`. If `vector_size`>1, +// then it must be a multiple of `x_num_steps`. In that case, it +// triggers a different indexing order that is vectorizable by +// LLVM. 
It generates many groups of calls to `emit_elem_function`. Each +// group is separated by `step_x` elements. Inside a group, elements +// are consecutive. If `check_x_tile_bounds` is true, then it will check +// if the element index is in bound compared to `tile_width` before +// calling `emit_elem_function`. +static void UnrollInnerTileLoop( + bool check_x_tile_bounds, int64 x_num_steps, int64 step_x, + int64 vector_size, const string& loop_name, KernelSupportLibrary* ksl, + llvm::Value* start_offset_x, llvm::Value* y_loc, llvm::Value* tile_width, + const IrArray::Index& source_idx, llvm::IRBuilder<>* b, + const IrEmitterUnnested::EmitElementFunction* emit_elem_function) { + llvm::Type* index_ty = tile_width->getType(); + auto constant = [&](int64 val) { + return llvm::ConstantInt::get(index_ty, val); + }; + IrArray::Index source_idx_x_base = source_idx.AddOffsetToDim(y_loc, kDimY, b); + for (int64 j = 0; j < x_num_steps / vector_size; j++) { + for (int64 i = 0; i < vector_size; i++) { + int64 linear_index = j * vector_size + i; + llvm::Value* x_loc = b->CreateAdd(constant(j * step_x * vector_size + i), + start_offset_x, "x_loc"); + IrArray::Index source_idx_x = source_idx_x_base.AddOffsetToDim( + constant(j * step_x * vector_size + i), kDimX, b); + auto emit_element = [&] { + return (*emit_elem_function)(source_idx_x, y_loc, x_loc, linear_index); + }; + if (check_x_tile_bounds) { + ksl->If(loop_name + "_x_in_tile", b->CreateICmpULT(x_loc, tile_width), + emit_element); + } else { + emit_element(); + } + } + } } void IrEmitterUnnested::EmitTile( @@ -1951,7 +2003,9 @@ void IrEmitterUnnested::EmitTile( // of threads. // Otherwise, the stride is one, but we multiply each offset by the limit of // number of steps which can be made. - int64 step_x = mapping_scheme.DilatedX() ? num_threads_x : 1; + int64 step_x = + mapping_scheme.GetIndexingOrder() == kLinearIndexingX ? 1 : num_threads_x; + int64 vector_size = mapping_scheme.GetVectorSize(); IrArray::Index source_idx = tile_origin_index.AddOffsetToDim(start_offset_x, kDimX, &b_); @@ -1962,7 +2016,9 @@ void IrEmitterUnnested::EmitTile( // True iff all threads always execute all instructions in the tiling // dimension X. - bool x_tile_fits = mapping_scheme.GetDimsInElems()[kDimX] % tile_size_x == 0; + bool x_tile_fits = + mapping_scheme.GetDimsInElems()[kDimX] % tile_size_x == 0 && + mapping_scheme.GetRowContiguous(); // The outer loop below is simply doing: // @@ -1978,32 +2034,40 @@ void IrEmitterUnnested::EmitTile( // // TODO(cheshire): Once ptxas is fixed and TF switches to it, remove the // workaround. 
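UnrollInnerTileLoop above splits each thread's x_num_steps elements into x_num_steps / vector_size groups: elements inside a group are consecutive, and groups are step_x * vector_size apart. With start_offset_x = thread_id_x * vector_size and step_x = num_threads_x (the StridedLinearIndexingX case), neighbouring threads read neighbouring vector_size-wide chunks, so LLVM can emit vector loads while the warp's accesses stay coalesced. A host-side sketch of the offsets one thread visits:

#include <cstdint>
#include <vector>

// Offsets within a tile row visited by one thread, mirroring the unrolled
// inner-tile loop: x_num_steps / vector_size groups, vector_size consecutive
// elements per group, groups separated by step_x * vector_size.
std::vector<int64_t> ThreadOffsets(int64_t start_offset_x, int64_t x_num_steps,
                                   int64_t step_x, int64_t vector_size) {
  std::vector<int64_t> offsets;
  for (int64_t j = 0; j < x_num_steps / vector_size; ++j) {
    for (int64_t i = 0; i < vector_size; ++i) {
      offsets.push_back(start_offset_x + j * step_x * vector_size + i);
    }
  }
  return offsets;
}

// Example with num_threads_x = 32, x_num_steps = 8, vector_size = 2 and
// start_offset_x = thread_id_x * vector_size:
//   thread 0 visits 0,1, 64,65, 128,129, 192,193
//   thread 1 visits 2,3, 66,67, 130,131, 194,195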
- ksl->For(loop_name + "_y_in_tile", - /*start=*/constant(0), - /*end=*/ - ceil_of_ratio(b_.CreateSub(tile_height, thread_id_info.thread_id_y), - num_threads_y), - /*step=*/constant(1), [&](llvm::Value* y_indvar) { - llvm::Value* y_loc = - b_.CreateAdd(thread_id_info.thread_id_y, - b_.CreateMul(y_indvar, num_threads_y)); - for (int64 j = 0; j < x_num_steps; j++) { - llvm::Value* x_loc = - b_.CreateAdd(constant(j * step_x), start_offset_x, "x_loc"); - IrArray::Index source_idx_x = - source_idx.AddOffsetToDim(y_loc, kDimY, &b_) - .AddOffsetToDim(constant(j * step_x), kDimX, &b_); - auto emit_element = [&] { - return emit_elem_function(source_idx_x, y_loc, x_loc, j); - }; - if (!x_tile_fits) { - ksl->If(loop_name + "_x_in_tile", - b_.CreateICmpULT(x_loc, tile_width), emit_element); - } else { - emit_element(); - } - } - }); + ksl->For( + loop_name + "_y_in_tile", + /*start=*/constant(0), + /*end=*/ + ceil_of_ratio(b_.CreateSub(tile_height, thread_id_info.thread_id_y), + num_threads_y), + /*step=*/constant(1), [&](llvm::Value* y_indvar) { + llvm::Value* y_loc = b_.CreateAdd( + thread_id_info.thread_id_y, b_.CreateMul(y_indvar, num_threads_y)); + auto unroll_inner_tile_loop = [&](bool check_x_tile_bounds) { + return UnrollInnerTileLoop(check_x_tile_bounds, x_num_steps, step_x, + vector_size, loop_name, ksl, + start_offset_x, y_loc, tile_width, + source_idx, &b_, &emit_elem_function); + }; + + // Only take this path when we unroll in a way vectorizable by + // LLVM. Special case when the tile doesn't fit completely for even + // row size. For odd row size every other row isn't aligned to the + // vectorized size, so it can't be vectorized by LLVM. + if (!x_tile_fits && + mapping_scheme.GetIndexingOrder() == kStridedLinearIndexingX) { + ksl->If( + loop_name + "_is_full_tile", + // For the last block, tile_width will be the number of + // elements left. + b_.CreateICmpEQ(constant(mapping_scheme.GetTileSizeX()), + tile_width), + [&] { unroll_inner_tile_loop(/*check_x_tile_bounds=*/false); }, + [&] { unroll_inner_tile_loop(/*check_x_tile_bounds=*/true); }); + } else { + unroll_inner_tile_loop(/*check_x_tile_bounds=*/!x_tile_fits); + } + }); } // Emits code to process a tensor element in a tile for the given kCopy HLO that @@ -2035,6 +2099,19 @@ static IrArray::Index GetUnnormalizedIndex( const Shape& unnormalized_shape, llvm::IRBuilder<>* b_, const KernelMappingScheme& kernel_mapping_scheme) { DCHECK_EQ(normalized_shape_index.size(), 3); + // If the normalization only add a new dimensions of size 1, + // generate simpler indexing. LLVM doesn't always simplify the more + // complicated indexing and this prevents it from vectorizing some + // cases. We do this only for major_to_minor memory layout. 
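The rewritten EmitTile below dispatches on an "is_full_tile" predicate: when tile_width equals the static tile size the unrolled body runs without per-element bounds checks (the form LLVM can vectorize), and only the last, partial block keeps the checks. The same split as a hedged host-side sketch:

#include <cstdint>

// Process `tile_width` elements of a row. Full tiles take the branch with no
// per-element bounds check; only the final partial tile pays for the check.
template <typename F>
void EmitRow(int64_t tile_size_x, int64_t tile_width, F&& emit_element) {
  auto body = [&](bool check_bounds) {
    for (int64_t x = 0; x < tile_size_x; ++x) {
      if (!check_bounds || x < tile_width) {
        emit_element(x);
      }
    }
  };
  if (tile_width == tile_size_x) {
    body(/*check_bounds=*/false);
  } else {
    body(/*check_bounds=*/true);
  }
}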
+ if (unnormalized_shape.rank() == 2 && unnormalized_shape.has_layout() && + unnormalized_shape.dimensions()[0] == normalized_shape_index.dims()[1] && + unnormalized_shape.dimensions()[1] == normalized_shape_index.dims()[2] && + unnormalized_shape.layout().minor_to_major(1) == 0) { + CHECK_EQ(normalized_shape_index.dims()[0], 1); + auto multidim = normalized_shape_index.multidim(); + return IrArray::Index({multidim[1], multidim[2]}, unnormalized_shape, + normalized_shape_index.GetType()); + } llvm::Value* linear = normalized_shape_index.Linearize( kernel_mapping_scheme.GetDimsInElems(), b_); return IrArray::Index(linear, unnormalized_shape, b_); @@ -2077,21 +2154,6 @@ void IrEmitterUnnested::EmitTileElementForFusion( } } -// Gets the number of partial results accumulated by a single thread performing -// reduction. -static int GetNumberOfPartialResults( - const ReductionCodegenInfo& reduction_info) { - const KernelMappingScheme& mapping_scheme = - reduction_info.GetKernelMappingScheme(); - if (reduction_info.IsRowReduction()) { - return 1; - } - int64 num_partial_results = mapping_scheme.DilatedX() ? 1 : 2; - CHECK_EQ(num_partial_results, - (mapping_scheme.GetTileSizeX() / mapping_scheme.GetNumThreadsX())); - return num_partial_results; -} - void IrEmitterUnnested::EmitPrologueForReduction( HloInstruction* unnested_hlo, ReductionCodegenInfo* reduction_info, absl::Span reduce_instructions, @@ -2118,7 +2180,7 @@ void IrEmitterUnnested::EmitPrologueForReduction( llvm::AllocaInst* reduction_input_address = Alloca(element_type); reduction_input_addresses->push_back(reduction_input_address); - int num_partial_results = GetNumberOfPartialResults(*reduction_info); + int num_partial_results = reduction_info->GetNumPartialResults(); AddressVector* partial_result_addresses = reduction_info->GetMutablePartialResultAddresses(); llvm::AllocaInst* partial_result_address = @@ -2270,7 +2332,7 @@ void IrEmitterUnnested::EmitEpilogueForReduction( absl::Span partial_result_addresses = reduction_info.GetPartialResultAddresses(); - int num_partial_results = GetNumberOfPartialResults(reduction_info); + int num_partial_results = reduction_info.GetNumPartialResults(); // Emit an atomic operation that accumulates the partial reduction to the // output element. For row reduction, this is only for lane 0 due to the @@ -2484,7 +2546,7 @@ void IrEmitterUnnested::EmitTileElementForReduction( // GetElementPointer with array types. This enables the vectorization of // the computation for different partial results. Use this index if // 'num_partial_results > 1'. 
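GetUnnormalizedIndex above now short-circuits the common case where normalization only prepended a unit dimension: instead of linearizing the [1, d0, d1] index and delinearizing it against the rank-2 shape, it reuses the last two coordinates directly, which keeps the address arithmetic simple enough for LLVM to vectorize. A sketch of that mapping (row-major fallback shown for illustration; the real check also inspects minor_to_major):

#include <array>
#include <cstdint>

// Map a normalized [1, d0, d1] coordinate back to a rank-2 [d0, d1] shape.
// Fast path: when the only change was the added leading unit dimension,
// simply drop it; otherwise linearize and delinearize.
std::array<int64_t, 2> UnnormalizedIndex(
    const std::array<int64_t, 3>& idx,
    const std::array<int64_t, 3>& norm_dims,
    const std::array<int64_t, 2>& dims) {
  if (norm_dims[0] == 1 && norm_dims[1] == dims[0] && norm_dims[2] == dims[1]) {
    return {idx[1], idx[2]};  // same coordinates, cheaper IR
  }
  int64_t linear = (idx[0] * norm_dims[1] + idx[1]) * norm_dims[2] + idx[2];
  return {linear / dims[1], linear % dims[1]};
}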
- int num_partial_results = GetNumberOfPartialResults(reduction_info); + int num_partial_results = reduction_info.GetNumPartialResults(); auto index_without_linear = IrArray::Index( input_index.multidim(), reduction_operand_shape, input_index.GetType()); @@ -2670,7 +2732,9 @@ void IrEmitterUnnested::EmitHlo021Tile( /*tile_sizes=*/{1, kWarpSize, kWarpSize}, /*num_threads_y=*/kNumRows, /*num_threads_x=*/kWarpSize, - /*is_dilated_x=*/false); + /*indexing_order=*/kLinearIndexingX, + /*vector_size=*/1, + /*is_row_contiguous=*/false); LaunchDimensions launch_dimensions(mapping_scheme.GetNumberOfBlocks(), mapping_scheme.GetThreadsPerBlock()); llvm::Type* index_type = @@ -3111,15 +3175,6 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( std::array reduction_tiling = GetReductionTiling(reduction_dimensions, smallest_input_dtype_bits, &ir_emitter_context_->device_description()); - bool dilated_x = - reduction_dimensions.is_row_reduction || - !IsUnrollingColumnReductionBeneficial(unnested_hlo, input_shape, - reduction_dimensions.dimensions[2]); - - if (!dilated_x && !reduction_dimensions.is_row_reduction) { - // Vectorized loads: a single thread reduces two adjacent columns. - reduction_tiling[2] *= 2; - } int64 num_threads_y = reduction_dimensions.is_row_reduction ? 1 : kWarpSize; int64 num_threads_x = [&] { @@ -3133,12 +3188,54 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( return kWarpSize; }(); + bool tile_fit = reduction_dimensions.dimensions[kDimX] % + (reduction_tiling[2] * num_threads_x) == + 0; + + int cc_major = 0, cc_minor = 0; + ir_emitter_context_->device_description().cuda_compute_capability(&cc_major, + &cc_minor); + + int num_partial_results = 1; + KernelMappingScheme::IndexingOrder indexing_order = [&]() { + if (reduction_dimensions.is_row_reduction && + // P100, only try to vectorize+coales memory access when the + // tile size fits exactly and dtypes <= 32 bits + ((cc_major == 6 && smallest_input_dtype_bits <= 32 && tile_fit) || + // On V100, only try to vectorize+coales memory access for + // rows of even size. For odd row sizes, every other row + // isn't aligned, so it can't be vectorized. + (cc_major >= 7 && reduction_dimensions.dimensions[2] % 2 == 0))) { + return kStridedLinearIndexingX; + } else if (!reduction_dimensions.is_row_reduction && + IsUnrollingColumnReductionBeneficial( + unnested_hlo, input_shape, + reduction_dimensions.dimensions[2])) { + num_partial_results = 2; + reduction_tiling[2] *= num_partial_results; + return kLinearIndexingX; + } else { + return kStridedIndexingX; + } + }(); + + int vector_size = 1; + if (indexing_order == kStridedLinearIndexingX) { + if (reduction_dimensions.dimensions[2] % 2 == 0 && + // Assuming XLA will perform the unrolling and LLVM will vectorize, + // disable the unroll for the cases that LLVM doesn't vectorize. 
+ !MayPreventVectorization(*unnested_hlo)) { + vector_size = 2; + } else { + indexing_order = kStridedIndexingX; + } + } KernelMappingScheme mapping_scheme( reduction_dimensions.dimensions, {reduction_tiling[0], reduction_tiling[1] * num_threads_y, reduction_tiling[2] * num_threads_x}, - num_threads_y, num_threads_x, dilated_x); - return ReductionCodegenInfo(mapping_scheme, + num_threads_y, num_threads_x, indexing_order, vector_size); + return ReductionCodegenInfo(mapping_scheme, num_partial_results, reduction_dimensions.is_row_reduction); } @@ -3354,9 +3451,8 @@ void IrEmitterUnnested::EmitElementForInputFusibleSlices( GetIrArray(*unnested_hlo, *unnested_hlo, shape_index); IrArray::Index slice_dst_index(dst_multidim, slice->shape(), index.GetType()); - llvm::Value* dst_addr = src_ir_array.EmitArrayElementAddress( - slice_dst_index, &b_, "slice.dest"); - b_.CreateStore(input_ir_values[i], dst_addr); + src_ir_array.EmitWriteArrayElement(slice_dst_index, input_ir_values[i], + &b_); }; ksl.If(StrCat("slice", i), guarding_cond, emit_slice_elem_func); diff --git a/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h b/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h index eeab8d4dc80..d5c4ecbc795 100644 --- a/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h +++ b/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h @@ -76,19 +76,34 @@ namespace gpu { class KernelMappingScheme { public: enum { DimZ = 0, DimY, DimX, DimTot }; + enum IndexingOrder { + // Thread reads consecutive elements. + LinearIndexingX, + // Thread reads strided elements while keeping memory coalescing. + StridedIndexingX, + // Thread reads a few consecutive elements then take a strided + // step. This can trigger vectorized reads and keep memory + // coalescing. + StridedLinearIndexingX + }; + KernelMappingScheme(absl::Span dims_in_elems, absl::Span tile_sizes, int64 num_threads_y, - int64 num_threads_x, bool is_dilated_x) + int64 num_threads_x, IndexingOrder indexing_order, + int vector_size, bool is_row_contiguous = false) : dims_in_elems_{dims_in_elems[0], dims_in_elems[1], dims_in_elems[2]}, tile_sizes_{tile_sizes[0], tile_sizes[1], tile_sizes[2]}, num_threads_x_(num_threads_x), num_threads_y_(num_threads_y), - dilated_x_(is_dilated_x) { + indexing_order_(indexing_order), + vector_size_(vector_size), + is_row_contiguous_(is_row_contiguous) { CHECK_EQ(tile_sizes[1] % num_threads_y_, 0); CHECK_EQ(tile_sizes[2] % num_threads_x_, 0); VLOG(10) << "dims_in_elems_ = " << absl::StrJoin(dims_in_elems_, ","); - if (!dilated_x_) { - // dilated_x_=false is for the purpose of vectorization, which requires + if (indexing_order != LinearIndexingX) { + // StridedIndexingX, and StridedLinearIndexingX + // is for the purpose of vectorization, which requires // GetTileSizeFor(DimX) to be a multiplier of num_threads_x_. CHECK_EQ(GetTileSizeFor(DimX) % num_threads_x_, 0); } @@ -118,7 +133,9 @@ class KernelMappingScheme { return GetNumThreadsX() * GetNumThreadsY(); } - bool DilatedX() const { return dilated_x_; } + IndexingOrder GetIndexingOrder() const { return indexing_order_; } + int GetVectorSize() const { return vector_size_; } + bool GetRowContiguous() const { return is_row_contiguous_; } private: // The number of elements in each dimension. @@ -133,12 +150,18 @@ class KernelMappingScheme { // Number of threads used to process elements in the Y direction of a tile. 
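ComputeReductionCodegenInfo now derives the indexing order and vector size from the reduction kind, the compute capability, the dtype width, whether the tile divides the row exactly, and whether the row length is even; the vectorized order is abandoned again when the fused computation contains ops that block LLVM vectorization. A condensed sketch of that decision table (a summary of the conditions in this diff, not a stable contract):

#include <cstdint>

enum class IndexingOrder { kLinearX, kStridedX, kStridedLinearX };

struct ReductionChoice {
  IndexingOrder order;
  int vector_size;
  int num_partial_results;
};

ReductionChoice ChooseIndexing(bool is_row_reduction, int cc_major,
                               int smallest_dtype_bits, bool tile_fits,
                               int64_t row_size,
                               bool unroll_columns_is_beneficial,
                               bool may_prevent_vectorization) {
  ReductionChoice c{IndexingOrder::kStridedX, /*vector_size=*/1,
                    /*num_partial_results=*/1};
  if (is_row_reduction &&
      ((cc_major == 6 && smallest_dtype_bits <= 32 && tile_fits) ||  // P100
       (cc_major >= 7 && row_size % 2 == 0))) {                      // V100+
    c.order = IndexingOrder::kStridedLinearX;
  } else if (!is_row_reduction && unroll_columns_is_beneficial) {
    c.order = IndexingOrder::kLinearX;
    c.num_partial_results = 2;  // one thread reduces two adjacent columns
  }
  if (c.order == IndexingOrder::kStridedLinearX) {
    if (row_size % 2 == 0 && !may_prevent_vectorization) {
      c.vector_size = 2;
    } else {
      c.order = IndexingOrder::kStridedX;  // unrolling would not vectorize
    }
  }
  return c;
}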
const int64 num_threads_y_; - // When num_threads_x threads process a total of tile_size_x elements in the - // X dimension of a tile, each threads process n=tile_size_x/num_threads_x - // elements. When dilated_x=false, the n elements processed by a thread are - // contiguous. On the other hand, when dilated_x=true the n elements are - // dilated by a factor of num_threads_x. - const bool dilated_x_; + // When num_threads_x threads process a total of tile_size_x + // elements in the X dimension of a tile, each threads process + // n=tile_size_x/num_threads_x elements. + // indexing_order defines which tile's elements each thread reads. + const IndexingOrder indexing_order_; + + // vector_size_ only supported for row reduction and must be a divisor + // of tile_sizes_[2]/num_threads_x. Interesting values are 2 and 4 + // to trigger vectorized loads on GPUs while keeping memory + // coalescing. + const int vector_size_; + const bool is_row_contiguous_; }; // Information to support the code generation for a tiled reduction kernel. @@ -146,8 +169,15 @@ using AddressVector = absl::InlinedVector; class ReductionCodegenInfo { public: explicit ReductionCodegenInfo(KernelMappingScheme mapping_scheme, - bool is_row_reduction) - : mapping_scheme_(mapping_scheme), is_row_reduction_(is_row_reduction) {} + int num_partial_results, bool is_row_reduction) + : mapping_scheme_(mapping_scheme), + num_partial_results_(num_partial_results), + is_row_reduction_(is_row_reduction) { + if (num_partial_results > 1) { + CHECK_EQ(num_partial_results, (mapping_scheme.GetTileSizeX() / + mapping_scheme.GetNumThreadsX())); + } + } const KernelMappingScheme& GetKernelMappingScheme() const { return mapping_scheme_; @@ -183,6 +213,7 @@ class ReductionCodegenInfo { return reduction_input_addresses_; } + int GetNumPartialResults() const { return num_partial_results_; } bool IsRowReduction() const { return is_row_reduction_; } // Gets a pointer to a mutable shared cache used by reduction. @@ -201,6 +232,7 @@ class ReductionCodegenInfo { const KernelMappingScheme mapping_scheme_; AddressVector partial_result_addresses_; AddressVector reduction_input_addresses_; + int num_partial_results_; bool is_row_reduction_; }; diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc index 060a0375271..497dcda4361 100644 --- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc +++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc @@ -689,7 +689,7 @@ std::unique_ptr AMDGPUGetTargetMachine( llvm::Triple target_triple, int amdgpu_version, const HloModuleConfig& hlo_module_config) { return GetTargetMachine(target_triple, absl::StrCat("gfx", amdgpu_version), - hlo_module_config, "-code-object-v3"); + hlo_module_config, "+code-object-v3"); } void AMDGPUBackendInit(const HloModuleConfig& hlo_module_config) { diff --git a/tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.cc b/tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.cc index 2d255d76746..aff9e6f162b 100644 --- a/tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.cc @@ -16,6 +16,7 @@ limitations under the License. 
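The extended KernelMappingScheme and ReductionCodegenInfo above carry a few invariants: tile sizes divide evenly by the thread counts, vector_size divides the per-thread element count in X for the strided orders, and when more than one partial result per thread is used it must equal tile_size_x / num_threads_x. A compact stand-alone sketch of those checks, with assert in place of CHECK_EQ:

#include <cassert>
#include <cstdint>

enum class IndexingOrder { kLinearX, kStridedX, kStridedLinearX };

struct MappingScheme {
  int64_t tile_size_y, tile_size_x;
  int64_t num_threads_y, num_threads_x;
  IndexingOrder indexing_order;
  int vector_size;

  void CheckInvariants(int num_partial_results) const {
    // Tiles must split evenly across the thread block in both directions.
    assert(tile_size_y % num_threads_y == 0);
    assert(tile_size_x % num_threads_x == 0);
    // vector_size must divide the number of X elements each thread owns;
    // it only matters for the strided (vectorizing) orders.
    if (indexing_order != IndexingOrder::kLinearX) {
      assert((tile_size_x / num_threads_x) % vector_size == 0);
    }
    // With >1 partial results, a thread owns exactly that many columns.
    if (num_partial_results > 1) {
      assert(num_partial_results == tile_size_x / num_threads_x);
    }
  }
};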
#include "tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.h" #include // NOLINT (required by TF interfaces) +#include #include #include #include @@ -85,6 +86,11 @@ namespace { using tensorflow::BlockingCounter; +bool IsGlobalNcclConfig() { + static bool global_nccl_config = std::getenv("NCCL_COMM_ID") != nullptr; + return global_nccl_config; +} + // Functions to translate an ncclResult_t/cudaError_t to a Status object. Used // by the macros below. Status TranslateStatus(ncclResult_t s, const char* file, int64 line, @@ -285,7 +291,6 @@ class NcclClique { std::vector raw_comms(local_device_ordinals_.size(), nullptr); TF_ASSIGN_OR_RETURN(const absl::optional& nccl_id_string, maybe_nccl_unique_id); - ncclUniqueId nccl_id; if (nccl_id_string) { TF_RETURN_IF_ERROR(StringToNcclUniqueId(*nccl_id_string, &nccl_id)); @@ -416,10 +421,12 @@ RendezvousNcclAllReduce::SubmitParticipantImpl( nccl_unique_id = (*participant.nccl_unique_id_callback)(clique_key); } else { if (participant.rendezvous_key.global_devices.size() != - participant.rendezvous_key.num_local_participants) { + participant.rendezvous_key.num_local_participants && + !IsGlobalNcclConfig()) { nccl_unique_id = InvalidArgument( - "Multihost AllReduce on GPU requires a nccl_unique_id_callback " - "to be provided by the client."); + "If not local devices are taking part of a collective API on " + "GPU, the nccl_unique_id_callback must be provided by the " + "client."); } else { nccl_unique_id = absl::optional(); } @@ -568,6 +575,13 @@ Status NcclAllReduceThunk::ExecuteOnStream(const ExecuteParams& params) { std::vector global_participating_replicas, GetParticipatingReplicas(global_device_id, instr->replica_groups(), replica_count_, *params.device_assn)); + if (IsGlobalNcclConfig() && + global_participating_replicas.size() != replica_count_) { + return InvalidArgument( + "Partial replica groups are not allowed when using NCCL_COMM_ID " + "environment configuration."); + } + std::vector global_devices; std::vector> local_devices; local_devices.reserve(global_participating_replicas.size()); diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc index d905e56b66f..7ff8d40b440 100644 --- a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc +++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc @@ -141,6 +141,7 @@ Status NVPTXCompiler::OptimizeHloConvolutionCanonicalization( // bitcast. This leads to having to linearize and then delinearize the // index. options.set_replace_transpose_with_bitcast(false); + options.set_enable_conv_operand_swap(false); options.set_cudnn_batchnorm_forward_training_metadata( kCudnnBatchNormForwardTrainingCallTarget); pass.AddPass(options); @@ -382,7 +383,6 @@ std::vector NVPTXCompiler::CompileGpuAsmOrGetCachedResult( VLOG(2) << "Compiled PTX size:" << ptx.size() << " CUBIN size: " << cache_value->cubin_data.size(); } else { - bool log_warning = true; if (maybe_cubin.status().code() == tensorflow::error::Code::NOT_FOUND) { // Missing ptxas is expected in some environments where CUDA SDK @@ -392,15 +392,36 @@ std::vector NVPTXCompiler::CompileGpuAsmOrGetCachedResult( // TODO(jlebar): we should implement a LOG_FIRST_N and LOG_EVERY_N // for more general usage. static std::atomic warning_done(false); - log_warning = !warning_done.exchange(true); - } - if (log_warning) { - PrintCantFindCudaMessage( - "Can't find ptxas binary in ${CUDA_DIR}/bin. Will back to the " - "GPU driver for PTX -> sass compilation. 
This is OK so long " - "as you don't see a warning below about an out-of-date driver " - "version. Custom ptxas location can be specified using $PATH.", - hlo_module_config); + bool log_warning = !warning_done.exchange(true); + if (log_warning) { + PrintCantFindCudaMessage( + "Can't find ptxas binary in ${CUDA_DIR}/bin. Will back to " + "the GPU driver for PTX -> sass compilation. This is OK so " + "long as you don't see a warning below about an out-of-date " + "driver version. Custom ptxas location can be specified " + "using $PATH.", + hlo_module_config); + } + CHECK(hlo_module_config.debug_options() + .xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found()) + << "There was an error when trying to compile ptx into sass " + "code. If you want to try falling back to the GPU driver to " + "jit compile ptx, you can use the flag " + "--xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found." + " Use at your own risk though, it has known drawbacks like " + "increased memory consumption."; + } else { + LOG(ERROR) << "Error during compilation of ptx to sass: " + << maybe_cubin.status(); + CHECK(hlo_module_config.debug_options() + .xla_gpu_unsafe_fallback_to_driver_on_ptxas_error()) + << "There was an error when trying to compile ptx into sass " + "code. Up until May 14 2020, XLA silently ignored such " + "errors and fell back to the GPU driver. This is likely to " + "trigger subtle runtime issues and is hence discouraged. " + "If you want to temporarily restore this behavior use the " + "flag --xla_gpu_unsafe_fallback_to_driver_on_ptxas_error " + "and file a bug in b/components/366096."; } // We're going to use the driver to JIT our PTX->SASS, so warn if diff --git a/tensorflow/compiler/xla/service/gpu/tests/BUILD b/tensorflow/compiler/xla/service/gpu/tests/BUILD index 1fd51c78988..7a9845d0f49 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/BUILD +++ b/tensorflow/compiler/xla/service/gpu/tests/BUILD @@ -164,6 +164,33 @@ tf_cc_test( ], ) +tf_cc_test( + name = "reduction_vectorization_test", + srcs = [ + "reduction_vectorization_test.cc", + ], + tags = tf_cuda_tests_tags() + ["no_rocm"], + deps = [ + ":gpu_codegen_test", + "//tensorflow/compiler/xla:debug_options_flags", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla/service:gpu_plugin", + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_module_config", + "//tensorflow/compiler/xla/service:hlo_parser", + "//tensorflow/compiler/xla/service/gpu:gemm_rewriter", + "//tensorflow/compiler/xla/service/gpu:gpu_executable", + "//tensorflow/compiler/xla/tests:filecheck", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:llvm_irgen_test_base", + "//tensorflow/core:lib", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/stream_executor/lib", + "@com_google_absl//absl/memory", + ], +) + tf_cc_test( name = "reduction_dimension_grouper_test", srcs = [ @@ -208,6 +235,20 @@ tf_cc_test( ], ) +tf_cc_test( + name = "gpu_copy_alone_test", + srcs = [ + "gpu_copy_alone_test.cc", + ], + tags = tf_cuda_tests_tags() + ["no_rocm"], + deps = [ + ":gpu_codegen_test", + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_module_config", + "//tensorflow/core:test_main", + ], +) + tf_cc_test( name = "gpu_ftz_test", srcs = ["gpu_ftz_test.cc"], diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_copy_alone_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_copy_alone_test.cc new file mode 100644 index 
00000000000..1c475ab4e10 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_copy_alone_test.cc @@ -0,0 +1,61 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include + +#include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_module_config.h" + +namespace xla { +namespace gpu { + +namespace { + +// WARNING: This tests must be alone in its file! Otherwise, the +// error isn't caught. We expect and CUDA_ERROR_ILLEGAL_ADDRESS to be +// thrown with the old buggy code. +class CopyAloneNoOptTest : public GpuCodegenTest { + DebugOptions GetDebugOptionsForTest() override { + DebugOptions debug_options = GpuCodegenTest::GetDebugOptionsForTest(); + // The test MultiOutputStore contain a MOF fusion and XLA optimizer pass + // doesn't like this. + debug_options.set_xla_disable_all_hlo_passes(true); + return debug_options; + } +}; + +TEST_F(CopyAloneNoOptTest, CopyTranspose) { + const char* hlo_text = R"( +HloModule mod +ENTRY main { + %param = f32[8,32,32,32,16]{4,3,2,1,0} parameter(0) + ROOT %copy = f32[8,32,32,32,16]{3,2,1,4,0} copy(f32[8,32,32,32,16]{4,3,2,1,0} %param) +} +)"; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr optimized_module, + ParseAndReturnVerifiedModule(hlo_text)); + + EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-5, 1e-5})); + + CompileAndOptionallyVerifyPtx(std::move(optimized_module), + R"( +CHECK-NOT: ld.global.nc.v2 +)"); +} + +} // namespace +} // namespace gpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_noalias_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_noalias_test.cc index ca0a78034d7..38ff2da7161 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/gpu_noalias_test.cc +++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_noalias_test.cc @@ -58,7 +58,7 @@ TEST_F(GpuNoAliasTest, Concat) { ; CHECK: load float, float* %[[y_gep]], {{.*}}, !noalias ![[param_noalias]] ; CHECK: %[[result_ptr:.*]] = bitcast [2 x [6 x float]]* %fusion{{.*}} to float* ; CHECK: %[[result_gep:.*]] = getelementptr inbounds float, float* %[[result_ptr]] -; CHECK: store float {{.*}}, float* %[[result_gep]], !alias.scope ![[param_noalias]] +; CHECK: store float {{.*}}, float* %[[result_gep]], align 4, !alias.scope ![[param_noalias]] ; CHECK: ![[param_noalias]] = !{![[retval_buffer:.*]]} )", /*match_optimized_ir=*/false); diff --git a/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc b/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc new file mode 100644 index 00000000000..abca1f0cf18 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc @@ -0,0 +1,360 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include + +#include "tensorflow/compiler/xla/service/gpu/gpu_executable.h" +#include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_module_config.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/tests/filecheck.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/stream_executor/lib/statusor.h" + +namespace xla { +namespace gpu { + +namespace { + +class ReductionVectorizationTest : public GpuCodegenTest {}; + +TEST_F(ReductionVectorizationTest, Power2) { + const char* hlo_text = R"( +HloModule ReducePower2 + +%max_ { + %x = f32[] parameter(0) + %y = f32[] parameter(1) + ROOT %maximum.7 = f32[] maximum(f32[] %x, f32[] %y) +} + +ENTRY %main { + %param_0 = f32[5,131072] parameter(0) + %constant.3 = f32[] constant(0) + ROOT %reduce.8 = f32[5] reduce(f32[5,131072] %param_0, f32[] %constant.3), dimensions={1}, to_apply=%max_ +} +)"; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr optimized_module, + ParseAndReturnVerifiedModule(hlo_text)); + se::StreamExecutor* executor = backend().default_stream_executor(); + int cc_major = 0, cc_minor = 0; + executor->GetDeviceDescription().cuda_compute_capability(&cc_major, + &cc_minor); + string expected_ptx; + if (cc_major >= 6) { + expected_ptx = R"( +CHECK: ld.global.nc.v2.f32 +CHECK: ld.global.nc.v2.f32 +CHECK: ld.global.nc.v2.f32 +CHECK: ld.global.nc.v2.f32 +)"; + } else { + expected_ptx = R"( +CHECK-NOT: ld.global.nc.v2.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +)"; + } + CompileAndOptionallyVerifyPtx(std::move(optimized_module), expected_ptx); + + EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-5, 1e-5})); +} + +TEST_F(ReductionVectorizationTest, TileFit) { + const char* hlo_text = R"( +HloModule ReduceTileFit + +%max_ { + %x = f32[] parameter(0) + %y = f32[] parameter(1) + ROOT %maximum.7 = f32[] maximum(f32[] %x, f32[] %y) +} + +ENTRY %main { + %param_0 = f32[5,122880] parameter(0) + %constant.3 = f32[] constant(0) + ROOT %reduce.8 = f32[5] reduce(f32[5,122880] %param_0, f32[] %constant.3), dimensions={1}, to_apply=%max_ +} +)"; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr optimized_module, + ParseAndReturnVerifiedModule(hlo_text)); + se::StreamExecutor* executor = backend().default_stream_executor(); + int cc_major = 0, cc_minor = 0; + executor->GetDeviceDescription().cuda_compute_capability(&cc_major, + &cc_minor); + string expected_ptx; + if (cc_major >= 6) { + expected_ptx = R"( +CHECK: ld.global.nc.v2.f32 +CHECK: ld.global.nc.v2.f32 +CHECK: 
ld.global.nc.v2.f32 +CHECK: ld.global.nc.v2.f32 +)"; + } else { + expected_ptx = R"( +CHECK-NOT: ld.global.nc.v2.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +)"; + } + CompileAndOptionallyVerifyPtx(std::move(optimized_module), expected_ptx); + + EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-5, 1e-5})); +} + +TEST_F(ReductionVectorizationTest, EvenColumns) { + const char* hlo_text = R"( +HloModule ReducePower2 + +%max_ { + %x = f32[] parameter(0) + %y = f32[] parameter(1) + ROOT %maximum.7 = f32[] maximum(f32[] %x, f32[] %y) +} + +ENTRY %main { + %param_0 = f32[5,131070] parameter(0) + %constant.3 = f32[] constant(0) + ROOT %reduce.8 = f32[5] reduce(f32[5,131070] %param_0, f32[] %constant.3), dimensions={1}, to_apply=%max_ +} +)"; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr optimized_module, + ParseAndReturnVerifiedModule(hlo_text)); + se::StreamExecutor* executor = backend().default_stream_executor(); + int cc_major = 0, cc_minor = 0; + executor->GetDeviceDescription().cuda_compute_capability(&cc_major, + &cc_minor); + string expected_ptx; + if (cc_major >= 7) { + expected_ptx = R"( +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.v2.f32 +CHECK: ld.global.nc.v2.f32 +CHECK: ld.global.nc.v2.f32 +CHECK-NOT: ld.global.nc.v2.f32 +// TODO: Make this a vectorized load +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +)"; + } else { + expected_ptx = R"( +CHECK-NOT: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +)"; + } + CompileAndOptionallyVerifyPtx(std::move(optimized_module), expected_ptx); + + EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-5, 1e-5})); +} + +TEST_F(ReductionVectorizationTest, DisabledOddColumns) { + const char* hlo_text = R"( +HloModule ReduceTileFit + +%max_ { + %x = f32[] parameter(0) + %y = f32[] parameter(1) + ROOT %maximum.7 = f32[] maximum(%x, %y) +} + +ENTRY %main { + %param_0 = f32[5,131071] parameter(0) + %constant.3 = f32[] constant(0) + ROOT %reduce.8 = f32[5] reduce(f32[5,131071] %param_0, f32[] %constant.3), dimensions={1}, to_apply=%max_ +} +)"; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr optimized_module, + ParseAndReturnVerifiedModule(hlo_text)); + CompileAndOptionallyVerifyPtx(std::move(optimized_module), + R"( +CHECK-NOT: ld.global.nc.v2.f32 +CHECK-NOT: ld.global.nc.v4.f32 +CHECK-NOT: ld.global.nc.u64 +CHECK-NOT: ld.global.u64 +)"); + + EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-5, 1e-5})); +} + +TEST_F(ReductionVectorizationTest, Exp) { + const char* hlo_text = R"( +HloModule DisableSin + +%add_float { + %x = f32[] parameter(0) + %y = f32[] parameter(1) + ROOT %add.17 = f32[] add(f32[] %x, f32[] %y) +} + +ENTRY %main { + %arg0.1 = f32[5,131072] parameter(0) + %sine = f32[5,131072] exponential(f32[5,131072] %arg0.1) + %constant.0 = f32[] constant(0) + ROOT %reduce.18 = f32[5] reduce(f32[5,131072] %sine, f32[] %constant.0), dimensions={1}, to_apply=%add_float +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr optimized_module, + ParseAndReturnVerifiedModule(hlo_text)); + se::StreamExecutor* executor = backend().default_stream_executor(); + int cc_major = 0, cc_minor = 0; + executor->GetDeviceDescription().cuda_compute_capability(&cc_major, + &cc_minor); + string expected_ptx; + if (cc_major >= 6) { + 
expected_ptx = R"( +CHECK: ld.global.nc.v2.f32 +CHECK: ld.global.nc.v2.f32 +CHECK: ld.global.nc.v2.f32 +CHECK: ld.global.nc.v2.f32 +)"; + } else { + expected_ptx = R"( +CHECK-NOT: ld.global.nc.v2.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +)"; + } + CompileAndOptionallyVerifyPtx(std::move(optimized_module), expected_ptx); + + EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-5, 1e-5})); +} + +TEST_F(ReductionVectorizationTest, DisableSin) { + const char* hlo_text = R"( +HloModule DisableSin + +%add_float { + %x = f32[] parameter(0) + %y = f32[] parameter(1) + ROOT %add.17 = f32[] add(f32[] %x, f32[] %y) +} + +ENTRY %main { + %arg0.1 = f32[5,131072] parameter(0) + %sine = f32[5,131072] sine(f32[5,131072] %arg0.1) + %constant.0 = f32[] constant(0) + ROOT %reduce.18 = f32[5] reduce(f32[5,131072] %sine, f32[] %constant.0), dimensions={1}, to_apply=%add_float +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr optimized_module, + ParseAndReturnVerifiedModule(hlo_text)); + CompileAndOptionallyVerifyPtx(std::move(optimized_module), + R"( +CHECK-NOT: ld.global.nc.v2.f32 +CHECK-NOT: ld.global.nc.v4.f32 +CHECK-NOT: ld.global.nc.u64 +CHECK-NOT: ld.global.u64 +)"); + + EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-5, 1e-5})); +} + +class ReductionVectorizationNoOptTest : public GpuCodegenTest { + DebugOptions GetDebugOptionsForTest() override { + DebugOptions debug_options = GpuCodegenTest::GetDebugOptionsForTest(); + // The test MultiOutputStore contain a MOF fusion and XLA optimizer pass + // doesn't like this. + debug_options.set_xla_disable_all_hlo_passes(true); + return debug_options; + } +}; + +TEST_F(ReductionVectorizationNoOptTest, MultiOutputStore) { + const char* hlo_text = R"( +HloModule MultiOutputStore + +%add_f32 { + %x = f32[] parameter(0) + %y = f32[] parameter(1) + ROOT %add = f32[] add(%x, %y) +} + +%fused_computation { + %param_0 = f32[2,384,1024] parameter(0) + %param_1 = f32[2,384] parameter(1) + %constant0 = f32[] constant(0.0009765625) + %broadcast0 = f32[2,384] broadcast(%constant0), dimensions={} + %multiply0 = f32[2,384] multiply(%param_1, %broadcast0) + %broadcast1 = f32[2,384,1024] broadcast(%multiply0), dimensions={0,1} + %subtract = f32[2,384,1024] subtract(%param_0, %broadcast1) + %multiply1 = f32[2,384,1024] multiply(%subtract, %subtract) + %constant1 = f32[] constant(0) + %reduce = f32[2,384] reduce(%multiply1, %constant1), dimensions={2}, to_apply=%add_f32 + ROOT %tuple = (f32[2,384], f32[2,384,1024], f32[2,384,1024]) tuple(%reduce, %subtract, %broadcast1) +} + +ENTRY %cluster { + %param0 = f32[2,384,1024] parameter(0) + %param1 = f32[2,384] parameter(1) + ROOT %fusion = (f32[2,384], f32[2,384,1024], f32[2,384,1024]) fusion(%param0, %param1), kind=kInput, calls=%fused_computation +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr optimized_module, + ParseAndReturnVerifiedModule(hlo_text)); + CompileAndOptionallyVerifyPtx(std::move(optimized_module), + R"( +CHECK: ld.global.nc.v2.f32 +CHECK: st.global.v2.f32 +CHECK: st.global.v2.f32 +CHECK: ld.global.nc.v2.f32 +CHECK: st.global.v2.f32 +CHECK: st.global.v2.f32 +CHECK: ld.global.nc.v2.f32 +CHECK: st.global.v2.f32 +CHECK: st.global.v2.f32 +CHECK: ld.global.nc.v2.f32 +CHECK: st.global.v2.f32 +CHECK: st.global.v2.f32 +)"); + + EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-5, 1e-5})); +} + +} // namespace +} // namespace gpu +} // namespace xla diff --git 
a/tensorflow/compiler/xla/service/hlo.proto b/tensorflow/compiler/xla/service/hlo.proto index c4911df150f..134c8953b15 100644 --- a/tensorflow/compiler/xla/service/hlo.proto +++ b/tensorflow/compiler/xla/service/hlo.proto @@ -171,7 +171,7 @@ message HloInstructionProto { xla.OpSharding sharding = 40; // Backend configuration for the instruction. Has backend-specific meaning. - string backend_config = 43; + bytes backend_config = 43; // Cross replica op fields. repeated ReplicaGroup replica_groups = 49; diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc index 94a4df43cf4..32a9038b15a 100644 --- a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc +++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc @@ -707,6 +707,10 @@ Status HloCostAnalysis::HandleCholesky(const HloInstruction* hlo) { return Status::OK(); } +Status HloCostAnalysis::HandleAllGather(const HloInstruction* hlo) { + return Status::OK(); +} + Status HloCostAnalysis::HandleAllReduce(const HloInstruction* crs) { // We assume 2 replicas, so that each output element is the sum of two input // elements. diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.h b/tensorflow/compiler/xla/service/hlo_cost_analysis.h index 915c4dcbe84..9fdb42185fb 100644 --- a/tensorflow/compiler/xla/service/hlo_cost_analysis.h +++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.h @@ -76,6 +76,7 @@ class HloCostAnalysis : public ConstDfsHloVisitor { Status HandleFft(const HloInstruction* fft) override; Status HandleTriangularSolve(const HloInstruction* hlo) override; Status HandleCholesky(const HloInstruction* hlo) override; + Status HandleAllGather(const HloInstruction* hlo) override; Status HandleAllReduce(const HloInstruction* crs) override; Status HandleAllToAll(const HloInstruction* hlo) override; Status HandleCollectivePermute(const HloInstruction* hlo) override; diff --git a/tensorflow/compiler/xla/service/hlo_dce.cc b/tensorflow/compiler/xla/service/hlo_dce.cc index a573b621c88..900b557b4dc 100644 --- a/tensorflow/compiler/xla/service/hlo_dce.cc +++ b/tensorflow/compiler/xla/service/hlo_dce.cc @@ -47,15 +47,14 @@ StatusOr HloDCE::RunOnComputation( // computation's instruction while simultaneously removing instructions. 
std::vector dead_roots; for (auto* instruction : computation->instructions()) { + auto maybe_collective_op = DynCast(instruction); if (instruction != computation->root_instruction() && instruction->user_count() == 0 && computation->IsSafelyRemovable(instruction) && (!instruction->HasSideEffect() || (remove_cross_partition_collective_ops && - ((instruction->opcode() == HloOpcode::kAllReduce && - !Cast(instruction)->constrain_layout()) || - instruction->opcode() == HloOpcode::kCollectivePermute || - instruction->opcode() == HloOpcode::kAllToAll)))) { + (maybe_collective_op != nullptr && + !maybe_collective_op->constrain_layout())))) { dead_roots.push_back(instruction); } } diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc index db651d3c323..b04635dda03 100644 --- a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc +++ b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc @@ -4442,5 +4442,27 @@ TEST_F(HloEvaluatorTest, CopyStartCopyDone) { EXPECT_TRUE(LiteralTestUtil::Equal(expected, result)); } +TEST_F(HloEvaluatorTest, MapBF16) { + const absl::string_view hlo_text = R"( + HloModule test + + map_computation { + p = bf16[] parameter(0) + add = bf16[] add(p, p) + ROOT conv = f32[] convert(add) + } + + ENTRY CopyStartCopyDone { + c = bf16[3] constant({1, 2, 3}) + ROOT map = f32[3] map(c), to_apply=map_computation + } + )"; + TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text)); + Literal expected = LiteralUtil::CreateR1({2.f, 4.f, 6.f}); + TF_ASSERT_OK_AND_ASSIGN( + Literal result, HloEvaluator().Evaluate(*m_->entry_computation(), {})); + EXPECT_TRUE(LiteralTestUtil::Equal(expected, result)); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h index 6fa3f9fb34b..3dc9cc24734 100644 --- a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h +++ b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h @@ -700,6 +700,38 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { return Status::OK(); } + template < + typename NativeT, + typename std::enable_if::value>::type* = nullptr> + Status HandleCbrt(HloInstruction* cbrt) { + TF_ASSIGN_OR_RETURN( + parent_->evaluated_[cbrt], + ElementWiseUnaryOp(cbrt, [](ElementwiseT elem_operand) -> ElementwiseT { + return std::pow(elem_operand, static_cast(1.0 / 3.0)); + return elem_operand.real() < 0 + ? 
-std::pow(-elem_operand, + static_cast(1.0 / 3.0)) + : std::pow(elem_operand, + static_cast(1.0 / 3.0)); + })); + return Status::OK(); + } + + template < + typename NativeT, + typename std::enable_if::value>::type* = nullptr> + Status HandleCbrt(HloInstruction* cbrt) { + TF_ASSIGN_OR_RETURN(parent_->evaluated_[cbrt], + ElementWiseUnaryOp(cbrt, [](ElementwiseT elem_operand) { + return std::cbrt(elem_operand); + })); + return Status::OK(); + } + + Status HandleCbrt(HloInstruction* cbrt) override { + return HandleCbrt(cbrt); + } + Status HandleRsqrt(HloInstruction* rsqrt) override { TF_ASSIGN_OR_RETURN( parent_->evaluated_[rsqrt], @@ -1680,6 +1712,10 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { MapImpl(map)); break; } + case BF16: { + TF_ASSIGN_OR_RETURN(parent_->evaluated_[map], MapImpl(map)); + break; + } case F32: { TF_ASSIGN_OR_RETURN(parent_->evaluated_[map], MapImpl(map)); break; diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc index 78e4d39d3fe..cd2a61d7eff 100644 --- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc +++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc @@ -980,6 +980,7 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) { case HloOpcode::kSlice: case HloOpcode::kSort: case HloOpcode::kSqrt: + case HloOpcode::kCbrt: case HloOpcode::kSubtract: case HloOpcode::kTanh: // De-emphasize scalar-shaped elementwise ops -- they're generally @@ -1056,6 +1057,7 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) { case HloOpcode::kGetDimensionSize: case HloOpcode::kSetDimensionSize: return kGray; + case HloOpcode::kAllGather: case HloOpcode::kAllReduce: case HloOpcode::kAllToAll: case HloOpcode::kCollectivePermute: diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc index 22b74663087..9e9c8b0913b 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.cc +++ b/tensorflow/compiler/xla/service/hlo_instruction.cc @@ -388,6 +388,24 @@ StatusOr> HloInstruction::CreateFromProto( proto.outfeed_config()); break; } + case HloOpcode::kAllGather: { + absl::optional channel_id; + if (proto.channel_id() > 0) { + channel_id = proto.channel_id(); + } + + TF_RET_CHECK(proto.dimensions_size() == 1) + << "AllGather cannot have more than 1 all-gather dimensions"; + TF_RET_CHECK(all_operands().size() == 1) + << "AllGather must have a single operand"; + int64 all_gather_dimension = proto.dimensions(0); + instruction = CreateAllGather( + shape, operands(0), all_gather_dimension, + std::vector(proto.replica_groups().begin(), + proto.replica_groups().end()), + proto.constrain_layout(), channel_id, proto.use_global_device_ids()); + break; + } case HloOpcode::kAllReduce: { TF_RET_CHECK(proto.called_computation_ids_size() == 1) << "AllReduce should have 1 called computation but sees " @@ -430,6 +448,7 @@ StatusOr> HloInstruction::CreateFromProto( /*replica_groups=*/ std::vector(proto.replica_groups().begin(), proto.replica_groups().end()), + /*constrain_layout=*/proto.constrain_layout(), /*channel_id=*/channel_id, split_dimension); break; } @@ -806,6 +825,7 @@ HloInstruction::CreateRngBitGenerator(const Shape& shape, HloInstruction* state, case HloOpcode::kSign: case HloOpcode::kSin: case HloOpcode::kSqrt: + case HloOpcode::kCbrt: case HloOpcode::kTanh: break; default: @@ -927,6 +947,15 @@ HloInstruction::CreateReducePrecision(const Shape& shape, shape, operand, exponent_bits, 
mantissa_bits); } +/* static */ std::unique_ptr HloInstruction::CreateAllGather( + const Shape& shape, HloInstruction* operand, int64 all_gather_dimension, + const std::vector& replica_groups, bool constrain_layout, + const absl::optional& channel_id, bool use_global_device_ids) { + return absl::make_unique( + shape, operand, all_gather_dimension, replica_groups, constrain_layout, + channel_id, use_global_device_ids); +} + /* static */ std::unique_ptr HloInstruction::CreateAllReduce( const Shape& shape, absl::Span operands, HloComputation* reduce_computation, @@ -939,11 +968,12 @@ HloInstruction::CreateReducePrecision(const Shape& shape, /* static */ std::unique_ptr HloInstruction::CreateAllToAll( const Shape& shape, absl::Span operands, - const std::vector& replica_groups, + const std::vector& replica_groups, bool constrain_layout, const absl::optional& channel_id, const absl::optional& split_dimension) { return absl::make_unique( - shape, operands, replica_groups, channel_id, split_dimension); + shape, operands, replica_groups, constrain_layout, channel_id, + split_dimension); } /* static */ std::unique_ptr @@ -1375,6 +1405,8 @@ bool HloInstruction::HasSideEffectNoRecurse() const { case HloOpcode::kAllReduce: return channel_id().has_value() || Cast(this)->constrain_layout(); + case HloOpcode::kAllToAll: + return Cast(this)->constrain_layout(); case HloOpcode::kCustomCall: return Cast(this) ->custom_call_has_side_effect(); @@ -1513,6 +1545,7 @@ std::unique_ptr HloInstruction::CloneWithNewOperands( case HloOpcode::kParameter: case HloOpcode::kGetTupleElement: case HloOpcode::kReducePrecision: + case HloOpcode::kAllGather: case HloOpcode::kAllReduce: case HloOpcode::kAllToAll: case HloOpcode::kCollectivePermute: @@ -1561,6 +1594,7 @@ std::unique_ptr HloInstruction::CloneWithNewOperands( case HloOpcode::kSign: case HloOpcode::kSin: case HloOpcode::kSqrt: + case HloOpcode::kCbrt: case HloOpcode::kTanh: CHECK_EQ(new_operands.size(), 1); clone = CreateUnary(shape, opcode_, new_operands[0]); @@ -1933,6 +1967,7 @@ bool HloInstruction::IdenticalSlowPath( case HloOpcode::kSign: case HloOpcode::kSin: case HloOpcode::kSqrt: + case HloOpcode::kCbrt: case HloOpcode::kSubtract: case HloOpcode::kTanh: case HloOpcode::kTuple: @@ -1990,6 +2025,7 @@ bool HloInstruction::IdenticalSlowPath( case HloOpcode::kReducePrecision: case HloOpcode::kInfeed: case HloOpcode::kOutfeed: + case HloOpcode::kAllGather: case HloOpcode::kAllReduce: case HloOpcode::kAllToAll: case HloOpcode::kCollectivePermute: @@ -2377,6 +2413,7 @@ bool HloInstruction::IsElementwiseImpl( case HloOpcode::kSign: case HloOpcode::kSin: case HloOpcode::kSqrt: + case HloOpcode::kCbrt: case HloOpcode::kTanh: CHECK_EQ(1, operand_count()); return true; @@ -2843,6 +2880,8 @@ Status HloInstruction::Visit(DfsHloVisitorBase* visitor) { return visitor->HandleConvolution(this); case HloOpcode::kFft: return visitor->HandleFft(this); + case HloOpcode::kAllGather: + return visitor->HandleAllGather(this); case HloOpcode::kAllReduce: return visitor->HandleAllReduce(this); case HloOpcode::kAllToAll: @@ -2889,6 +2928,8 @@ Status HloInstruction::Visit(DfsHloVisitorBase* visitor) { return visitor->HandleSin(this); case HloOpcode::kSqrt: return visitor->HandleSqrt(this); + case HloOpcode::kCbrt: + return visitor->HandleCbrt(this); case HloOpcode::kRsqrt: return visitor->HandleRsqrt(this); case HloOpcode::kReal: @@ -3366,8 +3407,14 @@ string FrontendAttributesToString( std::vector> sorted_attributes( frontend_attributes.map().begin(), frontend_attributes.map().end()); 
absl::c_sort(sorted_attributes); - return absl::StrFormat( - "{%s}", absl::StrJoin(sorted_attributes, ",", absl::PairFormatter("="))); + // Frontend attribute is a comma-separated list of attribute="value" pairs, + // e.g., frontend_attributes={name="value_a",type="int32"}. + const auto formatter = [](string* out, + const std::pair& item) { + absl::StrAppend(out, item.first, "=\"", item.second, "\""); + }; + return absl::StrFormat("{%s}", + absl::StrJoin(sorted_attributes, ",", formatter)); } string PaddingConfigToString(const PaddingConfig& padding) { diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h index 98f2a20d505..8be7a034877 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.h +++ b/tensorflow/compiler/xla/service/hlo_instruction.h @@ -618,6 +618,16 @@ class HloInstruction { const Shape& shape, HloInstruction* operand, const int exponent_bits, const int mantissa_bits); + // Creates an all-gather op, which concats the operands of all participants + // along all_gather_dimension. The replica_groups, channel_id, and + // use_global_device_ids arguments are identical to those in all-reduce, + // except that the order of the group members determines the concatenation + // order of inputs from different participants. + static std::unique_ptr CreateAllGather( + const Shape& shape, HloInstruction* operand, int64 all_gather_dimension, + const std::vector& replica_groups, bool constrain_layout, + const absl::optional& channel_id, bool use_global_device_ids); + // Creates a cross replica reduction op. // // `reduction_computation`: the reduction function. @@ -667,7 +677,7 @@ class HloInstruction { // It is used to implement the higher-level instruction in XlaBuilder. static std::unique_ptr CreateAllToAll( const Shape& shape, absl::Span operands, - const std::vector& replica_groups, + const std::vector& replica_groups, bool constrain_layout, const absl::optional& channel_id, const absl::optional& split_dimension = absl::nullopt); @@ -1605,6 +1615,9 @@ class HloInstruction { virtual int64 dimensions(int64 index) const { LOG(FATAL) << "Unimplemented method."; } + virtual std::vector* mutable_dimensions() { + LOG(FATAL) << "Unimplemented method."; + } // Delegates to HloConcatenateInstruction::concatenate_dimension. 
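
A minimal usage sketch for the CreateAllGather factory declared above. The operand and output shapes are assumed, not taken from the patch: four replicas gathering an f32[128,32] operand along dimension 1 into f32[128,128], mirroring the parser test cases further down in this diff.

    #include "tensorflow/compiler/xla/service/hlo_instruction.h"
    #include "tensorflow/compiler/xla/shape_util.h"

    // Sketch: construct the new all-gather instruction (shapes are illustrative).
    std::unique_ptr<xla::HloInstruction> MakeAllGather(xla::HloInstruction* operand) {
      const xla::Shape output_shape =
          xla::ShapeUtil::MakeShape(xla::F32, {128, 128});
      return xla::HloInstruction::CreateAllGather(
          output_shape, operand, /*all_gather_dimension=*/1,
          /*replica_groups=*/{}, /*constrain_layout=*/false,
          /*channel_id=*/absl::nullopt, /*use_global_device_ids=*/false);
    }
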
int64 concatenate_dimension() const; diff --git a/tensorflow/compiler/xla/service/hlo_instructions.cc b/tensorflow/compiler/xla/service/hlo_instructions.cc index 3c2e90c202a..d5bdd674563 100644 --- a/tensorflow/compiler/xla/service/hlo_instructions.cc +++ b/tensorflow/compiler/xla/service/hlo_instructions.cc @@ -513,10 +513,11 @@ HloRecvDoneInstruction::CloneWithNewOperandsImpl( HloCollectiveInstruction::HloCollectiveInstruction( HloOpcode opcode, const Shape& shape, absl::Span operands, - const std::vector& replica_groups, + const std::vector& replica_groups, bool constrain_layout, const absl::optional& channel_id) : HloChannelInstruction(opcode, shape, channel_id), - replica_groups_(replica_groups) { + replica_groups_(replica_groups), + constrain_layout_(constrain_layout) { for (auto operand : operands) { AppendOperand(operand); } @@ -526,6 +527,7 @@ HloInstructionProto HloCollectiveInstruction::ToProto() const { HloInstructionProto proto = HloChannelInstruction::ToProto(); *proto.mutable_replica_groups() = {replica_groups_.begin(), replica_groups_.end()}; + proto.set_constrain_layout(constrain_layout_); return proto; } @@ -535,6 +537,9 @@ std::vector HloCollectiveInstruction::ExtraAttributesToStringImpl( HloChannelInstruction::ExtraAttributesToStringImpl(options); result.push_back( StrCat("replica_groups=", ReplicaGroupsToString(replica_groups()))); + if (constrain_layout_) { + result.push_back("constrain_layout=true"); + } return result; } @@ -551,14 +556,58 @@ bool HloCollectiveInstruction::IdenticalSlowPath( }); } +HloAllGatherInstruction::HloAllGatherInstruction( + const Shape& shape, HloInstruction* operand, int64 all_gather_dimension, + const std::vector& replica_groups, bool constrain_layout, + const absl::optional& channel_id, bool use_global_device_ids) + : HloCollectiveInstruction(HloOpcode::kAllGather, shape, {operand}, + replica_groups, constrain_layout, channel_id), + all_gather_dimension_(all_gather_dimension), + use_global_device_ids_(use_global_device_ids) {} + +std::vector HloAllGatherInstruction::ExtraAttributesToStringImpl( + const HloPrintOptions& options) const { + std::vector result = + HloCollectiveInstruction::ExtraAttributesToStringImpl(options); + result.push_back(StrCat("dimensions={", all_gather_dimension_, "}")); + if (use_global_device_ids_) { + result.push_back("use_global_device_ids=true"); + } + return result; +} + +std::unique_ptr +HloAllGatherInstruction::CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* /*context*/) const { + return absl::make_unique( + shape, new_operands[0], all_gather_dimension(), replica_groups(), + constrain_layout(), channel_id(), use_global_device_ids()); +} + +HloInstructionProto HloAllGatherInstruction::ToProto() const { + HloInstructionProto proto = HloCollectiveInstruction::ToProto(); + proto.add_dimensions(all_gather_dimension_); + return proto; +} + +bool HloAllGatherInstruction::IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const { + const auto& casted_other = static_cast(other); + return HloCollectiveInstruction::IdenticalSlowPath(other, eq_computations) && + all_gather_dimension_ == casted_other.all_gather_dimension() && + use_global_device_ids() == casted_other.use_global_device_ids(); +} + HloAllReduceInstruction::HloAllReduceInstruction( const Shape& shape, absl::Span operands, HloComputation* reduce_computation, const std::vector& replica_groups, bool constrain_layout, const absl::optional& channel_id, bool 
use_global_device_ids) : HloCollectiveInstruction(HloOpcode::kAllReduce, shape, operands, - replica_groups, channel_id), - constrain_layout_(constrain_layout), + replica_groups, constrain_layout, channel_id), use_global_device_ids_(use_global_device_ids) { AppendComputation(reduce_computation); } @@ -574,7 +623,6 @@ bool HloAllReduceInstruction::IsNoop() const { HloInstructionProto HloAllReduceInstruction::ToProto() const { HloInstructionProto proto = HloCollectiveInstruction::ToProto(); - proto.set_constrain_layout(constrain_layout_); proto.set_use_global_device_ids(use_global_device_ids_); return proto; } @@ -583,9 +631,6 @@ std::vector HloAllReduceInstruction::ExtraAttributesToStringImpl( const HloPrintOptions& options) const { std::vector result = HloCollectiveInstruction::ExtraAttributesToStringImpl(options); - if (constrain_layout_) { - result.push_back("constrain_layout=true"); - } if (use_global_device_ids_) { result.push_back("use_global_device_ids=true"); } @@ -614,11 +659,11 @@ HloAllReduceInstruction::CloneWithNewOperandsImpl( HloAllToAllInstruction::HloAllToAllInstruction( const Shape& shape, absl::Span operands, - const std::vector& replica_groups, + const std::vector& replica_groups, bool constrain_layout, const absl::optional& channel_id, const absl::optional& split_dimension) : HloCollectiveInstruction(HloOpcode::kAllToAll, shape, operands, - replica_groups, channel_id), + replica_groups, constrain_layout, channel_id), split_dimension_(split_dimension) {} std::unique_ptr @@ -626,7 +671,8 @@ HloAllToAllInstruction::CloneWithNewOperandsImpl( const Shape& shape, absl::Span new_operands, HloCloneContext* /*context*/) const { return absl::make_unique( - shape, new_operands, replica_groups(), channel_id(), split_dimension()); + shape, new_operands, replica_groups(), constrain_layout(), channel_id(), + split_dimension()); } HloInstructionProto HloAllToAllInstruction::ToProto() const { diff --git a/tensorflow/compiler/xla/service/hlo_instructions.h b/tensorflow/compiler/xla/service/hlo_instructions.h index 0cf8f7e6eb0..ae78d365cfa 100644 --- a/tensorflow/compiler/xla/service/hlo_instructions.h +++ b/tensorflow/compiler/xla/service/hlo_instructions.h @@ -313,37 +313,6 @@ class HloCollectiveInstruction : public HloChannelInstruction { return replica_groups_; } - protected: - explicit HloCollectiveInstruction( - HloOpcode opcode, const Shape& shape, - absl::Span operands, - const std::vector& replica_groups, - const absl::optional& channel_id); - - HloInstructionProto ToProto() const override; - - std::vector ExtraAttributesToStringImpl( - const HloPrintOptions& options) const override; - bool IdenticalSlowPath( - const HloInstruction& other, - const std::function& - eq_computations) const override; - - std::vector replica_groups_; -}; - -class HloAllReduceInstruction : public HloCollectiveInstruction { - public: - explicit HloAllReduceInstruction( - const Shape& shape, absl::Span operands, - HloComputation* reduce_computation, - const std::vector& replica_groups, bool constrain_layout, - const absl::optional& channel_id, bool use_global_device_ids); - - // Returns true if the AllReduce does no communication, so it's equivalent - // to a mem copy. - bool IsNoop() const; - // Returns true if the layout of the AllReduce is enforced by XLA client (as // the layout set in the shape). 
The only reason for the client to set the // layout is to separately compile computations that communicate with @@ -359,6 +328,70 @@ class HloAllReduceInstruction : public HloCollectiveInstruction { // unconstrained AllReduce instructions (checked by HloVerifier). bool constrain_layout() const { return constrain_layout_; } + protected: + explicit HloCollectiveInstruction( + HloOpcode opcode, const Shape& shape, + absl::Span operands, + const std::vector& replica_groups, bool constrain_layout, + const absl::optional& channel_id); + + HloInstructionProto ToProto() const override; + + std::vector ExtraAttributesToStringImpl( + const HloPrintOptions& options) const override; + bool IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const override; + + std::vector replica_groups_; + bool constrain_layout_; +}; + +class HloAllGatherInstruction : public HloCollectiveInstruction { + public: + explicit HloAllGatherInstruction( + const Shape& shape, HloInstruction* operand, int64 all_gather_dimension, + const std::vector& replica_groups, bool constrain_layout, + const absl::optional& channel_id, bool use_global_device_ids); + // Same as HloAllReduceInstruction::use_global_device_ids. + bool use_global_device_ids() const { return use_global_device_ids_; } + + // The dimension on which data from different participants are concatenated. + int64 all_gather_dimension() const { return all_gather_dimension_; } + + protected: + std::vector ExtraAttributesToStringImpl( + const HloPrintOptions& options) const override; + HloInstructionProto ToProto() const override; + + private: + bool IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const override; + + // Implementation for non-common logic of CloneWithNewOperands. + std::unique_ptr CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const override; + + int64 all_gather_dimension_; + bool use_global_device_ids_; +}; + +class HloAllReduceInstruction : public HloCollectiveInstruction { + public: + explicit HloAllReduceInstruction( + const Shape& shape, absl::Span operands, + HloComputation* reduce_computation, + const std::vector& replica_groups, bool constrain_layout, + const absl::optional& channel_id, bool use_global_device_ids); + + // Returns true if the AllReduce does no communication, so it's equivalent + // to a mem copy. + bool IsNoop() const; + // Returns true if the ids in the ReplicaGroup config represent a global id of // (replica_id * partition_count + partition_id) instead of a replica id. // This enables more flexible grouping of devices if this all-reduce is both @@ -387,7 +420,6 @@ class HloAllReduceInstruction : public HloCollectiveInstruction { const Shape& shape, absl::Span new_operands, HloCloneContext* context) const override; - bool constrain_layout_; bool use_global_device_ids_; }; @@ -395,7 +427,7 @@ class HloAllToAllInstruction : public HloCollectiveInstruction { public: explicit HloAllToAllInstruction( const Shape& shape, absl::Span operands, - const std::vector& replica_groups, + const std::vector& replica_groups, bool constrain_layout, const absl::optional& channel_id, const absl::optional& split_dimension); @@ -465,6 +497,7 @@ class HloReverseInstruction : public HloInstruction { // Returns the dimension sizes or numbers associated with this instruction. 
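
Passes that need the all-gather specific attributes declared above can downcast with the same DynCast helper the hlo_dce change earlier in this diff uses for the collective base class. A short sketch, assuming instr is any HloInstruction pointer:

    #include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
    #include "tensorflow/compiler/xla/service/hlo_instructions.h"

    // Sketch: inspect the attributes of an all-gather, if instr is one.
    void LogAllGatherAttributes(const xla::HloInstruction* instr) {
      if (const auto* ag = xla::DynCast<xla::HloAllGatherInstruction>(instr)) {
        VLOG(2) << ag->name() << ": dim=" << ag->all_gather_dimension()
                << " constrain_layout=" << ag->constrain_layout()
                << " use_global_device_ids=" << ag->use_global_device_ids();
      }
    }
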
const std::vector& dimensions() const override { return dimensions_; } int64 dimensions(int64 index) const override { return dimensions()[index]; } + std::vector* mutable_dimensions() override { return &dimensions_; } // Returns a serialized representation of this instruction. HloInstructionProto ToProto() const override; @@ -491,6 +524,7 @@ class HloConcatenateInstruction : public HloInstruction { // Returns the dimension sizes or numbers associated with this instruction. const std::vector& dimensions() const override { return dimensions_; } int64 dimensions(int64 index) const override { return dimensions()[index]; } + std::vector* mutable_dimensions() override { return &dimensions_; } // Accessor for the dimension in which a concatenate HLO should occur. int64 concatenate_dimension() const { return dimensions(0); } // Returns a serialized representation of this instruction. @@ -520,6 +554,7 @@ class HloReduceInstruction : public HloInstruction { // Returns the dimension sizes or numbers associated with this instruction. const std::vector& dimensions() const override { return dimensions_; } int64 dimensions(int64 index) const override { return dimensions()[index]; } + std::vector* mutable_dimensions() override { return &dimensions_; } // Returns a serialized representation of this instruction. HloInstructionProto ToProto() const override; @@ -560,6 +595,7 @@ class HloSortInstruction : public HloInstruction { // Returns the dimension sizes or numbers associated with this instruction. const std::vector& dimensions() const override { return dimensions_; } int64 dimensions(int64 index) const override { return dimensions()[index]; } + std::vector* mutable_dimensions() override { return &dimensions_; } // Returns the sort dimension for this instruction int64 sort_dimension() const { return dimensions(0); } // Returns a serialized representation of this instruction. @@ -594,6 +630,7 @@ class HloTransposeInstruction : public HloInstruction { // Returns the dimension sizes or numbers associated with this instruction. const std::vector& dimensions() const override { return dimensions_; } int64 dimensions(int64 index) const override { return dimensions()[index]; } + std::vector* mutable_dimensions() override { return &dimensions_; } // Returns whether this instruction does a rank-2 transposition. bool IsRank2Transpose() const; // Returns a serialized representation of this instruction. @@ -621,6 +658,7 @@ class HloBroadcastInstruction : public HloInstruction { // Returns the dimension sizes or numbers associated with this instruction. const std::vector& dimensions() const override { return dimensions_; } int64 dimensions(int64 index) const override { return dimensions()[index]; } + std::vector* mutable_dimensions() override { return &dimensions_; } // Returns a serialized representation of this instruction. HloInstructionProto ToProto() const override; @@ -668,6 +706,7 @@ class HloMapInstruction : public HloInstruction { // Returns the dimension sizes or numbers associated with this instruction. const std::vector& dimensions() const override { return dimensions_; } int64 dimensions(int64 index) const override { return dimensions()[index]; } + std::vector* mutable_dimensions() { return &dimensions_; } // Returns a serialized representation of this instruction. 
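
The mutable_dimensions() overrides added above give passes in-place access to the stored dimension numbers. A hypothetical helper, not part of this change, that shifts them by a fixed offset; it is only valid for instruction kinds that override the hook, since the base implementation LOG(FATAL)s:

    #include "tensorflow/compiler/xla/service/hlo_instruction.h"
    #include "tensorflow/compiler/xla/types.h"

    // Hypothetical sketch: shift dimension numbers in place, e.g. after a pass
    // prepends a new major dimension to the operand shapes.
    void ShiftDimensionNumbers(xla::HloInstruction* instr, xla::int64 offset) {
      for (xla::int64& dim : *instr->mutable_dimensions()) {
        dim += offset;
      }
    }
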
HloInstructionProto ToProto() const override; diff --git a/tensorflow/compiler/xla/service/hlo_module.cc b/tensorflow/compiler/xla/service/hlo_module.cc index de65ed99303..9722d5c2b76 100644 --- a/tensorflow/compiler/xla/service/hlo_module.cc +++ b/tensorflow/compiler/xla/service/hlo_module.cc @@ -420,6 +420,8 @@ StatusOr HloModule::CreateModuleConfigFromShape( if (execution_options->num_partitions() > 0) { module_config.set_num_partitions(execution_options->num_partitions()); } + module_config.set_use_spmd_partitioning( + execution_options->use_spmd_partitioning()); if (execution_options->has_device_assignment()) { TF_ASSIGN_OR_RETURN(std::unique_ptr device_assignment, DeviceAssignment::Deserialize( diff --git a/tensorflow/compiler/xla/service/hlo_module_config.h b/tensorflow/compiler/xla/service/hlo_module_config.h index d90a1485441..964f83322a4 100644 --- a/tensorflow/compiler/xla/service/hlo_module_config.h +++ b/tensorflow/compiler/xla/service/hlo_module_config.h @@ -104,10 +104,20 @@ class HloModuleConfig { return debug_options_.xla_hlo_profile(); } + bool cpu_traceme_enabled() const { + return debug_options_.xla_cpu_enable_xprof_traceme(); + } + // Sets/returns the module seed set during execution. void set_seed(uint64 seed) { seed_ = seed; } uint64 seed() const { return seed_; } + // Set the launch id of the program. Launch id identifies a set of programs + // that should be launched together. + void set_launch_id(uint64 launch_id) { launch_id_ = launch_id; } + + int32 launch_id() const { return launch_id_; } + void set_replica_count(int64 replica_count) { replica_count_ = replica_count; } @@ -118,6 +128,11 @@ class HloModuleConfig { } int64 num_partitions() const { return num_partitions_; } + void set_use_spmd_partitioning(bool use_spmd_partitioning) { + use_spmd_partitioning_ = use_spmd_partitioning; + } + bool use_spmd_partitioning() const { return use_spmd_partitioning_; } + // Return a string which unambiguously represents all the fields of this data // structure. Used for generating a cache key for storing the compiled // executable. @@ -189,6 +204,14 @@ class HloModuleConfig { std::vector>* mutable_dot_config() { return &dot_config_; } + const std::vector>>& layout_config() const { + return layout_config_; + } + + std::vector>>* mutable_layout_config() { + return &layout_config_; + } + private: // If you add new members, be sure to update compilation_cache_key. @@ -197,12 +220,19 @@ class HloModuleConfig { // Module/graph-level seed handle. uint64 seed_ = 0; + // Program id that identifies a set of program to be launched together. + int32 launch_id_ = 0; + // The number of replicas (data parallelism) to compile this binary for. int64 replica_count_ = 1; // The number of partitions (model parallelism) to compile this binary for. int64 num_partitions_ = 1; + // Whether to use SPMD (true) or MPMD (false) when num_partitions_ > 0 and XLA + // needs to partition the module. + bool use_spmd_partitioning_ = false; + // The target maximum parallelism at which to partition HLOs for parallel // execution on the CPU backend. int64 intra_op_parallelism_threads_ = -1; @@ -219,6 +249,9 @@ class HloModuleConfig { FusionConfigCollection fusion_config_collection_ = FusionConfigCollection::kOff; + // TODO(b/155665133): Consolidate fusion, dot, and layout config into a proto + // similar to backend config. + // Custom fusion configuration, where fusion_config_[c][v] control if node v // in computation c must be fused to all its consumers (true) or not (false). 
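
A brief sketch of how a client might populate the new HloModuleConfig fields above before compilation; the values are illustrative:

    #include "tensorflow/compiler/xla/service/hlo_module_config.h"

    // Sketch with illustrative values for the newly added options.
    xla::HloModuleConfig MakeConfig() {
      xla::HloModuleConfig config;
      config.set_replica_count(1);
      config.set_num_partitions(8);
      // Partition the module with SPMD rather than MPMD.
      config.set_use_spmd_partitioning(true);
      // Programs that share a launch_id are launched together.
      config.set_launch_id(42);
      return config;
    }
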
std::vector> fusion_config_; @@ -227,6 +260,10 @@ class HloModuleConfig { // how to convert dot operation v (sorted topologically and by computation) to // convolution. std::vector> dot_config_; + + // Layout configuration, where layout_config_[v][i] controls the layout + // decision i of operation v. + std::vector>> layout_config_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_opcode.h b/tensorflow/compiler/xla/service/hlo_opcode.h index dfe68d93f30..664fa10a990 100644 --- a/tensorflow/compiler/xla/service/hlo_opcode.h +++ b/tensorflow/compiler/xla/service/hlo_opcode.h @@ -48,6 +48,7 @@ namespace xla { V(kAdd, "add", 2) \ V(kAddDependency, "add-dependency", 2) \ V(kAfterAll, "after-all", kHloOpcodeIsVariadic) \ + V(kAllGather, "all-gather", 1) \ V(kAllReduce, "all-reduce", kHloOpcodeIsVariadic) \ V(kAllToAll, "all-to-all", kHloOpcodeIsVariadic) \ V(kAtan2, "atan2", 2) \ @@ -138,6 +139,7 @@ namespace xla { V(kSlice, "slice", 1) \ V(kSort, "sort", kHloOpcodeIsVariadic) \ V(kSqrt, "sqrt", 1) \ + V(kCbrt, "cbrt", 1) \ V(kSubtract, "subtract", 2) \ V(kTanh, "tanh", 1) \ V(kTrace, "trace", 1) \ diff --git a/tensorflow/compiler/xla/service/hlo_parser.cc b/tensorflow/compiler/xla/service/hlo_parser.cc index f41ed233ed3..2a90c95850c 100644 --- a/tensorflow/compiler/xla/service/hlo_parser.cc +++ b/tensorflow/compiler/xla/service/hlo_parser.cc @@ -784,6 +784,7 @@ bool HloParserImpl::ParseInstructionRhs(HloComputation::Builder* builder, case HloOpcode::kSign: case HloOpcode::kSin: case HloOpcode::kSqrt: + case HloOpcode::kCbrt: case HloOpcode::kTanh: { if (!ParseOperands(&operands, /*expected_size=*/1) || !ParseAttributes(attrs)) { @@ -849,6 +850,35 @@ bool HloParserImpl::ParseInstructionRhs(HloComputation::Builder* builder, HloInstruction::CreateBitcastConvert(shape, operands[0])); break; } + case HloOpcode::kAllGather: { + optional>> tmp_groups; + optional> replica_group_ids; + optional channel_id; + optional> dimensions; + optional constrain_layout; + optional use_global_device_ids; + attrs["replica_groups"] = {/*required=*/false, + AttrTy::kBracedInt64ListList, &tmp_groups}; + attrs["channel_id"] = {/*required=*/false, AttrTy::kInt64, &channel_id}; + attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List, + &dimensions}; + attrs["constrain_layout"] = {/*required=*/false, AttrTy::kBool, + &constrain_layout}; + attrs["use_global_device_ids"] = {/*required=*/false, AttrTy::kBool, + &use_global_device_ids}; + if (!ParseOperands(&operands) || !ParseAttributes(attrs)) { + return false; + } + std::vector replica_groups; + if (tmp_groups) { + replica_groups = CreateReplicaGroups(*tmp_groups); + } + instruction = builder->AddInstruction(HloInstruction::CreateAllGather( + shape, operands[0], dimensions->at(0), replica_groups, + constrain_layout ? *constrain_layout : false, channel_id, + use_global_device_ids ? 
*use_global_device_ids : false)); + break; + } case HloOpcode::kAllReduce: { optional>> tmp_groups; optional to_apply; @@ -887,6 +917,9 @@ bool HloParserImpl::ParseInstructionRhs(HloComputation::Builder* builder, optional> dimensions; attrs["dimensions"] = {/*required=*/false, AttrTy::kBracedInt64List, &dimensions}; + optional constrain_layout; + attrs["constrain_layout"] = {/*required=*/false, AttrTy::kBool, + &constrain_layout}; if (!ParseOperands(&operands) || !ParseAttributes(attrs) || (dimensions && dimensions->size() != 1)) { return false; @@ -900,7 +933,9 @@ bool HloParserImpl::ParseInstructionRhs(HloComputation::Builder* builder, split_dimension = dimensions->at(0); } instruction = builder->AddInstruction(HloInstruction::CreateAllToAll( - shape, operands, replica_groups, channel_id, split_dimension)); + shape, operands, replica_groups, + constrain_layout ? *constrain_layout : false, channel_id, + split_dimension)); break; } case HloOpcode::kCollectivePermute: { @@ -1892,6 +1927,9 @@ bool HloParserImpl::ParseInstructionRhs(HloComputation::Builder* builder, if (outer_dimension_partitions) { instruction->set_outer_dimension_partitions(*outer_dimension_partitions); } + if (frontend_attributes) { + instruction->set_frontend_attributes(*frontend_attributes); + } return AddInstruction(name, instruction, name_loc); } // NOLINT(readability/fn_size) @@ -1946,7 +1984,7 @@ bool HloParserImpl::ParseFrontendAttributes( if (!ParseAttributeName(&attribute)) { return false; } - if (lexer_.GetKind() != TokKind::kIdent) { + if (lexer_.GetKind() != TokKind::kString) { return false; } (*frontend_attributes->mutable_map())[attribute] = lexer_.GetStrVal(); diff --git a/tensorflow/compiler/xla/service/hlo_parser_test.cc b/tensorflow/compiler/xla/service/hlo_parser_test.cc index 66ce7d821f0..e18014a3071 100644 --- a/tensorflow/compiler/xla/service/hlo_parser_test.cc +++ b/tensorflow/compiler/xla/service/hlo_parser_test.cc @@ -42,6 +42,7 @@ using absl::string_view; struct TestData { string test_name; string module_string; + int64 replica_count = 1; bool enable_verification = true; }; @@ -1439,7 +1440,8 @@ ENTRY AllReduceWithSubgroups { ROOT all-reduce = f32[128,32]{0,1} all-reduce(input), replica_groups={{0,1},{2,3}}, to_apply=add } -)" +)", +/*replica_count=*/4, }, // all-reduce with constrained layout { @@ -1478,6 +1480,43 @@ ENTRY CRS { )" }, +// all-gather +{ +"AllGather", +R"(HloModule AllGather + +ENTRY AllGather { + input = f32[128,32]{0,1} parameter(0) + ROOT ag = f32[128,128]{0,1} all-gather(input), replica_groups={}, dimensions={1} +} + +)" +}, +// all-gather with constrained layout +{ +"AllGatherWithLayout", +R"(HloModule AllGather + +ENTRY AllGather { + input = f32[128,32]{0,1} parameter(0) + ROOT ag = f32[128,128]{0,1} all-gather(input), replica_groups={}, constrain_layout=true, dimensions={1} +} + +)" +}, +// all-gather with subgroups +{ +"AllGatherWithSubgroups", +R"(HloModule AllGatherWithSubgroups + +ENTRY AllGatherWithSubgroups { + input = f32[128,32]{0,1} parameter(0) + ROOT ag = f32[128,64]{0,1} all-gather(input), replica_groups={{0,1},{2,3}}, dimensions={1} +} + +)", +/*replica_count=*/4, +}, // all-to-all { "AllToAll", @@ -1501,7 +1540,8 @@ ENTRY AllToAllWithSubgroups { ROOT a2a = (f32[128,32]{0,1}, f32[128,32]{0,1}) all-to-all(p0, p1), replica_groups={{1,2},{3,0}} } -)" +)", +/*replica_count=*/4, }, // collective-permute { @@ -1513,7 +1553,8 @@ ENTRY CollectivePermute { ROOT root = f32[128,32]{0,1} collective-permute(input), source_target_pairs={{0,1},{1,2},{2,3}} } -)" +)", 
+/*replica_count=*/4 }, // replica-id { @@ -1686,16 +1727,19 @@ class HloParameterizedParserTest void ExpectEqual() { std::unique_ptr module; const string& original = GetParam().module_string; + HloModuleConfig config; + config.set_replica_count(GetParam().replica_count); if (GetParam().enable_verification) { auto verified_module = absl::make_unique( - GetParam().test_name, HloModuleConfig(), + GetParam().test_name, config, /*verifier_layout_sensitive=*/false, /*allow_mixed_precision_in_hlo_verifier=*/true, ShapeUtil::ByteSizeOfElements); TF_ASSERT_OK(verified_module->ParseHloStringAndVerifyModule(original)); module = std::move(verified_module); } else { - TF_ASSERT_OK_AND_ASSIGN(module, ParseAndReturnUnverifiedModule(original)); + TF_ASSERT_OK_AND_ASSIGN(module, + ParseAndReturnUnverifiedModule(original, config)); } if (proto_round_trip) { TF_ASSERT_OK_AND_ASSIGN(module, HloModule::CreateFromProto( @@ -2415,7 +2459,8 @@ TEST_F(HloParserTest, ParseSharding) { } TEST_F(HloParserTest, ParseFrontendAttributes) { - const string original = "{attr_a=test_a,attr_b=b}"; + const string original = + R"({attr_a="test_a",attr_b="b",attr_c="s64",attr_d="a/b"})"; TF_ASSERT_OK_AND_ASSIGN(FrontendAttributes frontend_attributes, ParseFrontendAttributes(original)); EXPECT_EQ(FrontendAttributesToString(frontend_attributes), original); diff --git a/tensorflow/compiler/xla/service/hlo_pass_fix.h b/tensorflow/compiler/xla/service/hlo_pass_fix.h index 33af8297b94..a22a394c6a4 100644 --- a/tensorflow/compiler/xla/service/hlo_pass_fix.h +++ b/tensorflow/compiler/xla/service/hlo_pass_fix.h @@ -46,8 +46,8 @@ class HloPassFix : public Pass { VLOG(3) << "changed_this_iteration: " << changed_this_iteration; ++iteration_count; if (iteration_count == kLimit) { - LOG(WARNING) << "Unexpectedly high number of iterations in HLO passes, " - "exiting fixed point loop."; + VLOG(1) << "Unexpectedly high number of iterations in HLO passes, " + "exiting fixed point loop."; // Return false in case this is fixed point is nested. return false; } @@ -68,8 +68,8 @@ class HloPassFix : public Pass { VLOG(3) << "changed_this_iteration: " << changed_this_iteration; ++iteration_count; if (iteration_count == kLimit) { - LOG(WARNING) << "Unexpectedly high number of iterations in HLO passes, " - "exiting fixed point loop."; + VLOG(1) << "Unexpectedly high number of iterations in HLO passes, " + "exiting fixed point loop."; // Return false in case this is fixed point is nested. return false; } diff --git a/tensorflow/compiler/xla/service/hlo_reachability.h b/tensorflow/compiler/xla/service/hlo_reachability.h index 0b68cc27008..1d089333ef0 100644 --- a/tensorflow/compiler/xla/service/hlo_reachability.h +++ b/tensorflow/compiler/xla/service/hlo_reachability.h @@ -148,7 +148,7 @@ class HloReachabilityMap { private: using Word = uint64; - static const size_t kBits = 64; + static constexpr size_t kBits = 64; // Number of bits in the bitvector. 
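
For context on the HloPassFix change above: the wrapper is normally added to a pipeline so that the wrapped pass reruns until it stops reporting changes, and with this diff hitting kLimit now logs at VLOG(1) instead of WARNING. A sketch of typical usage, assuming AlgebraicSimplifier as the wrapped pass:

    #include "tensorflow/compiler/xla/service/algebraic_simplifier.h"
    #include "tensorflow/compiler/xla/service/hlo_pass_fix.h"
    #include "tensorflow/compiler/xla/service/hlo_pass_pipeline.h"

    // Sketch: run a pass to a fixed point inside a pipeline.
    xla::Status SimplifyToFixedPoint(xla::HloModule* module) {
      xla::HloPassPipeline pipeline("simplification");
      xla::AlgebraicSimplifierOptions options;
      pipeline.AddPass<xla::HloPassFix<xla::AlgebraicSimplifier>>(options);
      return pipeline.Run(module).status();
    }
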
size_t size_; diff --git a/tensorflow/compiler/xla/service/hlo_replication_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_replication_analysis_test.cc index 822b00aecbf..d858d6aa1c7 100644 --- a/tensorflow/compiler/xla/service/hlo_replication_analysis_test.cc +++ b/tensorflow/compiler/xla/service/hlo_replication_analysis_test.cc @@ -69,8 +69,8 @@ ENTRY entry { } )"; - TF_ASSERT_OK_AND_ASSIGN(auto module, - ParseAndReturnVerifiedModule(module_str)); + TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule( + module_str, /*replica_count=*/4)); auto param = module->entry_computation()->parameter_instruction(0); param->set_parameter_replicated_at_leaf_buffers( absl::Span{false, true}); @@ -149,8 +149,8 @@ ENTRY entry { } )"; - TF_ASSERT_OK_AND_ASSIGN(auto module, - ParseAndReturnVerifiedModule(module_str)); + TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule( + module_str, /*replica_count=*/4)); TF_ASSERT_OK_AND_ASSIGN( std::unique_ptr analysis, HloReplicationAnalysis::Run(module.get(), /*cross_partition_spmd=*/true)); @@ -575,8 +575,8 @@ ENTRY entry { } )"; - TF_ASSERT_OK_AND_ASSIGN(auto module, - ParseAndReturnVerifiedModule(module_str)); + TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule( + module_str, /*replica_count=*/2)); TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr analysis, HloReplicationAnalysis::Run( module.get(), /*cross_partition_spmd=*/false)); diff --git a/tensorflow/compiler/xla/service/hlo_sharding.cc b/tensorflow/compiler/xla/service/hlo_sharding.cc index 9701c343288..b0a03707efb 100644 --- a/tensorflow/compiler/xla/service/hlo_sharding.cc +++ b/tensorflow/compiler/xla/service/hlo_sharding.cc @@ -199,10 +199,12 @@ std::vector HloSharding::TileLimitForDevice(const Shape& shape, } int64 HloSharding::RequiredLeaves(const Shape& shape) { - // Empty tuples have no leaf nodes as far as ShapeUtil and ShapeTree are - // concerned, but they do have a single tuple_elements_ entry since we want - // to allow empty tuple results to have sharding. - return ShapeUtil::IsEmptyTuple(shape) ? 1 : ShapeUtil::GetLeafCount(shape); + // Empty tuples (with arbitrary nesting) have no leaf nodes as far as + // ShapeUtil and ShapeTree are concerned, but they do have a single + // tuple_elements_ entry since we want to allow empty tuple results to + // have sharding. + const int64 leaf_count = ShapeUtil::GetLeafCount(shape); + return (leaf_count == 0) ? 1 : leaf_count; } Status HloSharding::CheckLeafCount(const Shape& shape) const { diff --git a/tensorflow/compiler/xla/service/hlo_sharding_util.cc b/tensorflow/compiler/xla/service/hlo_sharding_util.cc new file mode 100644 index 00000000000..129091ca06f --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_sharding_util.cc @@ -0,0 +1,574 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/hlo_sharding_util.h" + +#include + +#include "absl/algorithm/container.h" +#include "tensorflow/compiler/xla/array.h" +#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/service/hlo_instructions.h" +#include "tensorflow/compiler/xla/service/hlo_sharding.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" + +namespace xla { +namespace hlo_sharding_util { + +absl::optional SelectDominantDevice( + const std::map& device_map, int64* top_count) { + int64 device = 0; + int64 count = 0; + for (auto& it : device_map) { + if (it.second > count) { + count = it.second; + device = it.first; + } + } + if (top_count != nullptr) { + *top_count = count; + } + return count > 0 ? absl::optional(device) : absl::optional(); +} + +Status AssignComputationDevice(HloComputation* computation, int64 device) { + VLOG(4) << "Assigning device " << device << " to " << computation->name() + << " computation"; + for (HloInstruction* instruction : computation->instructions()) { + if (!instruction->has_sharding()) { + VLOG(4) << "Assigning device " << device << " to " << instruction->name(); + instruction->set_device_sharding(device); + } + } + return Status::OK(); +} + +absl::optional GetMostOccurringDevice( + absl::Span instructions) { + std::map device_map; + for (HloInstruction* instruction : instructions) { + if (instruction->has_sharding()) { + for (auto& it : instruction->sharding().UsedDevices(nullptr)) { + // The UsedDevices() API returns a map. + device_map[it.first] += it.second; + } + } + } + return SelectDominantDevice(device_map, nullptr); +} + +StatusOr> GetDominantDevice( + absl::Span computations, double dominant_factor) { + int64 instruction_count = 0; + std::map device_map; + for (HloComputation* computation : computations) { + for (HloInstruction* instruction : computation->instructions()) { + int64 count = 1; + if (instruction->has_sharding()) { + for (auto& it : instruction->sharding().UsedDevices(&count)) { + // The UsedDevices() API returns a map. 
+ device_map[it.first] += it.second; + } + } + instruction_count += count; + } + } + int64 count; + absl::optional device = SelectDominantDevice(device_map, &count); + absl::optional dominant_device; + if (device) { + double factor = + static_cast(count) / static_cast(instruction_count); + if (factor >= dominant_factor) { + dominant_device = device; + } + } + return dominant_device; +} + +HloSharding TransposeSharding(const HloSharding& sharding, + const std::vector& dimensions) { + if (sharding.IsTileMaximal()) { + return sharding; + } + const int64 rank = dimensions.size(); + std::vector tile_assignment_dim(rank); + for (int64 i = 0; i < rank; ++i) { + tile_assignment_dim[i] = sharding.tile_assignment().dim(dimensions[i]); + } + Array tile_assignment = sharding.tile_assignment(); + tile_assignment.Reshape(tile_assignment_dim); + tile_assignment.Each([&](absl::Span indices, int64* value) { + std::vector src_indices(indices.size(), -1); + for (int64 i = 0; i < indices.size(); ++i) { + src_indices[dimensions[i]] = indices[i]; + } + *value = sharding.tile_assignment()(src_indices); + }); + return HloSharding::Tile(tile_assignment); +} + +absl::optional ReshapeSharding(const Shape& source_shape, + const Shape& target_shape, + const HloSharding& sharding) { + if (sharding.IsTileMaximal()) { + return sharding; + } + + // In case of a tiled sharding the reshaped sharding will be a valid if the + // reshape is composed from the following operations: + // * Adding or removing dimensions with size 1. + // * Merging consecutive dimensions where only the most major is sharded. + // * Splitting a dimension to consecutive dimensions. + // * Any reshaping of unsharded dimensions. + // Note that merge and split can happen consecutively on the same dimension, + // e.g., f32[1024,256,1024] to f32[128,2048,1024] can be considered that 1024 + // gets split into 128 and 8, but 8 then gets merged with 256. We use stacks + // to make supporting such cases easy. + const Shape tile_shape = sharding.TileShape(source_shape); + std::vector target_tile_assignment_dimensions; + std::vector source_dims_stack(source_shape.rank()); + std::vector target_dims_stack(target_shape.rank()); + std::vector sharding_tile_dims_stack(source_shape.rank()); + for (int64 i = 0; i < source_shape.rank(); ++i) { + source_dims_stack[i] = source_shape.dimensions(source_shape.rank() - 1 - i); + sharding_tile_dims_stack[i] = + sharding.tile_assignment().dim(source_shape.rank() - 1 - i); + } + for (int64 i = 0; i < target_shape.rank(); ++i) { + target_dims_stack[i] = target_shape.dimensions(target_shape.rank() - 1 - i); + } + while (!source_dims_stack.empty() || !target_dims_stack.empty()) { + if (target_dims_stack.empty()) { + if (Product(sharding_tile_dims_stack) != 1) { + return absl::nullopt; + } + break; + } + int64 s_size = 1; + int64 t_size = 1; + int64 s_partitions = 1; + if (!source_dims_stack.empty()) { + s_size = source_dims_stack.back(); + source_dims_stack.pop_back(); + s_partitions = sharding_tile_dims_stack.back(); + sharding_tile_dims_stack.pop_back(); + } + t_size = target_dims_stack.back(); + target_dims_stack.pop_back(); + if (s_partitions * Product(sharding_tile_dims_stack) == 1) { + // No more partitions left. + target_tile_assignment_dimensions.push_back(1); + continue; + } + if (s_size == t_size) { + // Same dimension. + target_tile_assignment_dimensions.push_back(s_partitions); + } else if (t_size == 1) { + // Trivial dimension added. 
+ target_tile_assignment_dimensions.push_back(1); + source_dims_stack.push_back(s_size); + sharding_tile_dims_stack.push_back(s_partitions); + } else if (s_size == 1) { + // Trivial dimension removed. + if (s_partitions != 1) { + return absl::nullopt; + } + target_dims_stack.push_back(t_size); + } else if (s_size > t_size) { + // Dimension split. + if (s_size % t_size != 0 || t_size % s_partitions != 0) { + return absl::nullopt; + } + target_tile_assignment_dimensions.push_back(s_partitions); + // We have part of the s_size unprocessed, so put it back to stack. + source_dims_stack.push_back(s_size / t_size); + sharding_tile_dims_stack.push_back(1); + } else { + // Dimension merge. Also merge the source dimension with the next, and + // process it next time. + if (s_size % s_partitions != 0) { + return absl::nullopt; + } + CHECK(!source_dims_stack.empty()); + if (sharding_tile_dims_stack.back() != 1 && s_size != s_partitions) { + // If the next dimension to combine is sharded, we require that the + // current dimension's shard size to be 1. Otherwise, the new shard + // would be non-contiguous. + return absl::nullopt; + } + source_dims_stack.back() *= s_size; + sharding_tile_dims_stack.back() *= s_partitions; + target_dims_stack.push_back(t_size); + } + } + Array new_tile_assignment = sharding.tile_assignment(); + new_tile_assignment.Reshape(target_tile_assignment_dimensions); + return HloSharding::Tile(new_tile_assignment); +} + +HloSharding ReshapeToTileDimension(const HloSharding& sharding, int64 dim, + absl::Span dims) { + CHECK(!sharding.IsTuple() && !sharding.IsTileMaximal()); + CHECK_NE(absl::c_find(dims, dim), dims.end()) << "dim is not in dims"; + // We optimize the tile assignment on the single dimension dim in a way to + // minimize communication among devices caused by the reshard: + // +---+---+ +---+---+ +-+-+-+-+ + // | | | | 0 | | | | | | + // | 0 | 1 | +-------+ | | | | | + // | | | reshape on | 1 | reshape on | | | | | + // +---+---+ dim 0 => +-------+ dim 1 => |0|2|1|3| + // | | | | 2 | | | | | | + // | 2 | 3 | +-------+ | | | | | + // | | | | 3 | | | | | | + // +---+---+ +---+---+ +-+-+-+-+ + + std::vector tile_dims(sharding.tile_assignment().num_dimensions(), 1); + // Handle ignore dimensions. 
+ std::vector ignore_sizes; + int64 ignore_size = 1; + for (int64 i = 0; i < sharding.tile_assignment().num_dimensions(); ++i) { + if (absl::c_find(dims, i) == dims.end()) { + int64 size = sharding.tile_assignment().dim(i); + ignore_sizes.push_back(size); + tile_dims[i] = size; + ignore_size *= size; + } + } + + using Buckets = std::vector>; + Array buckets(ignore_sizes, + Buckets(sharding.tile_assignment().dim(dim))); + sharding.tile_assignment().Each( + [&](absl::Span index, int64 device) { + std::vector ignore_index; + for (int64 i = 0; i < index.size(); ++i) { + if (absl::c_find(dims, i) == dims.end()) { + ignore_index.push_back(index[i]); + } + } + buckets(ignore_index)[index[dim]].push_back(device); + }); + std::vector devices; + buckets.Each([&](absl::Span index, const Buckets& buckets) { + for (auto& bucket : buckets) { + devices.insert(devices.end(), bucket.begin(), bucket.end()); + } + }); + tile_dims[dim] = devices.size() / ignore_size; + Array tile_assignment(tile_dims); + tile_assignment.SetValues(devices); + return HloSharding::Tile(tile_assignment); +} + +bool ContainsTileSharding(const HloModule& module) { + for (const HloComputation* computation : module.computations()) { + for (const HloInstruction* instruction : computation->instructions()) { + if (instruction->has_sharding() && + !instruction->sharding().IsTileMaximal()) { + return true; + } + } + } + return false; +} + +HloSharding GatherOutputSharding(const HloSharding& index_sharding, + const HloInstruction* hlo) { + if (index_sharding.IsTileMaximal()) { + return index_sharding; + } + + const GatherDimensionNumbers& dnums = hlo->gather_dimension_numbers(); + std::vector output_tile_assignment_dims; + for (int64 i = 0, index_dim = 0; i < hlo->shape().rank(); ++i) { + if (absl::c_binary_search(dnums.offset_dims(), i)) { + output_tile_assignment_dims.push_back(1); + } else { + output_tile_assignment_dims.push_back( + index_sharding.tile_assignment().dim(index_dim)); + index_dim++; + } + } + Array new_tile_assignment = index_sharding.tile_assignment(); + new_tile_assignment.Reshape(output_tile_assignment_dims); + return HloSharding::Tile(new_tile_assignment); +} + +HloSharding GatherIndexSharding(const HloSharding& output_sharding, + const HloInstruction* hlo) { + if (output_sharding.IsTileMaximal()) { + return output_sharding; + } + + const GatherDimensionNumbers& dnums = hlo->gather_dimension_numbers(); + std::vector index_tile_assignment_dims; + for (int64 i = 0; i < hlo->shape().rank(); ++i) { + if (!absl::c_binary_search(dnums.offset_dims(), i)) { + index_tile_assignment_dims.push_back( + output_sharding.tile_assignment().dim(i)); + } + } + Array new_tile_assignment = output_sharding.tile_assignment(); + new_tile_assignment.Reshape(index_tile_assignment_dims); + return HloSharding::Tile(new_tile_assignment); +} + +HloSharding GatherEffectiveOutputSharding(const HloInstruction& hlo) { + if (hlo.sharding().IsTileMaximal()) { + return hlo.sharding(); + } + + const GatherDimensionNumbers& dnums = hlo.gather_dimension_numbers(); + std::vector tile_assignment_dims(hlo.shape().rank()); + int64 num_elements = 1; + for (int64 i = 0; i < hlo.shape().rank(); ++i) { + if (!absl::c_binary_search(dnums.offset_dims(), i)) { + tile_assignment_dims[i] = hlo.sharding().tile_assignment().dim(i); + num_elements *= hlo.sharding().tile_assignment().dim(i); + } else { + tile_assignment_dims[i] = 1; + } + } + if (num_elements == hlo.sharding().tile_assignment().num_elements()) { + // Output sharding is only on non offset dimensions. 
We use output sharding + // to shard this gather op directly. + return hlo.sharding(); + } + + if (num_elements == 1) { + // Output sharding is only on offset dimensions. We do not shard this gather + // op. Return a tile maximal sharding with the first device in output + // sharding tile assignment. + return HloSharding::AssignDevice(*hlo.sharding().tile_assignment().begin()); + } + + // Output sharding is on both offset and non offset dimensions. We shard the + // gather op only on non offset dimensions. + // For example: + // - the gather op has sharding [2,2]{0,1,2,3}, + // - first dimension is non offset dimension, + // - second dimension is offset dimension, + // Then the result sharding will be [2,1]{0,2}. + std::vector slice_starts(hlo.shape().rank(), 0LL), + slice_limits(hlo.shape().rank()); + for (int64 i = 0; i < hlo.shape().rank(); ++i) { + if (!absl::c_binary_search(dnums.offset_dims(), i)) { + slice_limits[i] = hlo.sharding().tile_assignment().dim(i); + } else { + slice_limits[i] = 1; + } + } + Array tile_assignment = + hlo.sharding().tile_assignment().Slice(slice_starts, slice_limits); + return HloSharding::Tile(tile_assignment); +} + +HloSharding ScatterIndexSharding(const HloSharding& data_sharding, + const HloInstruction* hlo) { + if (data_sharding.IsTileMaximal()) { + return data_sharding; + } + + const ScatterDimensionNumbers& dnums = hlo->scatter_dimension_numbers(); + std::vector index_tile_assignment_dims; + for (int64 i = 0; i < hlo->shape().rank(); ++i) { + if (!absl::c_binary_search(dnums.update_window_dims(), i)) { + index_tile_assignment_dims.push_back( + data_sharding.tile_assignment().dim(i)); + } + } + if (index_tile_assignment_dims.size() < hlo->operand(1)->shape().rank()) { + index_tile_assignment_dims.push_back(1); + } + Array new_tile_assignment = data_sharding.tile_assignment(); + new_tile_assignment.Reshape(index_tile_assignment_dims); + return HloSharding::Tile(new_tile_assignment); +} + +HloSharding ScatterDataSharding(const HloSharding& index_sharding, + const HloInstruction* hlo) { + if (index_sharding.IsTileMaximal()) { + return index_sharding; + } + + const ScatterDimensionNumbers& dnums = hlo->scatter_dimension_numbers(); + std::vector data_tile_assignment_dims; + for (int64 i = 0, index_dim = 0; i < hlo->shape().rank(); ++i) { + if (absl::c_binary_search(dnums.update_window_dims(), i)) { + data_tile_assignment_dims.push_back(1); + } else { + data_tile_assignment_dims.push_back( + index_sharding.tile_assignment().dim(index_dim)); + index_dim++; + } + } + Array new_tile_assignment = index_sharding.tile_assignment(); + new_tile_assignment.Reshape(data_tile_assignment_dims); + return HloSharding::Tile(new_tile_assignment); +} + +HloSharding ScatterEffectiveIndexSharding(const HloSharding& index_sharding, + const HloInstruction& hlo) { + if (index_sharding.IsTileMaximal()) { + return index_sharding; + } + + // Only shard on first "number of scatter_window_dims" dimensions. + const ScatterDimensionNumbers& dnums = hlo.scatter_dimension_numbers(); + int64 num_elements = 1; + int64 index_dim = 0; + for (int64 i = 0; i < hlo.shape().rank(); ++i) { + if (absl::c_binary_search(dnums.inserted_window_dims(), i)) { + num_elements *= index_sharding.tile_assignment().dim(index_dim); + index_dim++; + } + } + if (num_elements == index_sharding.tile_assignment().num_elements()) { + // Index sharding is only on scatter_window_dims. We use this index sharding + // directly. + return index_sharding; + } + + // Index sharding is only on update_window_dims. 
We do not shard this scatter + // op. Return a tile maximal sharding with the first device in index sharding + // tile assignment. + if (num_elements == 1) { + return HloSharding::AssignDevice(*index_sharding.tile_assignment().begin()); + } + + const int64 index_rank = hlo.operand(1)->shape().rank(); + std::vector slice_starts(index_rank, 0LL), slice_limits(index_rank); + for (int64 i = 0; i < index_rank; ++i) { + if (i < index_dim) { + slice_limits[i] = index_sharding.tile_assignment().dim(i); + } else { + slice_limits[i] = 1; + } + } + Array tile_assignment = + index_sharding.tile_assignment().Slice(slice_starts, slice_limits); + return HloSharding::Tile(tile_assignment); +} + +HloSharding ScatterEffectiveDataSharding(const HloSharding& data_sharding, + const HloInstruction& hlo) { + if (data_sharding.IsTileMaximal()) { + return data_sharding; + } + + const ScatterDimensionNumbers& dnums = hlo.scatter_dimension_numbers(); + const int64 data_rank = hlo.operand(2)->shape().rank(); + std::vector tile_assignment_dims(data_rank, 1LL); + int64 num_elements = 1; + for (int64 i = 0; i < hlo.shape().rank(); ++i) { + if (absl::c_binary_search(dnums.inserted_window_dims(), i)) { + CHECK_LT(i, data_rank); + tile_assignment_dims[i] = data_sharding.tile_assignment().dim(i); + num_elements *= data_sharding.tile_assignment().dim(i); + } + } + if (num_elements == data_sharding.tile_assignment().num_elements()) { + // Data sharding is only on scatter_window_dims. We use this data sharding + // directly. + return data_sharding; + } + + if (num_elements == 1) { + // Data sharding is only on update_window_dims. We do not shard this + // scatter op. Return a tile maximal sharding with the first device in + // data sharding tile assignment. + return HloSharding::AssignDevice(*data_sharding.tile_assignment().begin()); + } + + // Data sharding is on both update_window_dims and scatter_window_dims. We + // shard the scatter op only on scatter_window_dims. For example: + // - the scatter data has sharding [2,2]{0,1,2,3}, + // - first dimension is scatter_window_dims, + // - second dimension is update_window_dims, + // Then the result sharding will be [2,1]{0,2}. + std::vector slice_starts(data_rank, 0LL); + Array tile_assignment = + data_sharding.tile_assignment().Slice(slice_starts, tile_assignment_dims); + return HloSharding::Tile(tile_assignment); +} + +StatusOr, HloOpcode>> +IdentityValueAndHloOpcodeForScatterReduceComputation( + const HloScatterInstruction& scatter) { + auto computation = scatter.to_apply(); + // We only handle computations with 2 parameters and only 1 calculation. 
+ if (computation->instruction_count() != 3) { + return Status( + tensorflow::error::Code::INVALID_ARGUMENT, + "Expected scatter reduce computation with 2 parameters and only 1 " + "calculation"); + } + + auto root_instruction = computation->root_instruction(); + if (root_instruction->opcode() == HloOpcode::kAdd || + root_instruction->opcode() == HloOpcode::kOr) { + return std::make_pair(HloInstruction::CreateConstant(LiteralUtil::Zero( + scatter.shape().element_type())), + root_instruction->opcode()); + } else if (root_instruction->opcode() == HloOpcode::kMultiply || + root_instruction->opcode() == HloOpcode::kAnd) { + return std::make_pair(HloInstruction::CreateConstant( + LiteralUtil::One(scatter.shape().element_type())), + root_instruction->opcode()); + } else if (root_instruction->opcode() == HloOpcode::kMaximum) { + return std::make_pair(HloInstruction::CreateConstant(LiteralUtil::MinValue( + scatter.shape().element_type())), + root_instruction->opcode()); + } else if (root_instruction->opcode() == HloOpcode::kMinimum) { + return std::make_pair(HloInstruction::CreateConstant(LiteralUtil::MaxValue( + scatter.shape().element_type())), + root_instruction->opcode()); + } + + return Status(tensorflow::error::Code::INVALID_ARGUMENT, + "Expected scatter reduce computation which is " + "add/or/multiply/and/min/max"); +} + +std::vector<int64> DevicesForSharding( + const HloSharding& sharding, const std::vector<int64>& available_devices) { + std::vector<int64> devices; + if (sharding.IsReplicated()) { + for (int64 d : available_devices) { + if (!HloSharding::IsReservedDevice(d)) { + devices.push_back(d); + } + } + return devices; + } + + for (int64 i : available_devices) { + if (sharding.UsesDevice(i)) { + devices.push_back(i); + } + } + DCHECK(std::all_of(sharding.tile_assignment().begin(), + sharding.tile_assignment().end(), [&](int64 device) { + return std::find(available_devices.begin(), + available_devices.end(), + device) != available_devices.end(); + })); + return devices; +} + +} // namespace hlo_sharding_util +} // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_sharding_util.h b/tensorflow/compiler/xla/service/hlo_sharding_util.h new file mode 100644 index 00000000000..00d9434a34d --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_sharding_util.h @@ -0,0 +1,143 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_SHARDING_UTIL_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_SHARDING_UTIL_H_ + +#include <map> +#include <vector> + +#include "absl/types/optional.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_instructions.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_sharding.h" + +namespace xla { +namespace hlo_sharding_util { + +// Given a map from device to occurrence count, selects the device with the +// highest occurrence count (if any). If top_count is not nullptr, it will +// receive the count of the dominant device returned. +absl::optional<int64> SelectDominantDevice( + const std::map<int64, int64>& device_map, int64* top_count); + +// Assigns all the instructions of a computation to a given device. +// This API does not recurse into called computations, and does not assign +// instructions which already have sharding. +Status AssignComputationDevice(HloComputation* computation, int64 device); + +// Given an instruction container, returns the device which is most commonly +// occurring among the instructions. +absl::optional<int64> GetMostOccurringDevice( + absl::Span<HloInstruction* const> instructions); + +// Given a set of computations, tries to extract the dominant device. A device +// is dominant if its combined occurrence among all the instructions of the +// input computations is greater than or equal to dominant_factor (a real +// number from 0 to 1). +// This API does not recurse into called computations. +// If no device exists that satisfies the condition, the returned optional will +// hold no value. +StatusOr<absl::optional<int64>> GetDominantDevice( + absl::Span<HloComputation* const> computations, double dominant_factor); + +// Returns the HloSharding with the tile dimensions and tile assignment +// transposed based on the specified dimension numbers. In case of a tile +// maximal sharding returns the original sharding. +HloSharding TransposeSharding(const HloSharding& sharding, + const std::vector<int64>& dimensions); + +// Returns the HloSharding with the tile shape reshaped based on the source and +// target shapes and the tile assignment adjusted to correspond to the new tile +// shape, or absl::nullopt if the resulting reshape would create an invalid +// sharding (non-contiguous or non-uniformly sized tiles). In case of a tile +// maximal sharding returns the original sharding. +absl::optional<HloSharding> ReshapeSharding(const Shape& source_shape, + const Shape& target_shape, + const HloSharding& sharding); + +// Returns a sharding tiled on unique dimension dim by reshaping the tile +// assignment of the sharding argument. Only dimensions in the dims span +// argument are considered for reshaping, the others are ignored. +// Assumptions: sharding is tile sharded, and dim must be included in dims. +HloSharding ReshapeToTileDimension(const HloSharding& sharding, int64 dim, + absl::Span<const int64> dims); + +// Returns true if the provided module includes one or more instructions with +// a tile sharding. +bool ContainsTileSharding(const HloModule& module); + +// Returns the preferred output sharding for a gather op based on the sharding +// of the indices. +HloSharding GatherOutputSharding(const HloSharding& index_sharding, + const HloInstruction* hlo); + +// Returns the preferred index sharding for a gather op based on the sharding +// of the output.
+HloSharding GatherIndexSharding(const HloSharding& output_sharding, + const HloInstruction* hlo); + +// Returns a new HloSharding for a gather op so that only non offset dimensions +// are sharded. Assume "result" is returned by this function. It is ensured that +// "GetIndexSharding(result, hlo)" will have the same number of elements as +// "result". +HloSharding GatherEffectiveOutputSharding(const HloInstruction& hlo); + +// Returns the preferred index sharding for a scatter op based on the sharding +// of the data. +HloSharding ScatterIndexSharding(const HloSharding& data_sharding, + const HloInstruction* hlo); + +// Returns the preferred data sharding for a scatter op based on the sharding +// of the index. +HloSharding ScatterDataSharding(const HloSharding& index_sharding, + const HloInstruction* hlo); + +// Returns a new index sharding for a scatter op so that we only shard on first +// "number of scatter_window_dims" dimensions. Assume "result" is returned by +// this function. It is ensured that "ScatterDataSharding(result, hlo)" will +// have the same number of elements as "result". +HloSharding ScatterEffectiveIndexSharding(const HloSharding& index_sharding, + const HloInstruction& hlo); + +// Returns a new data sharding for a scatter op so that we only shard on +// scatter_window_dims. Assume "result" is returned by this function. It is +// ensured that "ScatterIndexSharding(result, hlo)" will have the same number of +// elements as "result". +HloSharding ScatterEffectiveDataSharding(const HloSharding& data_sharding, + const HloInstruction& hlo); + +// Returns an identity value and an HloOpcode for reduce computation of scatter +// instruction. +// - If computation is add/or, return 0/false with corresponding op code; +// - If computation is multiply/and, return 1/true with corresponding op code. +// - If computation is min/max, return max value/min value with corresponding op +// code. +// - Otherwise, return error status. +StatusOr, HloOpcode>> +IdentityValueAndHloOpcodeForScatterReduceComputation( + const HloScatterInstruction& scatter); + +// Given a sharding and a list of devices in the topology, return a +// list of the devices that `sharding` applies to. +std::vector DevicesForSharding( + const HloSharding& sharding, const std::vector& available_devices); + +} // namespace hlo_sharding_util +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_SHARDING_UTIL_H_ diff --git a/tensorflow/compiler/xla/service/hlo_sharding_util_test.cc b/tensorflow/compiler/xla/service/hlo_sharding_util_test.cc new file mode 100644 index 00000000000..02496c75965 --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_sharding_util_test.cc @@ -0,0 +1,206 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/hlo_sharding_util.h" + +#include "tensorflow/compiler/xla/test.h" + +namespace xla { +namespace hlo_sharding_util { +namespace { + +TEST(HloShardingUtilTest, TransposeShardingReplicated) { + EXPECT_EQ(TransposeSharding(HloSharding::Replicate(), {0, 1, 2}), + HloSharding::Replicate()); +} + +TEST(HloShardingUtilTest, TransposeShardingTiled) { + HloSharding input = HloSharding::Tile(Array4D({{{{0, 1}}, {{2, 3}}}})); + HloSharding output = + HloSharding::Tile(Array4D({{{{0}, {2}}}, {{{1}, {3}}}})); + EXPECT_EQ(TransposeSharding(input, {3, 0, 1, 2}), output); +} + +TEST(HloShardingUtilTest, ReshapeShardingMaximal) { + Shape input_shape = ShapeUtil::MakeShape(F32, {2, 3, 5}); + Shape output_shape = ShapeUtil::MakeShape(F32, {3, 5, 2}); + HloSharding sharding = HloSharding::AssignDevice(7); + absl::optional result = + ReshapeSharding(input_shape, output_shape, sharding); + EXPECT_TRUE(result.has_value()); + EXPECT_EQ(result.value(), sharding); +} + +TEST(HloShardingUtilTest, ReshapeShardingTiledInvalid) { + Shape input_shape = ShapeUtil::MakeShape(F32, {2, 3, 5}); + Shape output_shape = ShapeUtil::MakeShape(F32, {3, 5, 2}); + HloSharding sharding = HloSharding::Tile(Array3D({{{0}, {1}}})); + absl::optional result = + ReshapeSharding(input_shape, output_shape, sharding); + EXPECT_FALSE(result.has_value()); +} + +TEST(HloShardingUtilTest, ReshapeShardingTiledMerge) { + Shape input_shape = ShapeUtil::MakeShape(F32, {4, 5, 7}); + Shape output_shape = ShapeUtil::MakeShape(F32, {20, 7}); + HloSharding input_sharding = + HloSharding::Tile(Array3D({{{0}}, {{1}}})); + HloSharding output_sharding = HloSharding::Tile(Array2D({{0}, {1}})); + absl::optional result = + ReshapeSharding(input_shape, output_shape, input_sharding); + EXPECT_TRUE(result.has_value()); + EXPECT_EQ(result.value(), output_sharding); +} + +TEST(HloShardingUtilTest, ReshapeShardingTiledSplit) { + Shape input_shape = ShapeUtil::MakeShape(F32, {16, 7}); + Shape output_shape = ShapeUtil::MakeShape(F32, {4, 4, 7}); + HloSharding input_sharding = HloSharding::Tile(Array2D({{0}, {1}})); + HloSharding output_sharding = + HloSharding::Tile(Array3D({{{0}}, {{1}}})); + absl::optional result = + ReshapeSharding(input_shape, output_shape, input_sharding); + EXPECT_TRUE(result.has_value()); + EXPECT_EQ(result.value(), output_sharding); +} + +TEST(HloShardingUtilTest, ReshapeShardingTiledSplitThenMerge) { + Shape input_shape = ShapeUtil::MakeShape(F32, {16, 4, 7}); + Shape output_shape = ShapeUtil::MakeShape(F32, {4, 16, 7}); + HloSharding input_sharding = + HloSharding::Tile(Array3D({{{0}}, {{1}}})); + HloSharding output_sharding = + HloSharding::Tile(Array3D({{{0}}, {{1}}})); + absl::optional result = + ReshapeSharding(input_shape, output_shape, input_sharding); + EXPECT_TRUE(result.has_value()); + EXPECT_EQ(result.value(), output_sharding); +} + +TEST(HloShardingUtilTest, ReshapeShardingTiledArbitraryMinorDimensions) { + Shape input_shape = ShapeUtil::MakeShape(F32, {16, 7, 5, 3}); + Shape output_shape = ShapeUtil::MakeShape(F32, {4, 15, 2, 14}); + Array sharding_array({2, 1, 1, 1}); + sharding_array(0, 0, 0, 0) = 0; + sharding_array(1, 0, 0, 0) = 1; + HloSharding sharding = HloSharding::Tile(sharding_array); + absl::optional result = + ReshapeSharding(input_shape, output_shape, sharding); + EXPECT_TRUE(result.has_value()); + EXPECT_EQ(result.value(), sharding); +} + +TEST(HloShardingUtilTest, 
ReshapeShardingTiledTrivialDimensions) { + Shape input_shape = ShapeUtil::MakeShape(F32, {3, 1, 5, 7}); + Shape output_shape = ShapeUtil::MakeShape(F32, {3, 5, 1, 7}); + HloSharding input_sharding = + HloSharding::Tile(Array4D({{{{0}, {1}}}})); + HloSharding output_sharding = + HloSharding::Tile(Array4D({{{{0}}, {{1}}}})); + absl::optional result = + ReshapeSharding(input_shape, output_shape, input_sharding); + EXPECT_TRUE(result.has_value()); + EXPECT_EQ(result.value(), output_sharding); +} + +TEST(HloShardingUtilTest, ReshapeShardingTrivialDImensionInsertedToEnd) { + Shape input_shape = ShapeUtil::MakeShape(F32, {8, 16}); + Shape output_shape = ShapeUtil::MakeShape(F32, {8, 16, 1}); + HloSharding input_sharding = HloSharding::Tile(Array2D({{0}, {1}})); + HloSharding output_sharding = + HloSharding::Tile(Array3D({{{0}}, {{1}}})); + absl::optional result = + ReshapeSharding(input_shape, output_shape, input_sharding); + EXPECT_TRUE(result.has_value()); + EXPECT_EQ(result.value(), output_sharding); +} + +TEST(HloShardingUtilTest, NoopReshapeShardingEmptyTile) { + Shape shape = ShapeUtil::MakeShape(F32, {7, 1, 1}); + HloSharding sharding = HloSharding::Tile(Array3D({{{0}, {1}}})); + absl::optional result = ReshapeSharding(shape, shape, sharding); + EXPECT_TRUE(result.has_value()); + EXPECT_EQ(result.value(), sharding); +} + +TEST(HloShardingUtilTest, ReshapeShardingScalar) { + Shape input_shape = ShapeUtil::MakeShape(F32, {1, 1, 1}); + Shape output_shape = ShapeUtil::MakeShape(F32, {}); + HloSharding sharding = HloSharding::Tile(Array3D({{{0}, {1}}})); + absl::optional result = + ReshapeSharding(input_shape, output_shape, sharding); + EXPECT_FALSE(result.has_value()); +} + +TEST(HloShardingUtilTest, ReshapeToTileDimension2D_Dim0) { + HloSharding sharding = HloSharding::Tile(Array2D({{0, 1}, {2, 3}})); + HloSharding result = + ReshapeToTileDimension(sharding, /*dim=*/0, /*dims=*/{0, 1}); + EXPECT_EQ(result.tile_assignment(), Array2D({{0}, {1}, {2}, {3}})); +} + +TEST(HloShardingUtilTest, ReshapeToTileDimension2D_Dim1) { + HloSharding sharding = HloSharding::Tile(Array2D({{0, 1}, {2, 3}})); + HloSharding result = + ReshapeToTileDimension(sharding, /*dim=*/1, /*dims=*/{0, 1}); + EXPECT_EQ(result.tile_assignment(), Array2D({{0, 2, 1, 3}})); +} + +TEST(HloShardingUtilTest, ReshapeToTileDimension3D_Dim0) { + HloSharding sharding = + HloSharding::Tile(Array3D({{{0, 1}, {2, 3}}, {{4, 5}, {6, 7}}})); + HloSharding result = + ReshapeToTileDimension(sharding, /*dim=*/0, /*dims=*/{0, 1, 2}); + EXPECT_EQ( + result.tile_assignment(), + Array3D({{{0}}, {{1}}, {{2}}, {{3}}, {{4}}, {{5}}, {{6}}, {{7}}})); +} + +TEST(HloShardingUtilTest, ReshapeToTileDimension3D_Dim1) { + HloSharding sharding = + HloSharding::Tile(Array3D({{{0, 1}, {2, 3}}, {{4, 5}, {6, 7}}})); + HloSharding result = + ReshapeToTileDimension(sharding, /*dim=*/1, /*dims=*/{0, 1, 2}); + EXPECT_EQ(result.tile_assignment(), + Array3D({{{0}, {1}, {4}, {5}, {2}, {3}, {6}, {7}}})); +} + +TEST(HloShardingUtilTest, ReshapeToTileDimension3D_Dim2) { + HloSharding sharding = + HloSharding::Tile(Array3D({{{0, 1}, {2, 3}}, {{4, 5}, {6, 7}}})); + HloSharding result = + ReshapeToTileDimension(sharding, /*dim=*/2, /*dims=*/{0, 1, 2}); + EXPECT_EQ(result.tile_assignment(), + Array3D({{{0, 2, 4, 6, 1, 3, 5, 7}}})); +} + +TEST(HloShardingUtilTest, ReshapeToTileDimension2D_Dim2_Batch1) { + // Tile sharding in batch dimension, i.e. + // sharding={devices[2,2,2]0,1,2,3,4,5,6,7,8}. 
+ HloSharding sharding = + HloSharding::Tile(Array3D({{{0, 1}, {2, 3}}, {{4, 5}, {6, 7}}})); + // Reshape on dimensions {1, 2} only, therefore ignoring batch dimension 0. + HloSharding result = ReshapeToTileDimension(sharding, /*dim=*/2, + /*dims=*/{1, 2}); + // Expected result is {devices=[2,1,4]0,2,1,3,4,6,5,7}, i.e. the two + // non-batch dimensions {{0, 1}, {2, 3}} and {{4, 5}, {6, 7}} are individually + // reshaped to tile dimension 2, i.e. {{0, 2, 1, 3}}, {{4, 6, 5, 7}}. + EXPECT_EQ(result.tile_assignment(), + Array3D({{{0, 2, 1, 3}}, {{4, 6, 5, 7}}})); +} + +} // namespace +} // namespace hlo_sharding_util +} // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc old mode 100755 new mode 100644 index a8f9f612b0f..d15a36532eb --- a/tensorflow/compiler/xla/service/hlo_verifier.cc +++ b/tensorflow/compiler/xla/service/hlo_verifier.cc @@ -210,9 +210,66 @@ static Status CheckReplicaGroups(HloInstruction* hlo) { hlo->ToString()); } } + + // When the channel_id() or use_global_device_ids() is set, device ids in + // ReplicaGroup config no longer only mean replica ids. So we skip the check + // on the replica count. + if (auto channel_instr = DynCast(hlo)) { + if (channel_instr->channel_id()) { + return Status::OK(); + } + } + if (auto all_reduce = DynCast(hlo)) { + if (all_reduce->use_global_device_ids()) { + return Status::OK(); + } + } + + int64 replica_count = hlo->GetModule()->config().replica_count(); + if (!replicas_seen.empty() && replicas_seen.size() != replica_count) { + return InternalError( + "Replica count in HloModuleConfig is %d, but ReplicaGroup config " + "contains %d replicas: %s", + replica_count, replicas_seen.size(), hlo->ToString()); + } + return Status::OK(); } +Status ShapeVerifier::HandleAllGather(HloInstruction* hlo) { + auto ag = Cast(hlo); + TF_RETURN_IF_ERROR(CheckReplicaGroups(ag)); + TF_RET_CHECK(ag->all_gather_dimension() >= 0); + TF_RET_CHECK(ag->all_gather_dimension() < ag->shape().rank()); + TF_RET_CHECK(ag->all_gather_dimension() < ag->operand(0)->shape().rank()); + if (ag->use_global_device_ids() && ag->replica_groups().empty()) { + return InternalError( + "Replica group must be specified when use_global_device_ids is true"); + } + + int64 shard_count = CeilOfRatio( + ag->shape().dimensions(ag->all_gather_dimension()), + ag->operand(0)->shape().dimensions(ag->all_gather_dimension())); + if (ag->channel_id().has_value()) { + if (ag->use_global_device_ids()) { + TF_RET_CHECK(shard_count == ag->replica_groups()[0].replica_ids_size()); + } else { + if (ag->replica_groups().empty() || + ag->replica_groups()[0].replica_ids_size() != 1) { + return InternalError( + "Replica group size must be 1 when use_global_device_ids is " + "false if the all-gather is also cross-partition"); + } + } + } else if (!ag->replica_groups().empty()) { + // Cross-replica all-gather: shard count is subgroup size. 
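As a worked example of the shard-count check that follows (hypothetical shapes, not taken from the diff): an all-gather whose operand is f32[4,8], whose result is f32[16,8], and whose all_gather_dimension is 0 has shard_count = 16 / 4 = 4, so a cross-replica all-gather over replica_groups={{0,1,2,3}} passes, while a group of any other size fails the check. A minimal sketch of the arithmetic, assuming only xla::CeilOfRatio from util.h:

#include "tensorflow/compiler/xla/util.h"

namespace xla {
// Hypothetical shapes: operand f32[4,8] gathered along dimension 0 into
// f32[16,8]. The resulting shard count must match the subgroup size (4) for
// a cross-replica all-gather over replica_groups={{0,1,2,3}}.
int64 ExampleAllGatherShardCount() {
  return CeilOfRatio<int64>(16, 4);  // == 4
}
}  // namespace xla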
+ TF_RET_CHECK(shard_count == ag->replica_groups()[0].replica_ids_size()); + } + return CheckShape(ag, ShapeInference::InferAllGatherShape( + ag->operand(0)->shape(), ag->all_gather_dimension(), + shard_count)); +} + Status ShapeVerifier::HandleAllReduce(HloInstruction* crs) { TF_RETURN_IF_ERROR(CheckReplicaGroups(crs)); @@ -605,9 +662,11 @@ Status ShapeVerifier::HandleBitcast(HloInstruction* bitcast) { shape_size_function_(bitcast->operand(0)->shape())) { return InternalError( "Bitcast cannot have different shape sizes of output (%d) and operand " - "(%d)", + "(%d) (%s) (%s)", shape_size_function_(bitcast->shape()), - shape_size_function_(bitcast->operand(0)->shape())); + shape_size_function_(bitcast->operand(0)->shape()), + bitcast->shape().ToString(true), + bitcast->operand(0)->shape().ToString(true)); } return Status::OK(); } @@ -674,11 +733,7 @@ Status ShapeVerifier::HandleFusion(HloInstruction* fusion) { } for (HloInstruction* fused_param : fused_parameters) { int64 param_no = fused_param->parameter_number(); - // Since fusion buffers aren't materialized, fusion parameters will not have - // the same memory space as the fusion operand. - if (!ShapesSame(fused_param->shape(), fusion->operand(param_no)->shape(), - /*minor_to_major_only=*/false, - /*ignore_memory_space=*/true)) { + if (!ShapesSame(fused_param->shape(), fusion->operand(param_no)->shape())) { return InternalError( "Shape mismatch between parameter number %d and its operand in " "%s.", diff --git a/tensorflow/compiler/xla/service/hlo_verifier.h b/tensorflow/compiler/xla/service/hlo_verifier.h index 2e83361a591..7a2d3dc2e6c 100644 --- a/tensorflow/compiler/xla/service/hlo_verifier.h +++ b/tensorflow/compiler/xla/service/hlo_verifier.h @@ -56,6 +56,7 @@ class ShapeVerifier : public DfsHloVisitor { Status HandleFft(HloInstruction* fft) override; Status HandleCholesky(HloInstruction* hlo) override; Status HandleTriangularSolve(HloInstruction* hlo) override; + Status HandleAllGather(HloInstruction* hlo) override; Status HandleAllReduce(HloInstruction* crs) override; Status HandleAllToAll(HloInstruction* hlo) override; Status HandleCollectivePermute(HloInstruction* hlo) override; diff --git a/tensorflow/compiler/xla/service/hlo_verifier_test.cc b/tensorflow/compiler/xla/service/hlo_verifier_test.cc index 8b2b7f6726a..e2c363e40c5 100644 --- a/tensorflow/compiler/xla/service/hlo_verifier_test.cc +++ b/tensorflow/compiler/xla/service/hlo_verifier_test.cc @@ -859,8 +859,17 @@ string ReplicaGroupsStr(std::vector> replica_groups) { return absl::StrFormat("{%s}", absl::StrJoin(replica_group_strs, ", ")); } +int64 ReplicaCount(const std::vector>& replica_groups) { + int64 replica_count = 0; + for (auto group : replica_groups) { + replica_count += group.size(); + } + return replica_count; +} + StatusOr> MakeAllReduceComputation( - std::vector> replica_groups) { + std::vector> replica_groups, + absl::optional replica_count = absl::nullopt) { const char* kTemplate = R"( HloModule test add { @@ -872,8 +881,17 @@ StatusOr> MakeAllReduceComputation( p = f32[128]{0} parameter(0) crs = f32[128]{0} all-reduce(p), to_apply=add, replica_groups=REPLICA_GROUPS })"; - return ParseAndReturnUnverifiedModule(absl::StrReplaceAll( - kTemplate, {{"REPLICA_GROUPS", ReplicaGroupsStr(replica_groups)}})); + + HloModuleConfig config; + if (replica_count) { + config.set_replica_count(*replica_count); + } else { + config.set_replica_count(ReplicaCount(replica_groups)); + } + return ParseAndReturnUnverifiedModule( + absl::StrReplaceAll( + kTemplate, 
{{"REPLICA_GROUPS", ReplicaGroupsStr(replica_groups)}}), + config); } TEST_F(HloVerifierTest, AllReduce_NoReplicaGroupsOK) { @@ -907,22 +925,36 @@ TEST_F(HloVerifierTest, AllReduce_MissingReplicaId) { HasSubstr("Replica 4 is not named")); } +TEST_F(HloVerifierTest, AllReduce_NotEnougReplicasInGroupConfig) { + TF_ASSERT_OK_AND_ASSIGN(auto module, MakeAllReduceComputation({{0, 1}}, 8)); + EXPECT_THAT(verifier().Run(module.get()).status().error_message(), + HasSubstr("Replica count in HloModuleConfig is 8, but " + "ReplicaGroup config contains 2 replicas")); +} + +TEST_F(HloVerifierTest, AllReduce_TooManyReplicasInGroupConfig) { + TF_ASSERT_OK_AND_ASSIGN(auto module, + MakeAllReduceComputation({{0, 1}, {2, 3}}, 2)); + EXPECT_THAT(verifier().Run(module.get()).status().error_message(), + HasSubstr("Replica count in HloModuleConfig is 2, but " + "ReplicaGroup config contains 4 replicas")); +} + StatusOr> MakeAllToAllComputation( std::vector> replica_groups) { const char* kTemplate = R"( HloModule test - add { - x = f32[] parameter(0) - y = f32[] parameter(1) - ROOT add = f32[] add(x, y) - } ENTRY entry { p0 = f32[128]{0} parameter(0) p1 = f32[128]{0} parameter(1) a2a = (f32[128], f32[128]) all-to-all(p0, p1), replica_groups=REPLICA_GROUPS })"; - return ParseAndReturnUnverifiedModule(absl::StrReplaceAll( - kTemplate, {{"REPLICA_GROUPS", ReplicaGroupsStr(replica_groups)}})); + HloModuleConfig config; + config.set_replica_count(ReplicaCount(replica_groups)); + return ParseAndReturnUnverifiedModule( + absl::StrReplaceAll( + kTemplate, {{"REPLICA_GROUPS", ReplicaGroupsStr(replica_groups)}}), + config); } TEST_F(HloVerifierTest, AllToAll_NoReplicaGroupsOK) { @@ -957,6 +989,24 @@ TEST_F(HloVerifierTest, AllToAll_WrongNumberOfReplicasInGroup) { HasSubstr("Replica group has size 1")); } +TEST_F(HloVerifierTest, AllToAll_LayoutConstrained) { + const char* const kModuleStr = R"( + HloModule test + ENTRY entry { + p0 = f32[128,4]{0,1} parameter(0) + p1 = f32[128,4]{1,0} parameter(1) + ROOT a2a = (f32[128,4]{0,1}, f32[128,4]{1,0}) all-to-all(p0, p1), + replica_groups={{0,1}} + } + )"; + HloModuleConfig config; + config.set_replica_count(2); + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnUnverifiedModule(kModuleStr, config)); + EXPECT_THAT(verifier().Run(module.get()).status().error_message(), + HasSubstr("HLO all-to-all has operands with different shapes")); +} + TEST_F(HloVerifierTest, CollectivePermuteSameSourceTwice) { const char* const kModuleStr = R"( HloModule test @@ -966,8 +1016,10 @@ TEST_F(HloVerifierTest, CollectivePermuteSameSourceTwice) { source_target_pairs={{0,1}, {0,2}, {1,0}} } )"; + HloModuleConfig config; + config.set_replica_count(3); TF_ASSERT_OK_AND_ASSIGN(auto module, - ParseAndReturnUnverifiedModule(kModuleStr)); + ParseAndReturnUnverifiedModule(kModuleStr, config)); EXPECT_THAT(verifier().Run(module.get()).status().error_message(), HasSubstr("Source 0 appears more than once")); } diff --git a/tensorflow/compiler/xla/service/instruction_fusion.cc b/tensorflow/compiler/xla/service/instruction_fusion.cc index 53938a489f1..5de081c6343 100644 --- a/tensorflow/compiler/xla/service/instruction_fusion.cc +++ b/tensorflow/compiler/xla/service/instruction_fusion.cc @@ -145,6 +145,7 @@ bool IsAlwaysDuplicable(const HloInstruction& instruction) { case HloOpcode::kCholesky: case HloOpcode::kConditional: case HloOpcode::kConvolution: + case HloOpcode::kAllGather: case HloOpcode::kAllReduce: case HloOpcode::kAllToAll: case HloOpcode::kCollectivePermute: @@ -175,6 +176,7 @@ bool 
IsAlwaysDuplicable(const HloInstruction& instruction) { case HloOpcode::kSendDone: case HloOpcode::kSort: case HloOpcode::kSqrt: + case HloOpcode::kCbrt: case HloOpcode::kTanh: case HloOpcode::kTrace: case HloOpcode::kTriangularSolve: @@ -500,7 +502,7 @@ StatusOr InstructionFusion::Run(HloModule* module) { while (true) { auto next_entry = fusion_queue->DequeueNextInstructionAndOperandsToFuseInOrder(); - auto instruction = next_entry.first; + HloInstruction* instruction = next_entry.first; if (instruction == nullptr) { break; } @@ -510,12 +512,14 @@ StatusOr InstructionFusion::Run(HloModule* module) { continue; } + VLOG(5) << "Considering fusion of: " << instruction->ToString(); std::vector& sorted_operand_numbers = next_entry.second; for (int64 i : sorted_operand_numbers) { HloInstruction* operand = instruction->mutable_operand(i); if (!operand->IsFusible()) { + VLOG(3) << "Operand (" << operand->ToString() << ") is not fusible"; continue; } @@ -689,6 +693,8 @@ bool InstructionFusion::ShouldFuse(HloInstruction* consumer, if (FusionWouldDuplicate(*producer, *consumer) && (!may_duplicate_ || is_expensive_(*producer)) && !IsAlwaysDuplicable(*producer)) { + VLOG(4) << "Stopping: fusion may duplicate operand (" + << producer->ToString() << ") , and this is expensive"; return false; } diff --git a/tensorflow/compiler/xla/service/interpreter/executor.h b/tensorflow/compiler/xla/service/interpreter/executor.h index 3c35fda55f1..9e4bdeb2b2d 100644 --- a/tensorflow/compiler/xla/service/interpreter/executor.h +++ b/tensorflow/compiler/xla/service/interpreter/executor.h @@ -203,7 +203,8 @@ class XlaInterpreterExecutor : public internal::StreamExecutorInterface { std::unique_ptr GetStreamImplementation() override { - return std::unique_ptr(new host::HostStream()); + return std::unique_ptr( + new host::HostStream(/*thread_stack_size=*/0)); } std::unique_ptr GetTimerImplementation() override { diff --git a/tensorflow/compiler/xla/service/layout_assignment.cc b/tensorflow/compiler/xla/service/layout_assignment.cc index 4d3f1a4c09a..13699f3adf9 100644 --- a/tensorflow/compiler/xla/service/layout_assignment.cc +++ b/tensorflow/compiler/xla/service/layout_assignment.cc @@ -432,10 +432,10 @@ bool IsLayoutConstrainedCustomCall(HloInstruction* instruction) { return custom_call != nullptr && custom_call->layout_constrained(); } -bool IsLayoutConstrainedAllReduce(const HloInstruction* instruction) { - const HloAllReduceInstruction* all_reduce = - DynCast(instruction); - return all_reduce != nullptr && all_reduce->constrain_layout(); +bool IsLayoutConstrainedCollective(const HloInstruction* instruction) { + const HloCollectiveInstruction* collective = + DynCast(instruction); + return collective != nullptr && collective->constrain_layout(); } } // namespace @@ -520,7 +520,7 @@ Status LayoutAssignment::AddMandatoryConstraints( TF_RETURN_IF_ERROR( constraints->SetBufferLayout(new_shape.layout(), *buffer)); } - } else if (IsLayoutConstrainedAllReduce(instruction)) { + } else if (IsLayoutConstrainedCollective(instruction)) { TF_RETURN_IF_ERROR( constraints->SetInstructionLayout(instruction->shape(), instruction)); } else if (instruction->IsCrossModuleAllReduce()) { @@ -951,7 +951,8 @@ Status LayoutAssignment::CheckLayouts(HloModule* module) { if (!Shape::Equal() .IgnoreDynamicDimension() .MinorToMajorOnlyInLayout()(instruction_subshape, - buffer->shape())) { + buffer->shape()) && + instruction->opcode() != HloOpcode::kBitcast) { return InternalError( "Layout of instruction %s at index {%s} does not match " "source 
LogicalBuffer %s: %s vs %s", @@ -1798,17 +1799,10 @@ Status LayoutAssignment::ClearComputationLayouts(HloComputation* computation) { // potential bugs in the layout assignment pass that may accidentally use the // existing layout. for (HloInstruction* instruction : computation->instructions()) { - if (instruction->opcode() == HloOpcode::kBitcast) { - // bitcasts are inherently layout sensitive and so a bitcast instruction - // present in the IR before layout assignment is a bug. - return InternalError( - "Unexpected bitcast operation seen during layout assignment: %s.", - instruction->ToString()); - } // Some instructions carry mandatory layouts in their shape. if (instruction->opcode() != HloOpcode::kInfeed && !IsLayoutConstrainedCustomCall(instruction) && - !IsLayoutConstrainedAllReduce(instruction)) { + !IsLayoutConstrainedCollective(instruction)) { LayoutUtil::ClearLayout(instruction->mutable_shape()); } } @@ -2179,6 +2173,7 @@ bool LayoutAssignment::InstructionCanChangeLayout( case HloOpcode::kConditional: case HloOpcode::kConvert: case HloOpcode::kCos: + case HloOpcode::kAllGather: case HloOpcode::kAllToAll: case HloOpcode::kCollectivePermute: case HloOpcode::kDivide: @@ -2220,6 +2215,7 @@ bool LayoutAssignment::InstructionCanChangeLayout( case HloOpcode::kSlice: case HloOpcode::kSort: case HloOpcode::kSqrt: + case HloOpcode::kCbrt: case HloOpcode::kSubtract: case HloOpcode::kTanh: case HloOpcode::kPopulationCount: @@ -2315,6 +2311,7 @@ Status LayoutAssignment::ClearPreviousPassSideEffects(HloModule* module) { HloDCE dce; TF_RETURN_IF_ERROR(tuple_simplifier.Run(module).status()); TF_RETURN_IF_ERROR(dce.Run(module).status()); + call_graph_ = CallGraph::Build(module); } return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/layout_assignment_test.cc b/tensorflow/compiler/xla/service/layout_assignment_test.cc index 42245ca73df..6e575247e6b 100644 --- a/tensorflow/compiler/xla/service/layout_assignment_test.cc +++ b/tensorflow/compiler/xla/service/layout_assignment_test.cc @@ -814,27 +814,6 @@ TEST_F(LayoutAssignmentTest, ConditionalAsymmetricLayout) { EXPECT_THAT(false_result->opcode(), HloOpcode::kCopy); } -TEST_F(LayoutAssignmentTest, InternalErrorOnBitcast) { - auto builder = HloComputation::Builder(TestName()); - auto constant0 = builder.AddInstruction( - HloInstruction::CreateConstant(LiteralUtil::CreateR2WithLayout( - {{1.0, 2.0}, {3.0, 4.0}}, LayoutUtil::MakeLayout({0, 1})))); - builder.AddInstruction( - HloInstruction::CreateBitcast(constant0->shape(), constant0)); - auto m = CreateNewVerifiedModule(); - m->AddEntryComputation(builder.Build()); - - ComputationLayout computation_layout( - m->entry_computation()->ComputeProgramShape()); - LayoutAssignment layout_assignment(&computation_layout); - Status error_status = layout_assignment.Run(m.get()).status(); - EXPECT_FALSE(error_status.ok()); - EXPECT_THAT( - error_status.error_message(), - ::testing::HasSubstr( - "Unexpected bitcast operation seen during layout assignment")); -} - TEST_F(LayoutAssignmentTest, ChannelLayoutMismatch) { // Pin non matching layouts to parameter and root. 
const char* module_str = R"( @@ -1385,5 +1364,42 @@ ENTRY entry_computation { ExpectLayoutIs(crs->operand(1)->shape(), {1, 0}); } +TEST_F(LayoutAssignmentTest, LayoutConstrainedAllToAll) { + const char* module_str = R"( +HloModule test_module + +add { + lhs = f32[] parameter(0) + rhs = f32[] parameter(1) + ROOT add = f32[] add(lhs, rhs) +} + +ENTRY entry_computation { + param = (f32[16,4]{0,1}, f32[16,4]{1,0}) parameter(0) + gte0 = f32[16,4] get-tuple-element(param), index=0 + gte1 = f32[16,4] get-tuple-element(param), index=1 + alltoall = (f32[16,4]{1,0}, f32[16,4]{1,0}) all-reduce(gte0, gte1), + replica_groups={{0,1}}, constrain_layout=true, to_apply=add + gte2 = f32[16,4] get-tuple-element(alltoall), index=0 + gte3 = f32[16,4] get-tuple-element(alltoall), index=1 + ROOT concat = f32[16,8]{0,1} concatenate(gte2, gte3), dimensions={1} +} +)"; + + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr m, + ParseAndReturnVerifiedModule(module_str, /*replica_count=*/2)); + ComputationLayout computation_layout( + m->entry_computation()->ComputeProgramShape(), /*ignore_layouts=*/false); + + ChannelLayoutConstraints channel_constraints; + AssignLayouts(m.get(), &computation_layout, &channel_constraints); + + const HloInstruction* alltoall = FindInstruction(m.get(), "alltoall"); + ExpectTupleLayoutIs(alltoall->shape(), {{1, 0}, {1, 0}}); + ExpectLayoutIs(alltoall->operand(0)->shape(), {1, 0}); + ExpectLayoutIs(alltoall->operand(1)->shape(), {1, 0}); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/llvm_ir/BUILD b/tensorflow/compiler/xla/service/llvm_ir/BUILD index 39399df7ad8..cabcc8e06ee 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/BUILD +++ b/tensorflow/compiler/xla/service/llvm_ir/BUILD @@ -64,6 +64,7 @@ cc_library( srcs = ["llvm_util.cc"], hdrs = ["llvm_util.h"], deps = [ + "//tensorflow/compiler/xla:debug_options_flags", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:types", diff --git a/tensorflow/compiler/xla/service/llvm_ir/alias_analysis_test.cc b/tensorflow/compiler/xla/service/llvm_ir/alias_analysis_test.cc index db60e08472d..f7808773592 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/alias_analysis_test.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/alias_analysis_test.cc @@ -58,14 +58,14 @@ ENTRY while3 { CompileAndVerifyIr(hlo_string, R"( ; CHECK-LABEL: @body(i8* %retval -; CHECK: %[[add_result:.*]] = fadd fast float %[[fadd_lhs:.*]], %[[fadd_rhs:.*]] -; CHECK: store float %[[add_result]], float* %[[store_dest:.*]], !alias.scope ![[alias_scope_md_for_store:[0-9]+]] +; CHECK: %[[add_result:.*]] = fadd reassoc nsz contract float %[[fadd_lhs:.*]], %[[fadd_rhs:.*]] +; CHECK: store float %[[add_result]], float* %[[store_dest:.*]], align 4, !alias.scope ![[alias_scope_md_for_store:[0-9]+]] ; ; CHECK-LABEL: @condition(i8* %retval, i8* noalias %run_options, i8** noalias %params ; CHECK: %[[cond_state_buf_ptr:.*]] = getelementptr inbounds i8*, i8** %buffer_table, i64 0 ; CHECK: %[[cond_state_buf_untyped:.*]] = load i8*, i8** %[[cond_state_buf_ptr]] ; CHECK: %[[cond_state_buf_typed:.*]] = bitcast i8* %[[cond_state_buf_untyped]] to float* -; CHECK: load float, float* %[[cond_state_buf_typed]], !alias.scope ![[alias_scope_md_for_store]], !noalias ![[noalias_md_for_load:.*]] +; CHECK: load float, float* %[[cond_state_buf_typed]], align 4, !alias.scope ![[alias_scope_md_for_store]], !noalias ![[noalias_md_for_load:.*]] ; ; CHECK-LABEL: @while3( diff --git 
a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc index 7fbd01e1b21..0371ce71874 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc @@ -22,6 +22,7 @@ limitations under the License. #include "absl/container/flat_hash_set.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Value.h" +#include "tensorflow/compiler/xla/map_util.h" #include "tensorflow/compiler/xla/service/elemental_ir_emitter.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" @@ -43,9 +44,8 @@ using llvm_ir::IrArray; Status FusedIrEmitter::DefaultAction(const HloInstruction* hlo) { indexed_generators_[hlo] = [=](const IrArray::Index& index) -> StatusOr { - if (generated_value_cache_[hlo].contains(index.multidim())) { - llvm::Value* generated_value = - generated_value_cache_[hlo][index.multidim()]; + if (llvm::Value* generated_value = FindOrDefault( + generated_value_cache_[hlo], index.multidim(), nullptr)) { llvm::BasicBlock* generated_value_bb = nullptr; if (auto* generated_instruction = llvm::dyn_cast(generated_value)) { @@ -71,10 +71,11 @@ Status FusedIrEmitter::DefaultAction(const HloInstruction* hlo) { << b_->GetInsertBlock()->getName().str() << ")."; } - TF_ASSIGN_OR_RETURN(generated_value_cache_[hlo][index.multidim()], + TF_ASSIGN_OR_RETURN(llvm::Value* const generated_value, elemental_emitter_->MakeElementGenerator( hlo, indexed_generators_)(index)); - return generated_value_cache_[hlo][index.multidim()]; + generated_value_cache_[hlo][index.multidim()] = generated_value; + return generated_value; }; return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/llvm_ir/ir_builder_mixin.h b/tensorflow/compiler/xla/service/llvm_ir/ir_builder_mixin.h index 67e65f29005..b438906a4e2 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/ir_builder_mixin.h +++ b/tensorflow/compiler/xla/service/llvm_ir/ir_builder_mixin.h @@ -94,18 +94,6 @@ class IrBuilderMixin { fp_math_tag); } - // DEPRECATED. LLVM is removing getPointerElementType, so calls to this must - // be transitioned to one of the other overloads. - llvm::CallInst* Call(llvm::Value* callee, - llvm::ArrayRef args = llvm::None, - const llvm::Twine& name = "", - llvm::MDNode* fp_math_tag = nullptr) { - return mixin_builder()->CreateCall( - llvm::cast( - callee->getType()->getPointerElementType()), - callee, args, name, fp_math_tag); - } - template llvm::BranchInst* CondBr(Args&&... args) { return mixin_builder()->CreateCondBr(std::forward(args)...); diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc index 4c9a8d3e004..6375bf7341f 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc @@ -30,6 +30,7 @@ limitations under the License. 
#include "llvm/Support/CommandLine.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Transforms/Utils/Cloning.h" +#include "tensorflow/compiler/xla/debug_options_flags.h" #include "tensorflow/compiler/xla/layout_util.h" #include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/cpu/cpu_options.h" @@ -90,7 +91,9 @@ llvm::CallInst* EmitCallToIntrinsic( llvm::Value* EmitFloatMax(llvm::Value* lhs_value, llvm::Value* rhs_value, llvm::IRBuilder<>* b) { - if (b->getFastMathFlags().noNaNs()) { + // TODO(tpopp): Pass this information down from the HLO's ModuleConfig. + if (b->getFastMathFlags().noNaNs() || + GetDebugOptionsFromFlags().xla_cpu_enable_fast_min_max()) { auto cmp = b->CreateFCmpUGE(lhs_value, rhs_value); return b->CreateSelect(cmp, lhs_value, rhs_value); } else { @@ -103,7 +106,9 @@ llvm::Value* EmitFloatMax(llvm::Value* lhs_value, llvm::Value* rhs_value, llvm::Value* EmitFloatMin(llvm::Value* lhs_value, llvm::Value* rhs_value, llvm::IRBuilder<>* b) { - if (b->getFastMathFlags().noNaNs()) { + // TODO(tpopp): Pass this information down from the HLO's ModuleConfig. + if (b->getFastMathFlags().noNaNs() || + GetDebugOptionsFromFlags().xla_cpu_enable_fast_min_max()) { auto cmp = b->CreateFCmpULE(lhs_value, rhs_value); return b->CreateSelect(cmp, lhs_value, rhs_value); } else { @@ -287,7 +292,7 @@ llvm::AllocaInst* EmitAllocaAtFunctionEntryWithCount(llvm::Type* type, llvm::AllocaInst* alloca = b->CreateAlloca(type, element_count, AsStringRef(name)); if (alignment != 0) { - alloca->setAlignment(llvm::MaybeAlign(alignment)); + alloca->setAlignment(llvm::Align(alignment)); } return alloca; } diff --git a/tensorflow/compiler/xla/service/local_service.cc b/tensorflow/compiler/xla/service/local_service.cc index ef8ddfc1a76..c80646e0c70 100644 --- a/tensorflow/compiler/xla/service/local_service.cc +++ b/tensorflow/compiler/xla/service/local_service.cc @@ -112,6 +112,8 @@ ExecutionOptions CreateExecutionOptions( } execution_options.set_num_replicas(build_options.num_replicas()); execution_options.set_num_partitions(build_options.num_partitions()); + execution_options.set_use_spmd_partitioning( + build_options.use_spmd_partitioning()); if (build_options.has_device_assignment()) { TF_CHECK_OK(build_options.device_assignment().Serialize( execution_options.mutable_device_assignment())); diff --git a/tensorflow/compiler/xla/service/memory_space_assignment.cc b/tensorflow/compiler/xla/service/memory_space_assignment.cc index fb608df5197..44509395b6f 100644 --- a/tensorflow/compiler/xla/service/memory_space_assignment.cc +++ b/tensorflow/compiler/xla/service/memory_space_assignment.cc @@ -16,14 +16,98 @@ limitations under the License. #include "tensorflow/compiler/xla/service/memory_space_assignment.h" #include "tensorflow/compiler/xla/debug_options_flags.h" +#include "tensorflow/core/lib/math/math_util.h" namespace xla { namespace { // Define a dummy chunk for chunks that will be allocated in the default memory // space and for keeping track of number of asynchronous copies. const HeapSimulator::Chunk kDummyChunk{-1, -1}; +// This variable is used by the cost analysis in estimating how many times each +// while loop will execute. Nested loops will be assumed to have executed +// pow(kWhileExecutionCount, nesting_level) times. 
+const int kWhileExecutionCount = 5; + } // namespace +float MemorySpaceAssignmentCostAnalysis::GetAlternateMemoryBenefit( + const HloInstruction& instruction, + float elapsed_time_due_to_alternate_mem) const { + float elapsed_time_due_to_compute = + GetInstructionElapsedDueToCompute(instruction); + float elapsed_time_due_to_memory = + GetInstructionElapsedDueToMemory(instruction); + if (elapsed_time_due_to_memory > elapsed_time_due_to_compute) { + // Memory bound, return how much alternate memory is better. + int while_nest_level = CalculateWhileLoopNestLevel(&instruction); + return (elapsed_time_due_to_memory - elapsed_time_due_to_alternate_mem) * + tensorflow::MathUtil::IPow(kWhileExecutionCount, + while_nest_level); + } else { + // Compute bound, return how far off are we to memory boundedness. + return elapsed_time_due_to_memory - elapsed_time_due_to_compute; + } +} + +float MemorySpaceAssignmentCostAnalysis::GetMemoryBoundedness( + const GlobalDecreasingSizeBestFitHeap::BufferInterval& interval) const { + const HloInstruction& defining_instruction = + *interval.buffer->defining_instruction(); + float alternate_mem_benefit = GetAlternateMemoryBenefit( + defining_instruction, + GetInstructionElapsedDueToMemory(defining_instruction, + /*operand_in_alternate_mem=*/{}, + /*output_in_alternate_mem=*/true)); + for (const HloUse& use : interval.buffer->uses()) { + float use_alternate_mem_benefit = GetAlternateMemoryBenefit( + *use.instruction, + GetInstructionElapsedDueToMemory(*use.instruction, use.operand_number)); + // If the benefit is positive (memory bound), add it to this buffer's + // benefit. If the benefit is negative (compute bound), calculate the + // maximum. + if (alternate_mem_benefit > 0 && use_alternate_mem_benefit > 0) { + alternate_mem_benefit += use_alternate_mem_benefit; + } else { + alternate_mem_benefit = + std::max(alternate_mem_benefit, use_alternate_mem_benefit); + } + } + + // Get performance slowdown in seconds of prefetching current BufferInterval + // causing to other BufferIntervals. + float alternate_mem_slowdown = + GetInstructionElapsedDueToMemorySlowdown(interval.size); + + // Scale the slowdown based on the time of this buffer. We would want earlier + // buffers have lower slowdown values, because they are less likely to overlap + // with other HLOs. + // TODO(yuemmawang): We may want a piecewise function, where a lower slowdown + // for early HLOs, and full slowdown for mid-to-late HLOs. + // TODO(yuemmawang): Further in a smarter way, we want buffers overlapped with + // more HLOs have higher slowdown, and vice versa. 
+ float scale = interval.start * 1.0 / GetScheduleEndTime(); + alternate_mem_slowdown *= scale; + + return alternate_mem_benefit - alternate_mem_slowdown; +} + +int MemorySpaceAssignmentCostAnalysis::CalculateWhileLoopNestLevel( + const HloInstruction* instruction) const { + int nest_level = 0; + const HloComputation* computation = instruction->parent(); + while (!computation->IsEntryComputation()) { + auto node = call_graph_.GetNode(computation); + auto callsites = node.caller_callsites(); + CHECK_EQ(callsites.size(), 1) << "The module is not flattened!"; + auto callsite = callsites[0]; + if (callsite.instruction()->opcode() == HloOpcode::kWhile) { + ++nest_level; + } + computation = callsite.instruction()->parent(); + } + return nest_level; +} + float MemorySpaceAssignmentCostAnalysis::GetInstructionElapsedDueToCompute( const HloInstruction& instruction) const { return std::max( @@ -137,29 +221,30 @@ CostAnalysisPrefetchIntervalPicker::CostAnalysisPrefetchIntervalPicker( const MemorySpaceAssignmentCostAnalysis& cost_analysis, float min_async_copy_to_overlap_ratio, float max_async_copy_to_overlap_ratio) - : cost_analysis_(cost_analysis), + : elapsed_time_( + cost_analysis.hlo_live_range().instruction_schedule().size(), 0.0), + while_nest_level_( + cost_analysis.hlo_live_range().instruction_schedule().size(), 0), + cost_analysis_(cost_analysis), min_async_copy_to_overlap_ratio_(min_async_copy_to_overlap_ratio), max_async_copy_to_overlap_ratio_(max_async_copy_to_overlap_ratio) { instruction_schedule_ = &cost_analysis_.hlo_live_range().instruction_schedule(); - // First create a vector of elapsed times of HLO instructions. - std::vector instructions_elapsed_time(instruction_schedule_->size(), - 0.0); + // Create a vector of elapsed times and while nesting levels of HLO + // instructions. for (const auto& instruction_and_logical_time : *instruction_schedule_) { float elapsed_time = cost_analysis_.cost_analysis().optimal_seconds( *instruction_and_logical_time.first); int64 logical_time = instruction_and_logical_time.second; - if (logical_time >= instructions_elapsed_time.size()) { - instructions_elapsed_time.resize(logical_time + 1, 0.0); + if (logical_time >= elapsed_time_.size()) { + elapsed_time_.resize(logical_time + 1, 0.0); + while_nest_level_.resize(logical_time + 1, 0); } - instructions_elapsed_time[logical_time] = elapsed_time; - } - // As an optimization, create a cumulative sum vector of elapsed time. 
- float cumsum = 0.0; - for (float elapsed_time : instructions_elapsed_time) { - cumsum += elapsed_time; - elapsed_time_cumsum_.push_back(cumsum); + elapsed_time_[logical_time] = elapsed_time; + while_nest_level_[logical_time] = + cost_analysis_.CalculateWhileLoopNestLevel( + instruction_and_logical_time.first); } } @@ -233,7 +318,17 @@ bool CostAnalysisPrefetchIntervalPicker::Done() const { float CostAnalysisPrefetchIntervalPicker::GetLogicalIntervalElapsed( int64 start_time, int64 end_time) const { - return elapsed_time_cumsum_[end_time - 1] - elapsed_time_cumsum_[start_time]; + int interval_nest_level = + std::min(while_nest_level_[start_time], while_nest_level_[end_time]); + float total_elapsed = 0; + for (int i = start_time + 1; i < end_time; ++i) { + total_elapsed += + elapsed_time_[i] * + tensorflow::MathUtil::IPow( + kWhileExecutionCount, + std::max(0, while_nest_level_[i] - interval_nest_level)); + } + return total_elapsed; } std::string CostAnalysisPrefetchIntervalPicker::ToDebugString() const { @@ -255,6 +350,12 @@ std::string CostAnalysisPrefetchIntervalPicker::ToNoCopyDebugString( ", logical interval elapsed (s) = ", logical_interval_elapsed); } +absl::optional +CostAnalysisPrefetchIntervalPicker::BufferIntervalAlternateMemoryBenefit( + const GlobalDecreasingSizeBestFitHeap::BufferInterval& interval) const { + return cost_analysis_.GetMemoryBoundedness(interval); +} + std::string MemorySpaceAssignment::AllocationValue::ToString() const { std::string out = absl::StrCat("computation = ", computation()->name()); absl::StrAppend(&out, "\n position:\n"); @@ -426,7 +527,8 @@ bool AlternateMemoryBestFitHeap::IsIntervalAllowedInAlternateMemory( } bool AlternateMemoryBestFitHeap::IsUseAllowedInAlternateMemory( - const HloUse& use) const { + const AllocationValue& value, const HloUse& use) const { + const auto& instruction_schedule = hlo_live_range_.instruction_schedule(); if (use.instruction->opcode() == HloOpcode::kWhile) { HloComputation* while_body = use.instruction->while_body(); @@ -436,7 +538,6 @@ bool AlternateMemoryBestFitHeap::IsUseAllowedInAlternateMemory( HloValue* parameter_value = &alias_analysis_.dataflow_analysis().GetUniqueValueAt( while_body->parameter_instruction(0), use.operand_index); - const auto& instruction_schedule = hlo_live_range_.instruction_schedule(); int64 parameter_time = instruction_schedule.at(while_body->parameter_instruction(0)); int64 root_time = instruction_schedule.at(while_body->root_instruction()); @@ -491,10 +592,150 @@ bool AlternateMemoryBestFitHeap::IsUseAllowedInAlternateMemory( "there is a required default memory assignment."; return false; } + } else if (use.instruction->opcode() == HloOpcode::kConditional) { + // For any use of this conditional (the same value might be passed into + // multiple called computations), determine if the parameter->first use + // dependency is short. 
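A hypothetical timeline for the conditional check that follows (values are illustrative, not from the diff):

// Suppose the conditional is scheduled at t=50, the called computation's
// parameter at t=51, and the first use of that parameter that is not a
// get-tuple-element/tuple/bitcast at t=53. The parameter-to-use window
// [51, 53] is short, so CanAllocateInAlternateMemoryNoCopy() is likely to
// accept it and the operand can stay in alternate memory. If the first real
// use were instead at t=500, the window would be too long and the use would
// be forced into default memory by the code below.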
+ int64 conditional_time = instruction_schedule.at(use.instruction); + for (const HloUse& other_use : value.uses()) { + if (other_use.instruction != use.instruction) { + continue; + } + HloComputation* called_computation = + use.instruction->called_computations().at(other_use.operand_number - + 1); + const HloInstruction* parameter_instruction = + called_computation->parameter_instruction(0); + HloValue* parameter_value = + &alias_analysis_.dataflow_analysis().GetUniqueValueAt( + parameter_instruction, other_use.operand_index); + int64 parameter_time = instruction_schedule.at(parameter_instruction); + int64 min_use_time = conditional_time; + for (const HloUse& parameter_use : parameter_value->uses()) { + if (parameter_use.instruction->parent() == called_computation && + parameter_use.instruction->opcode() != + HloOpcode::kGetTupleElement && + parameter_use.instruction->opcode() != HloOpcode::kTuple && + parameter_use.instruction->opcode() != HloOpcode::kBitcast) { + min_use_time = std::min( + min_use_time, instruction_schedule.at(parameter_use.instruction)); + } + } + if (options_.prefetch_interval_picker->CanAllocateInAlternateMemoryNoCopy( + parameter_value->shape(), parameter_time, min_use_time)) { + VLOG(4) << "Conditional allocation allowed in alternate memory for " + "computation = " + << called_computation->name() + << ", parameter time = " << parameter_time + << ", min use time = " << min_use_time; + return true; + } else { + VLOG(4) << "Conditional allocation not allowed in alternate memory for " + "computation = " + << called_computation->name() + << ", parameter time = " << parameter_time + << ", min use time = " << min_use_time; + } + } + return false; } + return true; } +void AlternateMemoryBestFitHeap::AppendBufferInfoDebugString( + const GlobalDecreasingSizeBestFitHeap::BufferInterval& interval, + std::string* debug_str) const { + // Columns in buffer information: + // buffer_id: int. This value can be used to match the allocation in + // allocation information. + // buffer_name: string. + // alt_mem_benefit: float. Roughly corresponds to how much the cost analysis + // thought it would be beneficial to put this in the alternate memory. The + // higher the value, the more it is memory bound. + // size: int. In bytes. + // definition_time: int. Logical time this value was defined in the schedule. + // use_times: string. This is a semicolon-separated list of integers for all + // the use times. + // use_names: string. This is a semicolon-separated list of string + // representation of uses. + if (debug_str->empty()) { + // Append the column names. 
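For reference, a row of this buffer-information CSV might look like the following (all values hypothetical):

// buffer_id,buffer_name,alt_mem_benefit,size,definition_time,use_times,use_names
// 42,"%fusion.3",1.2e-05,16384,10,"15;27","%add.1, operand 0;%tanh.2, operand 0"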
+ absl::StrAppend(debug_str, + "buffer_id,buffer_name,alt_mem_benefit,size," + "definition_time,use_times,use_names\n"); + } + const HloBuffer& buffer = + alias_analysis_.GetBufferContainingValue(*interval.buffer); + const auto& instruction_schedule = hlo_live_range_.instruction_schedule(); + int64 definition_time = + instruction_schedule.at(interval.buffer->defining_position().instruction); + std::vector> uses; + for (const HloValue* value : buffer.values()) { + for (const HloUse& use : value->uses()) { + uses.push_back( + {instruction_schedule.at(use.instruction), use.ToString()}); + } + } + absl::c_sort(uses); + std::vector use_times; + std::vector use_names; + use_times.reserve(uses.size()); + use_names.reserve(uses.size()); + for (auto use : uses) { + use_times.push_back(use.first); + use_names.push_back(use.second); + } + + absl::StrAppend(debug_str, buffer.id(), ","); + absl::StrAppend(debug_str, "\"", interval.buffer->ToShortString(), "\","); + auto alternate_memory_benefit = + options_.prefetch_interval_picker->BufferIntervalAlternateMemoryBenefit( + interval); + absl::StrAppend( + debug_str, alternate_memory_benefit ? *alternate_memory_benefit : 0, ","); + absl::StrAppend(debug_str, interval.size, ","); + absl::StrAppend(debug_str, definition_time, ","); + absl::StrAppend(debug_str, "\"", absl::StrJoin(use_times, ";"), "\","); + absl::StrAppend(debug_str, "\"", absl::StrJoin(use_names, ";"), "\""); + absl::StrAppend(debug_str, "\n"); +} + +void AlternateMemoryBestFitHeap::AppendAllocationInfoDebugString( + const GlobalDecreasingSizeBestFitHeap::BufferInterval& interval, + const MemorySpaceAssignment::Allocation& allocation, + std::string* debug_str) const { + // Columns in allocation information: + // buffer_id: int. This value can be used the match with buffer info. + // size: int. In bytes. + // offset: int. In bytes. + // start_time: int. Logical start time of the allocation. + // end_time: int. Logical end time of the allocation. + if (debug_str->empty()) { + // Append the column names. 
+ absl::StrAppend(debug_str, "buffer_id,size,offset,start_time,end_time\n"); + } + if (allocation.memory_space() == MemorySpace::kAlternate) { + const HloBuffer& buffer = + alias_analysis_.GetBufferContainingValue(*interval.buffer); + absl::StrAppend(debug_str, buffer.id(), ","); + absl::StrAppend(debug_str, interval.size, ","); + absl::StrAppend(debug_str, allocation.chunk().offset, ","); + absl::StrAppend(debug_str, allocation.start_time(), ","); + absl::StrAppend(debug_str, allocation.end_time(), "\n"); + } +} + +void AlternateMemoryBestFitHeap::DumpIfEnabled( + absl::string_view buffer_info_str, + absl::string_view allocation_info_str) const { + if (!options_.dump_fn) { + return; + } + options_.dump_fn("bufferinfo", buffer_info_str); + options_.dump_fn("allocinfo", allocation_info_str); +} + HeapSimulator::Result AlternateMemoryBestFitHeap::Finish() { std::vector sorted_buffer_intervals = GetSortedBufferIntervals(); @@ -504,16 +745,19 @@ HeapSimulator::Result AlternateMemoryBestFitHeap::Finish() { AddInputAndOutputRequiredAssignments(); - if (VLOG_IS_ON(4)) { - VLOG(4) << "Flattened instruction sequence:"; + if (VLOG_IS_ON(3)) { + VLOG(3) << "Flattened instruction sequence:"; const auto& instruction_sequence = hlo_live_range_.flattened_instruction_sequence().instructions(); for (int i = 0; i < instruction_sequence.size(); ++i) { - VLOG(4) << " " << i << ": " << instruction_sequence[i]->parent()->name() + VLOG(3) << " " << i << ": " << instruction_sequence[i]->parent()->name() << " " << instruction_sequence[i]->name(); } } + std::string buffer_info_str; + std::string allocation_info_str; + for (auto& interval : sorted_buffer_intervals) { if (!interval.need_allocation) { continue; @@ -545,7 +789,7 @@ HeapSimulator::Result AlternateMemoryBestFitHeap::Finish() { } if (AreIntervalsReservedInAlternateMemory(colocated_intervals)) { - VLOG(4) << "Interval " << interval.buffer->ToShortString() + VLOG(3) << "Interval " << interval.buffer->ToShortString() << " is reserved in the alternate memory. Total reserved bytes = " << reserved_in_bytes_; for (const BufferInterval* colocated_interval : colocated_intervals) { @@ -554,7 +798,7 @@ HeapSimulator::Result AlternateMemoryBestFitHeap::Finish() { // alternate memory allocations will not have an entry in preset // allocations that is normally used for coloring. for (auto& position : value->positions()) { - VLOG(3) << "Coloring " << position.ToString(); + VLOG(4) << "Coloring " << position.ToString(); Shape* shape = ShapeUtil::GetMutableSubshape( position.instruction->mutable_shape(), position.index); CHECK(shape->IsArray()) << "Coloring a shape that is not an array: " @@ -586,8 +830,6 @@ HeapSimulator::Result AlternateMemoryBestFitHeap::Finish() { } const auto& instruction_schedule = hlo_live_range_.instruction_schedule(); - global_max_time_ = instruction_schedule.at( - module->entry_computation()->root_instruction()); // TODO(berkin): For now, place the phi values due to conditionals in // default memory. 
@@ -597,25 +839,19 @@ HeapSimulator::Result AlternateMemoryBestFitHeap::Finish() { if (position.instruction->opcode() == HloOpcode::kConditional) { VLOG(3) << "Adding required assignment for condition output: " << value->ToShortString(); - required_assignments_[value].push_back( - {MemorySpace::kDefault, - instruction_schedule.at(position.instruction), - /*chunk=*/absl::nullopt}); + AddRequiredAssignment(position.instruction, position.index, + MemorySpace::kDefault); for (const HloComputation* called_computation : position.instruction->called_computations()) { - HloValue* root_value = - &alias_analysis_.dataflow_analysis().GetUniqueValueAt( - called_computation->root_instruction(), position.index); - required_assignments_[root_value].push_back( - {MemorySpace::kDefault, - instruction_schedule.at( - called_computation->root_instruction()), - /*chunk=*/absl::nullopt}); + AddRequiredAssignment(called_computation->root_instruction(), + position.index, MemorySpace::kDefault); } } } } + AppendBufferInfoDebugString(interval, &buffer_info_str); + // Data structure to contain the preferred offset for a given computation. // We ensure that the same offset will be allocated outside the while loop // as well as inside the while loop. @@ -634,9 +870,13 @@ HeapSimulator::Result AlternateMemoryBestFitHeap::Finish() { } // Iterate over the uses. - for (HloUse use : allocation_value.uses()) { + for (int use_idx = 0; use_idx < allocation_value.uses().size(); + ++use_idx) { + const HloUse& use = allocation_value.uses().at(use_idx); int64 use_time = instruction_schedule.at(use.instruction); int64 latest_prefetch_time = use_time; + bool allow_no_copy_alternate_mem_allocation = true; + absl::optional earliest_prefetch_time = absl::nullopt; // Sequential calls include kWhile, kCall, and kConditional opcodes. bool is_sequential_call = @@ -672,19 +912,52 @@ HeapSimulator::Result AlternateMemoryBestFitHeap::Finish() { // interval (5-6) can be allocated separately and this buffer // doesn't waste alternate memory space within the while loop body. HloComputation* while_body = use.instruction->while_body(); + // We require while body ROOTs to be the last in the schedule. + CHECK_EQ( + instruction_schedule.at(while_body->root_instruction()) + 1, + instruction_schedule.at(use.instruction)) + << "While body ROOTs need to be the last in the schedule! " + "Please run RootInstructionSinker."; // Replace the use time with the parameter time so that we can // decide on alternate memory allocations within the while loop body // when we look at uses within the while loop body. use_time = instruction_schedule.at(while_body->parameter_instruction(0)); + } else if (use.instruction->opcode() == HloOpcode::kConditional) { + // Replace the use time with the earliest parameter of called + // computations. + for (const HloComputation* called_computation : + use.instruction->called_computations()) { + use_time = std::min( + use_time, instruction_schedule.at( + called_computation->parameter_instruction(0))); + } } } // Add a required assignment in default memory if the use not allowed in // alternate memory. 
- if (!IsUseAllowedInAlternateMemory(use)) { - required_assignments_[allocation_value.value()].push_back( - {MemorySpace::kDefault, use_time, /*chunk=*/absl::nullopt}); + if (!IsUseAllowedInAlternateMemory(allocation_value, use)) { + AddRequiredAssignment(allocation_value.value(), use.instruction, + MemorySpace::kDefault, use_time); + } else if (use_idx > 0) { + // We allow buffers in alternate memory that are passed into + // conditionals to give up their alternate memory allocation inside + // the called computation. This means that if a conditional operator + // has an alternate memory allocation, subsequent uses cannot use the + // same alternate memory allocation in order not to clobber data. So + // we force default memory allocation for these subsequent uses. + const HloUse& previous_use = allocation_value.uses().at(use_idx - 1); + if (previous_use.instruction->opcode() == HloOpcode::kConditional && + previous_use.instruction != use.instruction) { + allow_no_copy_alternate_mem_allocation = false; + earliest_prefetch_time = + instruction_schedule.at(previous_use.instruction); + VLOG(3) << "Previous use (" << previous_use.ToString() + << ") of use (" << use.ToString() + << ") is a conditional, so this use will need to evict. " + << "Earliest prefetch time = " << *earliest_prefetch_time; + } } // Bitcasts don't define buffers and don't directly consume buffers. @@ -692,10 +965,16 @@ HeapSimulator::Result AlternateMemoryBestFitHeap::Finish() { // bitcasts will be handled specially. if (use.instruction->opcode() != HloOpcode::kBitcast) { AllocationRequest request; - request.start_time = definition_time; + // Rarely, (e.g., when conditional true and false parameters are the + // same), definition time can be the time of the conditional and use + // time is the parameter use, which is less. 
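// (Illustrative schedule positions: if the conditional itself is at logical
// time 12 and the shared called-computation parameter is at time 10, then
// definition_time is 12 while use_time is 10, so the request below has to
// start at min(12, 10) = 10 rather than at the definition.)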
+ request.start_time = std::min(definition_time, use_time); request.end_time = use_time; request.latest_prefetch_time = latest_prefetch_time; request.size = interval.size; + request.allow_no_copy_alternate_mem_allocation = + allow_no_copy_alternate_mem_allocation; + request.earliest_prefetch_time = earliest_prefetch_time; request.preferred_offset = preferred_offset; request.use = use; request.allocation_value = &allocation_value; @@ -737,6 +1016,8 @@ HeapSimulator::Result AlternateMemoryBestFitHeap::Finish() { if (allocation_success) { for (AllocationValue& allocation_value : allocation_values) { for (auto& allocation : *allocation_value.allocation_sequence()) { + AppendAllocationInfoDebugString(interval, *allocation, + &allocation_info_str); allocations_->push_back(std::move(allocation)); } } @@ -746,6 +1027,12 @@ HeapSimulator::Result AlternateMemoryBestFitHeap::Finish() { pending_async_copies_.clear(); } + VLOG(3) << "Debug buffer info: "; + VLOG(3) << buffer_info_str; + VLOG(3) << "Debug allocation info: "; + VLOG(3) << allocation_info_str; + DumpIfEnabled(buffer_info_str, allocation_info_str); + return result_; } @@ -873,35 +1160,42 @@ void AlternateMemoryBestFitHeap::AddAliasedRequiredAssignment( if (aliased_allocation->memory_space() == MemorySpace::kAlternate) { chunk = aliased_allocation->chunk(); } - const auto& instruction_schedule = hlo_live_range_.instruction_schedule(); - HloValue* value = - &alias_analysis_.dataflow_analysis().GetUniqueValueAt(instruction, index); - int64 instruction_time = instruction_schedule.at(instruction); + AddRequiredAssignment(instruction, index, aliased_allocation->memory_space(), + chunk); +} + +void AlternateMemoryBestFitHeap::AddRequiredAssignment( + const HloValue* value, const HloInstruction* instruction, + MemorySpaceAssignment::MemorySpace memory_space, int64 time, + absl::optional chunk) { // Check for existing required assignment at this time and make sure it is the // same as this if there is one. - auto existing_required_assignment = - RequiredMemoryAssignmentAt(value, instruction_time); + auto existing_required_assignment = RequiredMemoryAssignmentAt(value, time); if (existing_required_assignment) { - CHECK(aliased_allocation->memory_space() == - existing_required_assignment->memory_space); + CHECK(memory_space == existing_required_assignment->memory_space) + << "inst = " << instruction->ToString() << " at " << time; CHECK((!chunk && !existing_required_assignment->chunk) || chunk->offset == existing_required_assignment->chunk->offset); - VLOG(3) << "Not adding aliased required assignment because there is one " - "already: " - << value->ToShortString() << " at " << instruction_time << " at " - << (aliased_allocation->memory_space() == MemorySpace::kDefault - ? "def" - : "alt"); - return; + VLOG(3) << "Not adding required assignment because there is one already: " + << value->ToShortString() << " at " << time << " at " + << (memory_space == MemorySpace::kDefault ? "def" : "alt"); + } else { + VLOG(3) << "Adding required assignment: " << value->ToShortString() + << " at " << time << " at " + << (memory_space == MemorySpace::kDefault ? "def" : "alt"); + required_assignments_[value].push_back({memory_space, time, chunk}); } +} - required_assignments_[value].push_back( - {aliased_allocation->memory_space(), instruction_time, chunk}); - VLOG(3) << "Adding aliased required assignment: " << value->ToShortString() - << " at " << instruction_time << " at " - << (aliased_allocation->memory_space() == MemorySpace::kDefault - ? 
"def" - : "alt"); +void AlternateMemoryBestFitHeap::AddRequiredAssignment( + const HloInstruction* instruction, ShapeIndex index, + MemorySpace memory_space, absl::optional chunk) { + const HloValue* value = + &alias_analysis_.dataflow_analysis().GetUniqueValueAt(instruction, index); + int64 instruction_time = + hlo_live_range_.instruction_schedule().at(instruction); + AddRequiredAssignment(value, instruction, memory_space, instruction_time, + chunk); } void AlternateMemoryBestFitHeap::AddInputAndOutputRequiredAssignments() { @@ -994,7 +1288,7 @@ void AlternateMemoryBestFitHeap::UncommitPendingChunks() { for (const auto& interval_and_chunk : pending_chunks_) { const BufferInterval& interval = interval_and_chunk.first; const Chunk& chunk = interval_and_chunk.second.chunk; - VLOG(4) << "Uncommitting: (" << interval.start << ", " << interval.end + VLOG(3) << "Uncommitting: (" << interval.start << ", " << interval.end << ") off = " << chunk.offset << " size = " << chunk.size; interval_tree_.Remove(interval.start, interval.end, chunk); } @@ -1101,6 +1395,7 @@ bool AlternateMemoryBestFitHeap::FindAllocation( // First try keeping the allocation entirely in the alternate memory. if (required_memory_space_at_start != MemorySpace::kDefault && required_memory_space_at_end != MemorySpace::kDefault && + request.allow_no_copy_alternate_mem_allocation && AllocateInAlternateMemoryNoCopy(request)) { return true; } @@ -1139,7 +1434,7 @@ bool AlternateMemoryBestFitHeap::FindAllocation( // If the buffer must be in default memory at the end_time, don't prefetch. if (required_memory_space_at_end == MemorySpace::kDefault) { - VLOG(4) + VLOG(3) << "Not trying to prefetch because use requires buffer in default mem."; (*prev_allocation_in_default_mem_it)->Extend(request.end_time); (*prev_allocation_in_default_mem_it)->AddUse(request.use); @@ -1183,8 +1478,10 @@ void AlternateMemoryBestFitHeap::AddAsyncCopy( // Register the additional async copy with the interval tree to keep track of // the limit at any given time. - pending_async_copies_.push_back({start_time, end_time, memory_space}); - async_copy_interval_tree_.Add(start_time, end_time, kDummyChunk); + pending_async_copies_.push_back( + {start_time, copy_done_schedule_before_time, memory_space}); + async_copy_interval_tree_.Add(start_time, copy_done_schedule_before_time, + kDummyChunk); if (memory_space == MemorySpaceAssignment::MemorySpace::kAlternate) { async_copy_ordering_.AddCopy(pending_async_copies_.back()); } @@ -1265,7 +1562,7 @@ bool AlternateMemoryBestFitHeap::AllocateInAlternateMemoryNoCopy( preferred_offset = request.preferred_offset; } - VLOG(4) << "We can eliminate copy to alternate memory. Preferred offset = " + VLOG(3) << "We can eliminate copy to alternate memory. Preferred offset = " << (preferred_offset ? *preferred_offset : -1); // In case there are additional uses after this use, we rely on the last use // time to try to reserve a chunk in the heap simulator. This is to prevent @@ -1335,6 +1632,9 @@ bool AlternateMemoryBestFitHeap::Evict(const AllocationRequest& request) { request.allocation_value->defining_position().shape(), eviction_start_time, request.end_time), eviction_end_time); + // Evictions must complete by the time of this use. 
+ preferred_eviction_end_time = + std::min(preferred_eviction_end_time, request.latest_prefetch_time); BufferInterval eviction_mem_interval; eviction_mem_interval.buffer = request.allocation_value->value(); @@ -1342,10 +1642,9 @@ bool AlternateMemoryBestFitHeap::Evict(const AllocationRequest& request) { // Try to reserve a buffer from the end of the previous allocation to the // preferred eviction end time. eviction_mem_interval.start = eviction_end_time + 1; - eviction_mem_interval.end = - std::min(preferred_eviction_end_time, global_max_time_); + eviction_mem_interval.end = preferred_eviction_end_time; int64 preferred_offset = prev_allocation->chunk().offset; - VLOG(4) << "Eviction (" << eviction_start_time << ", " << eviction_end_time + VLOG(3) << "Eviction (" << eviction_start_time << ", " << eviction_end_time << ") preferred end time = " << eviction_mem_interval.end; for (; eviction_mem_interval.end > eviction_end_time; @@ -1385,7 +1684,7 @@ bool AlternateMemoryBestFitHeap::Evict(const AllocationRequest& request) { // this interval. bool eviction_scheduled = false; for (int64 time = eviction_start_time; time < eviction_end_time; ++time) { - VLOG(3) << "Try evicting (" << time << ", " << time + 1 << ")"; + VLOG(4) << "Try evicting (" << time << ", " << time + 1 << ")"; if (!ViolatesMaximumOutstandingAsyncCopies(time, time + 1)) { VLOG(3) << "Eviction successful."; AddAsyncCopy(*prev_allocation, MemorySpace::kDefault, @@ -1428,10 +1727,15 @@ bool AlternateMemoryBestFitHeap::Prefetch( // ^ ^ // Copy Copy // Start Done - options_.prefetch_interval_picker->Begin( - request.use, prev_allocation_in_default_mem.earliest_available_time(), - request.latest_prefetch_time); - VLOG(4) << "Trying prefetch picker = " + int64 earliest_prefetch_time = + prev_allocation_in_default_mem.earliest_available_time(); + if (request.earliest_prefetch_time) { + earliest_prefetch_time = + std::max(earliest_prefetch_time, *request.earliest_prefetch_time); + } + options_.prefetch_interval_picker->Begin(request.use, earliest_prefetch_time, + request.latest_prefetch_time); + VLOG(3) << "Trying prefetch picker = " << options_.prefetch_interval_picker->ToDebugString(); // Create an alternate memory interval that starts at the earliest @@ -1446,12 +1750,12 @@ bool AlternateMemoryBestFitHeap::Prefetch( // If this additional asynchronous copy would violate the limit, try a // different interval. 
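// (For example, under a hypothetical limit of two outstanding asynchronous
// copies, a candidate whose CopyStart/CopyDone window overlaps two
// already-pending copies is rejected by the check below, and the loop simply
// moves on to the next start time suggested by the prefetch interval picker.)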
if (ViolatesMaximumOutstandingAsyncCopies(alternate_mem_interval.start, - request.end_time)) { + request.latest_prefetch_time)) { VLOG(4) << "This would violate the outstanding async copy limit."; continue; } if (ViolatesAsyncCopyOrdering(alternate_mem_interval.start, - request.end_time)) { + request.latest_prefetch_time)) { VLOG(4) << "This would violate asynchronous copy ordering."; continue; } @@ -1516,90 +1820,47 @@ AlternateMemoryBestFitHeap::FindBestChunkCandidate( return absl::nullopt; } -/*static*/ int64 MemorySpaceAssignment::CountMaximumOutstandingAsyncCopies( - const HloModule& module) { - int64 max_copies = 0; +StatusOr +MemorySpaceAssignment::CalculateAsyncCopyStats() const { + AsyncCopyStats stats; + stats.max_outstanding_async_copies = 0; + stats.num_prefetches = 0; + stats.prefetch_bytes = 0; + stats.num_evictions = 0; + stats.eviction_bytes = 0; int64 current_copies = 0; - for (HloInstruction* instruction : - module.schedule().sequence(module.entry_computation()).instructions()) { + TF_ASSIGN_OR_RETURN(std::unique_ptr dataflow_analysis, + HloDataflowAnalysis::Run(*module_)); + for (HloInstruction* instruction : module_->schedule() + .sequence(module_->entry_computation()) + .instructions()) { if (instruction->opcode() == HloOpcode::kCopyStart) { current_copies++; } else if (instruction->opcode() == HloOpcode::kCopyDone) { current_copies--; + int64 size = + options_.size_fn(dataflow_analysis->GetUniqueValueAt(instruction)); + if (instruction->shape().layout().memory_space() == + options_.alternate_memory_space) { + ++stats.num_prefetches; + stats.prefetch_bytes += size; + } else { + ++stats.num_evictions; + stats.eviction_bytes += size; + } } - max_copies = std::max(max_copies, current_copies); + stats.max_outstanding_async_copies = + std::max(stats.max_outstanding_async_copies, current_copies); } - return max_copies; + return stats; } /*static*/ MemorySpaceAssignment::BufferIntervalCompare MemorySpaceAssignment::GetMemoryBoundednessBufferIntervalCompare( const MemorySpaceAssignmentCostAnalysis& cost_analysis) { return [&](const BufferInterval& x, const BufferInterval& y) { - // Returns a heuristic value that captures how much putting this tensor to - // the alternate memory would help if the op is memory bound, or otherwise - // how far off is the op to memory boundedness. The larger this number, the - // higher priority it will be placed in the alternate memory. - auto get_alternate_mem_benefit = - [&](const HloInstruction& instruction, - float elapsed_time_due_to_alternate_mem) { - float elapsed_time_due_to_compute = - cost_analysis.GetInstructionElapsedDueToCompute(instruction); - float elapsed_time_due_to_memory = - cost_analysis.GetInstructionElapsedDueToMemory(instruction); - if (elapsed_time_due_to_memory > elapsed_time_due_to_compute) { - // Memory bound, return how much alternate memory is better. - return elapsed_time_due_to_memory - - elapsed_time_due_to_alternate_mem; - } else { - // Compute bound, return how far off are we to memory boundedness. 
- return elapsed_time_due_to_memory - elapsed_time_due_to_compute; - } - }; - - auto get_memory_boundedness = [&](const BufferInterval& interval) { - const HloInstruction& defining_instruction = - *interval.buffer->defining_instruction(); - float alternate_mem_benefit = get_alternate_mem_benefit( - defining_instruction, cost_analysis.GetInstructionElapsedDueToMemory( - defining_instruction, - /*operand_in_alternate_mem=*/{}, - /*output_in_alternate_mem=*/true)); - for (const HloUse& use : interval.buffer->uses()) { - float use_alternate_mem_benefit = get_alternate_mem_benefit( - *use.instruction, cost_analysis.GetInstructionElapsedDueToMemory( - *use.instruction, use.operand_number)); - // If the benefit is positive (memory bound), add it to this buffer's - // benefit. If the benefit is negative (compute bound), calculate the - // maximum. - if (alternate_mem_benefit > 0 && use_alternate_mem_benefit > 0) { - alternate_mem_benefit += use_alternate_mem_benefit; - } else { - alternate_mem_benefit = - std::max(alternate_mem_benefit, use_alternate_mem_benefit); - } - } - - // Get performance slowdown in seconds of prefetching current - // BufferInterval causing to other BufferIntervals. - float alternate_mem_slowdown = - cost_analysis.GetInstructionElapsedDueToMemorySlowdown(interval.size); - - // Scale the slowdown based on the time of this buffer. We would want - // earlier buffers have lower slowdown values, because they are less - // likely to overlap with other HLOs. - // TODO (yuemmawang) We may want a piecewise function, where a lower - // slowdown for early HLOs, and full slowdown for mid-to-late HLOs. - // TODO (yuemmawang) Further in a smarter way, we want buffers overlapped - // with more HLOs have higher slowdown, and vice versa. - float scale = interval.start * 1.0 / cost_analysis.GetScheduleEndTime(); - alternate_mem_slowdown *= scale; - - return alternate_mem_benefit - alternate_mem_slowdown; - }; - - float x_memory_boundedness = get_memory_boundedness(x); - float y_memory_boundedness = get_memory_boundedness(y); + float x_memory_boundedness = cost_analysis.GetMemoryBoundedness(x); + float y_memory_boundedness = cost_analysis.GetMemoryBoundedness(y); if (x_memory_boundedness != y_memory_boundedness) { return x_memory_boundedness > y_memory_boundedness; } @@ -1691,32 +1952,6 @@ FindCrossProgramPrefetchCandidate( } return *best_candidate; } - -// Finds an AllocationSequence for placing buffers in alternate memory using the -// AlternateMemoryBestFitHeap algorithm. 
-StatusOr FindAllocationSequence( - HloModule* module, const HloLiveRange& hlo_live_range, - const HloAliasAnalysis& alias_analysis, - const MemorySpaceAssignment::Options& options) { - MemorySpaceAssignment::AllocationSequence allocations; - auto algorithm = absl::make_unique( - &allocations, options, alias_analysis, hlo_live_range); - - if (options.enable_cross_program_prefetch) { - absl::optional - prefetch_candiate = FindCrossProgramPrefetchCandidate( - alias_analysis, hlo_live_range, options); - algorithm->AllocateCrossProgramPrefetchBuffer(module, prefetch_candiate); - } - - HeapSimulator::Options heap_simulator_options; - heap_simulator_options.may_reuse_operand_buffers = false; - TF_RETURN_IF_ERROR(HeapSimulator::Run(std::move(algorithm), *module, - module->schedule(), alias_analysis, - options.size_fn, heap_simulator_options) - .status()); - return std::move(allocations); -} } // namespace /*static*/ StatusOr> @@ -1725,31 +1960,64 @@ MemorySpaceAssignment::Run(HloModule* module, const HloAliasAnalysis& alias_analysis, const Options& options) { CHECK(module->has_schedule()); - VLOG(4) << "Module before memory space assignment: "; - XLA_VLOG_LINES(4, module->ToString()); - VLOG(4) << "Schedule: " << module->schedule().ToString(); + VLOG(3) << "Module before memory space assignment: "; + XLA_VLOG_LINES(3, module->ToString()); + VLOG(3) << "Schedule: " << module->schedule().ToString(); MemorySpaceAssignment memory_space_assignment(module, options, hlo_live_range); - TF_ASSIGN_OR_RETURN( - AllocationSequence allocations, - FindAllocationSequence(module, hlo_live_range, alias_analysis, options)); - memory_space_assignment.SetAllocationSequence(std::move(allocations)); - TF_RETURN_IF_ERROR(memory_space_assignment.Process()); - memory_space_assignment.ScheduleAsynchronousCopies(); - TF_RETURN_IF_ERROR(memory_space_assignment.SimplifyGraph()); - TF_RETURN_IF_ERROR(memory_space_assignment.FixSchedule()); + return memory_space_assignment.RunMemorySpaceAssignment(hlo_live_range, + alias_analysis); +} - VLOG(4) << "Module after memory space assignment: "; - XLA_VLOG_LINES(4, module->ToString()); - TF_CHECK_OK(module->schedule().Verify()); +StatusOr> +MemorySpaceAssignment::RunMemorySpaceAssignment( + const HloLiveRange& hlo_live_range, + const HloAliasAnalysis& alias_analysis) { + TF_RETURN_IF_ERROR(FindAllocationSequence(hlo_live_range, alias_analysis)); + TF_RETURN_IF_ERROR(Process()); + ScheduleAsynchronousCopies(); + TF_RETURN_IF_ERROR(SimplifyGraph()); + TF_RETURN_IF_ERROR(FixSchedule()); + TF_RETURN_IF_ERROR(ExportAndColorBuffers()); + + VLOG(3) << "Module after memory space assignment: "; + XLA_VLOG_LINES(3, module_->ToString()); + TF_CHECK_OK(module_->schedule().Verify()); + TF_ASSIGN_OR_RETURN(AsyncCopyStats stats, CalculateAsyncCopyStats()); VLOG(1) << "Maximum number of outstanding async copies: " - << CountMaximumOutstandingAsyncCopies(*module); + << stats.max_outstanding_async_copies; + VLOG(1) << "Number of prefetches: " << stats.num_prefetches + << ", in bytes: " << stats.prefetch_bytes; + VLOG(1) << "Number of evictions: " << stats.num_evictions + << ", in bytes: " << stats.eviction_bytes; - TF_RETURN_IF_ERROR( - memory_space_assignment.VerifyAndExportHeapSimulatorTrace()); + TF_RETURN_IF_ERROR(VerifyAndExportHeapSimulatorTrace()); - return std::move(memory_space_assignment.preset_assignments_); + return std::move(preset_assignments_); +} + +Status MemorySpaceAssignment::FindAllocationSequence( + const HloLiveRange& hlo_live_range, + const HloAliasAnalysis& alias_analysis) { + 
auto algorithm = absl::make_unique( + &allocations_, options_, alias_analysis, hlo_live_range); + + if (options_.enable_cross_program_prefetch) { + absl::optional + prefetch_candiate = FindCrossProgramPrefetchCandidate( + alias_analysis, hlo_live_range, options_); + algorithm->AllocateCrossProgramPrefetchBuffer(module_, prefetch_candiate); + } + + HeapSimulator::Options heap_simulator_options; + heap_simulator_options.may_reuse_operand_buffers = false; + TF_RETURN_IF_ERROR(HeapSimulator::Run(std::move(algorithm), *module_, + module_->schedule(), alias_analysis, + options_.size_fn, + heap_simulator_options) + .status()); + return Status::OK(); } void MemorySpaceAssignment::Allocation::AddUse(HloUse use) { @@ -1873,6 +2141,18 @@ HloInstruction* MemorySpaceAssignment::Allocation::AddGetTupleElements() { return producing_instruction; } +std::string MemorySpaceAssignment::Allocation::ToString() const { + return absl::StrCat("Allocation in ", + memory_space_ == MemorySpace::kDefault ? "def" : "alt", + " defined at ", defining_position_.ToString()); +} + +std::string MemorySpaceAssignment::CopyAllocation::ToString() const { + return absl::StrCat("Copy Allocation in ", + memory_space_ == MemorySpace::kDefault ? "def" : "alt", + " from ", prev_allocation_.ToString()); +} + Status MemorySpaceAssignment::CopyAllocation::Process( MemorySpaceAssignment* memory_space_assignment) { // Copy allocations need to insert asynchronous copy nodes. @@ -1917,25 +2197,29 @@ Status MemorySpaceAssignment::CopyAllocation::Process( } Status MemorySpaceAssignment::Process() { + VLOG(1) << "Processing assigned buffers..."; // Insert CopyStart/CopyDone pairs. - int64 alternate_memory_size = 0; - std::vector> position_and_chunks; for (auto& allocation : allocations_) { + VLOG(3) << "Processing: " << allocation->ToString(); TF_RETURN_IF_ERROR(allocation->Process(this)); // Add the offset and size of the allocation in the alternate memory to // the output map. 
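// (Each recorded entry is a pair of the allocation's defining HloPosition and
// its heap-simulator Chunk; the chunk offset and size are in bytes within the
// alternate memory, matching the "allocinfo" columns dumped earlier.)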
if (allocation->memory_space() == MemorySpace::kAlternate) { - position_and_chunks.emplace_back(allocation->defining_position(), - allocation->chunk()); - alternate_memory_size = - std::max(alternate_memory_size, allocation->chunk().chunk_end()); + alternate_memory_assignments_.emplace_back( + allocation->defining_position(), allocation->chunk()); + alternate_memory_size_ = + std::max(alternate_memory_size_, allocation->chunk().chunk_end()); } } + return Status::OK(); +} +Status MemorySpaceAssignment::ExportAndColorBuffers() { + VLOG(1) << "Exporting buffers..."; TF_ASSIGN_OR_RETURN(auto alias_analysis, HloAliasAnalysis::Run(module_)); absl::flat_hash_map seen_buffer_offsets; VLOG(3) << "Exported alternate memory allocations:"; - for (const auto& position_and_chunk : position_and_chunks) { + for (const auto& position_and_chunk : alternate_memory_assignments_) { const HloPosition& defining_position = position_and_chunk.first; const Chunk& chunk = position_and_chunk.second; const HloBuffer& buffer = alias_analysis->GetUniqueBufferAt( @@ -1957,7 +2241,7 @@ Status MemorySpaceAssignment::Process() { if (!preset_assignments_->chunks().empty()) { preset_assignments_ ->assignment_information_for_space(options_.alternate_memory_space) - ->size = alternate_memory_size; + ->size = alternate_memory_size_; } VLOG(3) << "Exported alternate memory sizes:"; @@ -1965,6 +2249,7 @@ Status MemorySpaceAssignment::Process() { VLOG(3) << " space: " << pair.first << ", size: " << pair.second.size; } + VLOG(1) << "Coloring buffers..."; // Color the pending positions and all of their aliased buffers. for (const auto& defining_position_and_chunk : preset_assignments_->chunks()) { @@ -1973,7 +2258,7 @@ Status MemorySpaceAssignment::Process() { defining_position.instruction, defining_position.index)) { for (auto& value : buffer->values()) { for (auto& position : value->positions()) { - VLOG(3) << "Coloring " << position.ToString(); + VLOG(4) << "Coloring " << position.ToString(); Shape* shape = ShapeUtil::GetMutableSubshape( position.instruction->mutable_shape(), position.index); CHECK(shape->IsArray()) << "Coloring a shape that is not an array: " @@ -1984,25 +2269,25 @@ Status MemorySpaceAssignment::Process() { } } } - return Status::OK(); } -void PresetAssignments::RemoveAssignmentForInstruction( +void MemorySpaceAssignment::RemoveAssignmentForInstruction( const HloInstruction* instruction) { - for (auto& position_and_chunk : chunks_) { + for (auto& position_and_chunk : alternate_memory_assignments_) { const HloPosition& position = position_and_chunk.first; if (position.instruction == instruction) { - VLOG(3) << "Removing instruction from preset assignments."; + VLOG(3) << "Removing instruction from alternate memory assignments."; // Swap the removed position and chunk with the back and pop back. - position_and_chunk = chunks_.back(); - chunks_.pop_back(); + position_and_chunk = alternate_memory_assignments_.back(); + alternate_memory_assignments_.pop_back(); break; } } } Status MemorySpaceAssignment::SimplifyGraph() { + VLOG(1) << "Simplifying graph..."; for (HloComputation* computation : module_->MakeNonfusionComputations()) { // Parallel computations aren't in the schedule and don't need to be // modified. 
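// Distilled sketch (assuming the xla namespace and the headers already used by
// this file) of the coloring step that ExportAndColorBuffers() above performs
// for every aliased position of an exported chunk. The pass does this inline;
// the standalone helper below exists only for illustration.
void ColorPositionInAlternateMem(const HloPosition& position,
                                 int64 alternate_memory_space) {
  // Descend to the subshape at this position and mark its layout as living in
  // the alternate memory space.
  Shape* shape = ShapeUtil::GetMutableSubshape(
      position.instruction->mutable_shape(), position.index);
  CHECK(shape->IsArray()) << "Coloring a shape that is not an array: "
                          << position.ToString();
  shape->mutable_layout()->set_memory_space(alternate_memory_space);
}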
@@ -2037,9 +2322,9 @@ Status MemorySpaceAssignment::SimplifyGraph() { instruction->opcode() != HloOpcode::kCopyStart && instruction->opcode() != HloOpcode::kCopyDone) { VLOG(4) << "Instruction removed: " << instruction->ToString(); - // Ensure the exported preset assignments don't contain a reference to - // the removed instruction. - preset_assignments_->RemoveAssignmentForInstruction(instruction); + // Ensure the alternate memory assignments don't contain a reference + // to the removed instruction. + RemoveAssignmentForInstruction(instruction); // Instead of deleting the instruction from the schedule, replace it // with a nullptr. This is needed because FixSchedule relies on the // logical time that is the index into flattened_instructions_ for @@ -2125,6 +2410,7 @@ void MemorySpaceAssignment::EnsureInstructionAndOperandsInserted( } void MemorySpaceAssignment::ScheduleAsynchronousCopies() { + VLOG(1) << "Scheduling asynchronous copies..."; for (MemorySpace memory_space : {MemorySpace::kDefault, MemorySpace::kAlternate}) { std::vector copy_allocations; @@ -2173,6 +2459,7 @@ void MemorySpaceAssignment::ScheduleAsynchronousCopies() { } Status MemorySpaceAssignment::FixSchedule() { + VLOG(1) << "Fixing schedule..."; CHECK(module_->has_schedule()); HloSchedule& schedule = module_->schedule(); for (const HloComputation* computation : @@ -2246,7 +2533,7 @@ Status MemorySpaceAssignment::FixSchedule() { } Status MemorySpaceAssignment::VerifyAndExportHeapSimulatorTrace() { - VLOG(3) << "Verifying:"; + VLOG(1) << "Verifying..."; TF_ASSIGN_OR_RETURN(std::unique_ptr alias_analysis, HloAliasAnalysis::Run(module_)); TF_ASSIGN_OR_RETURN(std::unique_ptr hlo_live_range, @@ -2255,10 +2542,62 @@ Status MemorySpaceAssignment::VerifyAndExportHeapSimulatorTrace() { BufferIntervalTree interval_tree; absl::flat_hash_set seen_buffers; - std::map, + // The key for events is: time, is_free, value_id. This is so that the events + // are sorted first by time, then within the same time, allocations are sorted + // earlier than frees, and finally the value id as a tie breaker. + std::map, std::tuple> events; + auto add_allocation_and_verify = [&](int64 start_time, int64 end_time, + const Chunk& chunk, + const HloValue* value) { + events[std::make_tuple(start_time, /*is_free=*/false, value->id())] = + std::make_tuple(value, chunk, HeapSimulatorTrace::Event::ALLOC); + events[std::make_tuple(end_time, /*is_free=*/true, value->id())] = + std::make_tuple(value, chunk, HeapSimulatorTrace::Event::FREE); + + // Get the chunks overlapping in time and search if they overlap in space + // as well. + // TODO(berkin): For now checking against end_time - 1 (exclusive), but we + // really should check against end_time (inclusive) for cases where the + // operand can't share buffer with user (see + // HloDataflowAnalysis::CanShareOperandBufferWithUser). + for (const Chunk& overlapping_chunk : + interval_tree.ChunksOverlappingInTime(start_time, end_time - 1)) { + if (chunk.OverlapsWith(overlapping_chunk)) { + return InternalError( + ("Value %s (%d, %d) off: %d size: %d overlaps with another chunk" + " off: %d size: %d"), + value->ToShortString(), start_time, end_time, chunk.offset, + chunk.size, overlapping_chunk.offset, overlapping_chunk.size); + } + } + interval_tree.Add(start_time, end_time - 1, chunk); + return Status::OK(); + }; + + // Go through all instructions in the module to ensure CopyStart/CopyDone + // instructions copy between alternate memory and default memory. 
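// The CopyStart result is a tuple whose element {0} is the destination buffer
// and element {1} aliases the source operand, so the loop below compares the
// layout memory_space of those two subshapes and CHECK-fails on copies that
// stay within a single memory space.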
+ for (const HloComputation* computation : + module_->MakeNonfusionComputations()) { + for (const HloInstruction* instruction : computation->instructions()) { + if (instruction->opcode() == HloOpcode::kCopyStart) { + int64 from_memory_space = + ShapeUtil::GetSubshape(instruction->shape(), {1}) + .layout() + .memory_space(); + int64 to_memory_space = + ShapeUtil::GetSubshape(instruction->shape(), {0}) + .layout() + .memory_space(); + CHECK_NE(from_memory_space, to_memory_space) + << "Asynchronous copy to the same memory space: " + << instruction->ToString(); + } + } + } + for (const auto& position_and_chunk : preset_assignments_->chunks()) { const HloPosition& position = position_and_chunk.first; const Chunk& chunk = position_and_chunk.second; @@ -2273,33 +2612,73 @@ Status MemorySpaceAssignment::VerifyAndExportHeapSimulatorTrace() { for (const HloValue* value : buffer.values()) { const HloLiveRange::TimeBound& time_bound = hlo_live_range->buffer_live_ranges().at(value); - events[std::make_pair(time_bound.start, value->id())] = - std::make_tuple(value, chunk, HeapSimulatorTrace::Event::ALLOC); - events[std::make_pair(time_bound.end, value->id())] = - std::make_tuple(value, chunk, HeapSimulatorTrace::Event::FREE); - - VLOG(3) << " buffer: " << buffer.ToString() - << " value: " << value->ToShortString() << ": (" - << time_bound.start << ", " << time_bound.end - << ") off: " << chunk.offset << ", size: " << chunk.size; - // Get the chunks overlapping in time and search if they overlap in space - // as well. - // TODO(berkin): For now checking against end_time - 1 (exclusive), but we - // really should check against end_time (inclusive) for cases where the - // operand can't share buffer with user (see - // HloDataflowAnalysis::CanShareOperandBufferWithUser). - for (const Chunk& overlapping_chunk : - interval_tree.ChunksOverlappingInTime(time_bound.start, - time_bound.end - 1)) { - if (chunk.OverlapsWith(overlapping_chunk)) { - return InternalError( - ("Buffer %s (%d, %d) off: %d size: %d overlaps with another chunk" - " off: %d size: %d"), - buffer.ToString(), time_bound.start, time_bound.end, chunk.offset, - chunk.size, overlapping_chunk.offset, overlapping_chunk.size); + const HloInstruction* last_use_instruction = nullptr; + int64 last_use_time = time_bound.start; + for (const HloUse& use : value->uses()) { + int64 use_time = + hlo_live_range->instruction_schedule().at(use.instruction); + if (use_time > last_use_time) { + last_use_time = use_time; + last_use_instruction = use.instruction; } } - interval_tree.Add(time_bound.start, time_bound.end - 1, chunk); + + if (last_use_instruction && + last_use_instruction->opcode() == HloOpcode::kConditional) { + // Special case when verifying conditional: we internally split the use + // of alternate memory in conditionals, so fish them out from the + // conditionals. 
+ VLOG(3) << " Splitting conditional buffer: " << buffer.ToString() + << " value: " << value->ToShortString() << ": (" + << time_bound.start << ", " << time_bound.end + << ") off: " << chunk.offset << ", size: " << chunk.size; + int64 earliest_computation_start_time = time_bound.end; + for (const HloComputation* called_computation : + last_use_instruction->called_computations()) { + earliest_computation_start_time = + std::min(earliest_computation_start_time, + hlo_live_range->computation_span_times() + .at(called_computation) + .start); + int64 parameter_time = -1; + int64 last_use_time = -1; + for (const HloPosition& position : value->positions()) { + if (position.instruction->opcode() == HloOpcode::kParameter && + position.instruction->parent() == called_computation) { + parameter_time = hlo_live_range->instruction_schedule().at( + position.instruction); + break; + } + } + for (const HloUse& use : value->uses()) { + if (use.instruction->parent() == called_computation) { + last_use_time = std::max( + last_use_time, + hlo_live_range->instruction_schedule().at(use.instruction)); + } + } + if (last_use_time != -1) { + CHECK_NE(parameter_time, -1); + VLOG(3) << " computation: " << called_computation->name() << ": (" + << parameter_time << ", " << last_use_time << ")"; + TF_RETURN_IF_ERROR(add_allocation_and_verify( + parameter_time, last_use_time, chunk, value)); + } + } + VLOG(3) << " from beginning until first computation: (" + << time_bound.start << ", " + << (earliest_computation_start_time - 1) << ")"; + TF_RETURN_IF_ERROR(add_allocation_and_verify( + time_bound.start, earliest_computation_start_time - 1, chunk, + value)); + } else { + VLOG(3) << " buffer: " << buffer.ToString() + << " value: " << value->ToShortString() << ": (" + << time_bound.start << ", " << time_bound.end + << ") off: " << chunk.offset << ", size: " << chunk.size; + TF_RETURN_IF_ERROR(add_allocation_and_verify( + time_bound.start, time_bound.end, chunk, value)); + } } } @@ -2310,8 +2689,10 @@ Status MemorySpaceAssignment::VerifyAndExportHeapSimulatorTrace() { int64 memory_usage = 0; int64 max_memory_usage = 0; for (const auto& event : events) { - int64 time = event.first.first; - int64 buffer_id = event.first.second; + int64 time; + bool is_free; + int64 buffer_id; + std::tie(time, is_free, buffer_id) = event.first; const HloValue* value; Chunk chunk; HeapSimulatorTrace::Event::Kind kind; @@ -2330,7 +2711,7 @@ Status MemorySpaceAssignment::VerifyAndExportHeapSimulatorTrace() { memory_usage -= chunk.size; } max_memory_usage = std::max(max_memory_usage, memory_usage); - VLOG(3) << "Memory usage: " << memory_usage << " at time: " << time; + VLOG(4) << "Memory usage: " << memory_usage << " at time: " << time; } VLOG(1) << "Max memory usage ignoring fragmentation: " << max_memory_usage; diff --git a/tensorflow/compiler/xla/service/memory_space_assignment.h b/tensorflow/compiler/xla/service/memory_space_assignment.h index aa5566b834f..cf23c792c21 100644 --- a/tensorflow/compiler/xla/service/memory_space_assignment.h +++ b/tensorflow/compiler/xla/service/memory_space_assignment.h @@ -63,12 +63,15 @@ class PresetAssignments { return assignment_info_; } - // Remove the chunks_ entry that corresponds to instruction. - void RemoveAssignmentForInstruction(const HloInstruction* instruction); + // Get debugging information. 
+ std::string buffer_info_str() const { return buffer_info_str_; } + std::string allocation_info_str() const { return allocation_info_str_; } private: std::vector> chunks_; std::vector> assignment_info_; + std::string buffer_info_str_; + std::string allocation_info_str_; }; // A wrapper class around HloCostAnalysis with additional knowledge about the @@ -79,16 +82,31 @@ class MemorySpaceAssignmentCostAnalysis { const HloCostAnalysis& cost_analysis, float async_copy_bandwidth_bytes_per_second, float alternate_mem_bandwidth_bytes_per_second, - const HloLiveRange& hlo_live_range) + const HloLiveRange& hlo_live_range, const CallGraph& call_graph) : cost_analysis_(cost_analysis), async_copy_bandwidth_bytes_per_second_( async_copy_bandwidth_bytes_per_second), alternate_mem_bandwidth_bytes_per_second_( alternate_mem_bandwidth_bytes_per_second), - hlo_live_range_(hlo_live_range) {} + hlo_live_range_(hlo_live_range), + call_graph_(call_graph) {} const HloCostAnalysis& cost_analysis() const { return cost_analysis_; } + // Returns a heuristic value that captures how much putting this tensor to the + // alternate memory would help if the op is memory bound, or otherwise how far + // off is the op to memory boundedness. The larger this number, the higher + // priority it will be placed in the alternate memory. + float GetAlternateMemoryBenefit( + const HloInstruction& instruction, + float elapsed_time_due_to_alternate_mem) const; + + // Returns a heuristic value of memory boundedness for the given + // BufferInterval. The larger this number, the higher priority it will be + // placed in the alternate memory. + float GetMemoryBoundedness( + const GlobalDecreasingSizeBestFitHeap::BufferInterval& interval) const; + // Returns the elapsed time in seconds due to compute only. float GetInstructionElapsedDueToCompute( const HloInstruction& instruction) const; @@ -124,6 +142,10 @@ class MemorySpaceAssignmentCostAnalysis { int64 GetScheduleEndTime() const; + // Returns the number of nested while loop levels this instruction resides in. + // 0 means it is not in a while loop. + int CalculateWhileLoopNestLevel(const HloInstruction* instruction) const; + const HloLiveRange& hlo_live_range() const { return hlo_live_range_; } private: @@ -131,6 +153,7 @@ class MemorySpaceAssignmentCostAnalysis { float async_copy_bandwidth_bytes_per_second_; float alternate_mem_bandwidth_bytes_per_second_; const HloLiveRange& hlo_live_range_; + const CallGraph& call_graph_; }; // Abstract base class that memory space assignment uses to pick prefetch @@ -168,6 +191,14 @@ class PrefetchIntervalPicker { virtual std::string ToNoCopyDebugString(const Shape& shape, int64 start_time, int64 end_time) const = 0; + // Prefetch interval pickers may return a value corresponding to the benefit + // of placing the BufferInterval in the alternate memory. The larger value, + // the more beneficial. 
+ virtual absl::optional BufferIntervalAlternateMemoryBenefit( + const GlobalDecreasingSizeBestFitHeap::BufferInterval& interval) const { + return absl::nullopt; + } + protected: const absl::flat_hash_map* instruction_schedule_ = nullptr; @@ -242,15 +273,19 @@ class CostAnalysisPrefetchIntervalPicker : public PrefetchIntervalPicker { std::string ToNoCopyDebugString(const Shape& shape, int64 start_time, int64 end_time) const override; + absl::optional BufferIntervalAlternateMemoryBenefit( + const GlobalDecreasingSizeBestFitHeap::BufferInterval& interval) + const override; + private: // Returns the elapsed time in seconds between the logical interval that // corresponds to the instruction schedule. float GetLogicalIntervalElapsed(int64 start_time, int64 end_time) const; - // For performance reasons, we calculate the prefix sum of the elapsed time so - // that it's efficient to find the elapsed time in seconds in any logical - // interval. - std::vector elapsed_time_cumsum_; + // For each instruction in the flattened schedule, maintain their elapsed time + // and while nesting level. + std::vector elapsed_time_; + std::vector while_nest_level_; const MemorySpaceAssignmentCostAnalysis& cost_analysis_; float min_async_copy_to_overlap_ratio_; @@ -320,6 +355,11 @@ class MemorySpaceAssignment { // buffers. bool verify = false; + // If not nullptr, this function is called to dump debugging information. + // The first argument is appended to the file name and the second argument + // is the contents of the file. + std::function dump_fn = nullptr; + // Enable prefetching buffers into preferred memory across program // boundaries bool enable_cross_program_prefetch = true; @@ -398,6 +438,8 @@ class MemorySpaceAssignment { int64 start_time() const { return start_time_; } int64 end_time() const { return end_time_; } + virtual std::string ToString() const; + protected: // Descend to the shape_index element of the tuple and replace that with // new_instruction. @@ -467,6 +509,8 @@ class MemorySpaceAssignment { copy_start_schedule_after_ = copy_start_schedule_after; } + std::string ToString() const override; + private: const Allocation& prev_allocation_; // These variables define the scheduling boundaries where CopyStart and @@ -580,14 +624,24 @@ class MemorySpaceAssignment { AllocationSequence allocation_sequence_; }; + // Statistics of asynchronous copies. + struct AsyncCopyStats { + int64 max_outstanding_async_copies; + int64 num_prefetches; + int64 prefetch_bytes; + int64 num_evictions; + int64 eviction_bytes; + }; + + virtual ~MemorySpaceAssignment() = default; + // Runs the MemorySpaceAssignment pass. static StatusOr> Run( HloModule* module, const HloLiveRange& hlo_live_range, const HloAliasAnalysis& alias_analysis, const Options& options); - // Returns the maximum number of outstanding asynchronous copies in the - // module. - static int64 CountMaximumOutstandingAsyncCopies(const HloModule& module); + // Calculates asynchronous copy statistics. + StatusOr CalculateAsyncCopyStats() const; static BufferIntervalCompare GetMemoryBoundednessBufferIntervalCompare( const MemorySpaceAssignmentCostAnalysis& cost_analysis); @@ -596,7 +650,20 @@ class MemorySpaceAssignment { // export heap simulator trace to be used by buffer_assignment. Status VerifyAndExportHeapSimulatorTrace(); - private: + protected: + // Main driver of the memory space assignment pass. 
+ virtual StatusOr> RunMemorySpaceAssignment( + const HloLiveRange& hlo_live_range, + const HloAliasAnalysis& alias_analysis); + + // Finds an AllocationSequence for placing buffers in alternate memory using + // the AlternateMemoryBestFitHeap algorithm. Must be set before Process() is + // called. + virtual Status FindAllocationSequence(const HloLiveRange& hlo_live_range, + const HloAliasAnalysis& alias_analysis); + + Options options() const { return options_; } + MemorySpaceAssignment(HloModule* module, Options options, const HloLiveRange& hlo_live_range) : module_(module), @@ -615,14 +682,9 @@ class MemorySpaceAssignment { } } - // Sets allocations_. Must be set before Process() is called. - // Uses an rvalue reference so that the caller is forced to hand over - // ownership of the AllocationSequence, e.g. - // SetAllocationSequence(std::move(my_allocation)). - void SetAllocationSequence(AllocationSequence&& allocations) { - allocations_ = std::move(allocations); - } + AllocationSequence allocations_; + private: // Process calls Process methods of the allocations after the allocations have // been finalized. Status Process(); @@ -636,6 +698,10 @@ class MemorySpaceAssignment { // FixSchedule inserts asynchronous copies in the schedule. Status FixSchedule(); + // Export the alternate memory assignments to the PresetAssignments and color + // the HLO graph with the determined memory spaces. + Status ExportAndColorBuffers(); + // Insert an instruction to the schedule, and make sure its dependencies // (operands) are already in the schedule. If not, insert these operands // before the instruction. @@ -647,12 +713,17 @@ class MemorySpaceAssignment { // corresponding CopyDones follow the same order. void ScheduleAsynchronousCopies(); + // Remove the positions and chunks associated with the instruction from + // alternate_memory_assignments_. + void RemoveAssignmentForInstruction(const HloInstruction* instruction); + HloModule* module_; Options options_; std::vector flattened_instructions_; absl::flat_hash_set computations_in_schedule_; - AllocationSequence allocations_; std::unique_ptr preset_assignments_; + std::vector> alternate_memory_assignments_; + int64 alternate_memory_size_ = 0; // These maps hold vectors of new instructions that need to be scheduled after // (or before) the instruction index in the key. FixSchedule uses these maps @@ -765,11 +836,16 @@ class AlternateMemoryBestFitHeap : public GlobalDecreasingSizeBestFitHeap { // use_times is a sorted sequence of the times of all uses. // latest_prefetch_time is the latest time we can schedule the CopyDone for a // prefetch. + // If allow_no_copy_alternate_mem_allocation is false, an eviction is forced. + // If earliest_prefetch_time is set, prefetches cannot start before this + // value. struct AllocationRequest { int64 start_time; int64 end_time; int64 latest_prefetch_time; int64 size; + bool allow_no_copy_alternate_mem_allocation; + absl::optional earliest_prefetch_time; absl::optional preferred_offset; HloUse use; MemorySpaceAssignment::AllocationValue* allocation_value; @@ -790,7 +866,8 @@ class AlternateMemoryBestFitHeap : public GlobalDecreasingSizeBestFitHeap { bool IsIntervalAllowedInAlternateMemory(const BufferInterval& interval) const; // Returns true if the use is allowed in the alternate memory. 
- bool IsUseAllowedInAlternateMemory(const HloUse& use) const; + bool IsUseAllowedInAlternateMemory(const AllocationValue& value, + const HloUse& use) const; // Given an HloValue, creates AllocationValue objects and corresponding // AllocationSequences and appends them into allocation_sequence_list_. @@ -844,6 +921,16 @@ class AlternateMemoryBestFitHeap : public GlobalDecreasingSizeBestFitHeap { const HloInstruction* instruction, ShapeIndex index, const MemorySpaceAssignment::Allocation* aliased_allocation); + // This sets a required assignment. CHECK fails if there is a conflicting + // required assignment at the same time. + void AddRequiredAssignment(const HloValue* value, + const HloInstruction* instruction, + MemorySpace memory_space, int64 time, + absl::optional chunk = absl::nullopt); + void AddRequiredAssignment(const HloInstruction* instruction, + ShapeIndex index, MemorySpace memory_space, + absl::optional chunk = absl::nullopt); + // Adds input and outputs as required assignments. void AddInputAndOutputRequiredAssignments(); @@ -889,6 +976,17 @@ class AlternateMemoryBestFitHeap : public GlobalDecreasingSizeBestFitHeap { // buffers from the interval trees. void UncommitPendingChunks(); + // Append buffer and allocation infos for debugging and dump it into a file, + // if enabled. + void AppendBufferInfoDebugString(const BufferInterval& interval, + std::string* debug_str) const; + void AppendAllocationInfoDebugString( + const BufferInterval& interval, + const MemorySpaceAssignment::Allocation& allocation, + std::string* debug_str) const; + void DumpIfEnabled(absl::string_view buffer_info_str, + absl::string_view allocation_info_str) const; + // Returns the available heap size in the alternate memory. int64 available_heap_size() const { return options_.max_size_in_bytes - reserved_in_bytes_; @@ -910,7 +1008,6 @@ class AlternateMemoryBestFitHeap : public GlobalDecreasingSizeBestFitHeap { required_assignments_; // Number of bytes reserved in alternate memory space. 
int64 reserved_in_bytes_ = 0; - int64 global_max_time_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/memory_space_assignment_test.cc b/tensorflow/compiler/xla/service/memory_space_assignment_test.cc index 2788dcf1c9e..61843b2e765 100644 --- a/tensorflow/compiler/xla/service/memory_space_assignment_test.cc +++ b/tensorflow/compiler/xla/service/memory_space_assignment_test.cc @@ -57,9 +57,10 @@ class MemorySpaceAssignmentTest : public HloTestBase, HloLiveRange::Run(module->schedule(), *alias_analysis, module->entry_computation()) .ValueOrDie(); + std::unique_ptr call_graph = CallGraph::Build(module); MemorySpaceAssignmentCostAnalysis cost_analysis( hlo_cost_analysis, kAsyncCopyBandwidth, kAlternateMemBandwidth, - *hlo_live_range); + *hlo_live_range, *call_graph); CostAnalysisPrefetchIntervalPicker prefetch_interval_picker( CostAnalysisPrefetchIntervalPicker( cost_analysis, /*min_async_copy_to_overlap_ratio=*/0.8, @@ -184,6 +185,22 @@ class MemorySpaceAssignmentTest : public HloTestBase, } } + /*static*/ int64 CountMaximumOutstandingAsyncCopies(const HloModule& module) { + int64 max_copies = 0; + int64 current_copies = 0; + for (HloInstruction* instruction : module.schedule() + .sequence(module.entry_computation()) + .instructions()) { + if (instruction->opcode() == HloOpcode::kCopyStart) { + current_copies++; + } else if (instruction->opcode() == HloOpcode::kCopyDone) { + current_copies--; + } + max_copies = std::max(max_copies, current_copies); + } + return max_copies; + } + std::unique_ptr CreateEvictAndPrefetchModule() { HloComputation::Builder builder(TestName()); Shape shape = ShapeUtil::MakeShape(F32, {2, 3}); @@ -391,8 +408,7 @@ TEST_P(MemorySpaceAssignmentTest, EvictAndPrefetchLimitAsyncCopies0) { AssignMemorySpace(module.get(), /*max_outstanding_async_copies=*/0); - EXPECT_EQ(MemorySpaceAssignment::CountMaximumOutstandingAsyncCopies(*module), - 0); + EXPECT_EQ(CountMaximumOutstandingAsyncCopies(*module), 0); } TEST_P(MemorySpaceAssignmentTest, EvictAndPrefetchLimitAsyncCopies1) { @@ -400,8 +416,7 @@ TEST_P(MemorySpaceAssignmentTest, EvictAndPrefetchLimitAsyncCopies1) { AssignMemorySpace(module.get(), /*max_outstanding_async_copies=*/1); - EXPECT_EQ(MemorySpaceAssignment::CountMaximumOutstandingAsyncCopies(*module), - 1); + EXPECT_EQ(CountMaximumOutstandingAsyncCopies(*module), 1); } TEST_P(MemorySpaceAssignmentTest, EvictAndPrefetchLimitAsyncCopies2) { @@ -409,8 +424,7 @@ TEST_P(MemorySpaceAssignmentTest, EvictAndPrefetchLimitAsyncCopies2) { AssignMemorySpace(module.get(), /*max_outstanding_async_copies=*/2); - EXPECT_EQ(MemorySpaceAssignment::CountMaximumOutstandingAsyncCopies(*module), - 2); + EXPECT_EQ(CountMaximumOutstandingAsyncCopies(*module), 2); } // TODO(berkin): This test is broken with some prefetch timing improvements. @@ -737,16 +751,17 @@ TEST_P(MemorySpaceAssignmentTest, Bitcast) { // refer to unique positions. 
HloComputation::Builder builder(TestName()); Shape shape = ShapeUtil::MakeShape(F32, {2, 3}); + Shape param_shape = ShapeUtil::MakeShape(F32, {6}); HloInstruction* p0 = builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "p0")); - HloInstruction* p1 = - builder.AddInstruction(HloInstruction::CreateParameter(1, shape, "p1")); + HloInstruction* p1 = builder.AddInstruction( + HloInstruction::CreateParameter(1, param_shape, "p1")); HloInstruction* negate = builder.AddInstruction( HloInstruction::CreateUnary(shape, HloOpcode::kNegate, p0)); - HloInstruction* bitcast = - builder.AddInstruction(HloInstruction::CreateBitcast(shape, negate)); + HloInstruction* bitcast = builder.AddInstruction( + HloInstruction::CreateBitcast(param_shape, negate)); HloInstruction* add = builder.AddInstruction( - HloInstruction::CreateBinary(shape, HloOpcode::kAdd, bitcast, p1)); + HloInstruction::CreateBinary(param_shape, HloOpcode::kAdd, bitcast, p1)); auto module = CreateNewVerifiedModule(); HloComputation* computation = module->AddEntryComputation(builder.Build()); @@ -757,6 +772,8 @@ TEST_P(MemorySpaceAssignmentTest, Bitcast) { AssignMemorySpace(module.get()); + bitcast = add->mutable_operand(0); + EXPECT_EQ(bitcast->opcode(), HloOpcode::kBitcast); EXPECT_EQ(bitcast->shape().layout().memory_space(), kAlternateMemorySpace); } @@ -1647,6 +1664,324 @@ TEST_P(MemorySpaceAssignmentTest, ControlPredecessorsBug) { AssignMemorySpace(module.get()); } +TEST_P(MemorySpaceAssignmentTest, ConditionalShouldBeAllocatedInAlternateMem) { + // Checks if simple conditionals get alternate memory allocations. + absl::string_view hlo_string = R"( + HloModule CondAllocation, is_scheduled=true + + true_computation { + p0 = (f32[3]{0}) parameter(0) + gte = f32[3]{0} get-tuple-element(p0), index=0 + ROOT neg1 = f32[3]{0} negate(gte) + } + + false_computation { + p0 = (f32[3]{0}) parameter(0) + gte = f32[3]{0} get-tuple-element(p0), index=0 + ROOT neg2 = f32[3]{0} negate(gte) + } + + ENTRY entry { + p0 = f32[3]{0} parameter(0) + p1 = pred[] parameter(1) + copy = f32[3]{0} copy(p0) + tuple = (f32[3]{0}) tuple(copy) + ROOT conditional = f32[3]{0} conditional(p1, tuple, tuple), true_computation=true_computation, false_computation=false_computation + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + AssignMemorySpace(module.get()); + + if (GetParam()) { + // Check that copy and gtes got alternate memory allocations. + auto copy = + module->GetComputationWithName("entry")->GetInstructionWithName("copy"); + EXPECT_EQ(copy->shape().layout().memory_space(), kAlternateMemorySpace); + auto neg1 = module->GetComputationWithName("true_computation") + ->GetInstructionWithName("neg1"); + auto neg1_operand = neg1->operand(0); + EXPECT_EQ(neg1_operand->shape().layout().memory_space(), + kAlternateMemorySpace); + auto neg2 = module->GetComputationWithName("false_computation") + ->GetInstructionWithName("neg2"); + auto neg2_operand = neg2->operand(0); + EXPECT_EQ(neg2_operand->shape().layout().memory_space(), + kAlternateMemorySpace); + } +} + +TEST_P(MemorySpaceAssignmentTest, ConditionalAvoidsUnnecessaryPrefetch) { + // Checks if we avoid unnecessary allocation in alternate memory if the input + // won't be used in the computation for a long time. 
+ absl::string_view hlo_string = R"( + HloModule CondAllocation, is_scheduled=true + + true_computation { + p0 = (f32[3]{0}, f32[3]{0}) parameter(0) + gte0 = f32[3]{0} get-tuple-element(p0), index=0 + neg0 = f32[3]{0} negate(gte0) + neg1 = f32[3]{0} negate(neg0) + neg2 = f32[3]{0} negate(neg1) + neg3 = f32[3]{0} negate(neg2) + neg4 = f32[3]{0} negate(neg3) + neg5 = f32[3]{0} negate(neg4) + neg6 = f32[3]{0} negate(neg5) + neg7 = f32[3]{0} negate(neg6) + neg8 = f32[3]{0} negate(neg7) + neg9 = f32[3]{0} negate(neg8) + gte1 = f32[3]{0} get-tuple-element(p0), index=1 + ROOT add = f32[3]{0} add(neg9, gte1) + } + + false_computation { + p0 = (f32[3]{0}) parameter(0) + gte = f32[3]{0} get-tuple-element(p0), index=0 + ROOT neg = f32[3]{0} negate(gte) + } + + ENTRY entry { + p0 = f32[3]{0} parameter(0) + p1 = pred[] parameter(1) + copy0 = f32[3]{0} copy(p0) + copy1 = f32[3]{0} copy(p0) + tuple0 = (f32[3]{0}, f32[3]{0}) tuple(copy0, copy1) + tuple1 = (f32[3]{0}) tuple(copy0) + ROOT conditional = f32[3]{0} conditional(p1, tuple0, tuple1), true_computation=true_computation, false_computation=false_computation + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + AssignMemorySpace(module.get()); + + if (GetParam()) { + // Check that copy1 doesn't get unnecessarily allocated in alternate mem + // (due to long negate chain in true_computation) but is prefetched before + // add. + auto copy0 = + module->GetComputationWithName("entry")->GetInstructionWithName( + "copy0"); + EXPECT_EQ(copy0->shape().layout().memory_space(), kAlternateMemorySpace); + auto copy1 = + module->GetComputationWithName("entry")->GetInstructionWithName( + "copy1"); + EXPECT_EQ(copy1->shape().layout().memory_space(), kDefaultMemorySpace); + auto add = module->GetComputationWithName("true_computation") + ->GetInstructionWithName("add"); + auto add_operand = add->operand(1); + EXPECT_EQ(add_operand->shape().layout().memory_space(), + kAlternateMemorySpace); + } +} + +TEST_P(MemorySpaceAssignmentTest, ConditionalMultiUse) { + // Make sure there is an evict when there is a conditional use followed by + // another use. + absl::string_view hlo_string = R"( + HloModule CondAllocation, is_scheduled=true + + true_computation { + p0 = (f32[3]{0}, f32[3]{0}) parameter(0) + gte0 = f32[3]{0} get-tuple-element(p0), index=0 + gte1 = f32[3]{0} get-tuple-element(p0), index=1 + add0 = f32[3]{0} add(gte0, gte1) + neg0 = f32[3]{0} negate(add0) + neg1 = f32[3]{0} negate(neg0) + neg2 = f32[3]{0} negate(neg1) + neg3 = f32[3]{0} negate(neg2) + neg4 = f32[3]{0} negate(neg3) + neg5 = f32[3]{0} negate(neg4) + neg6 = f32[3]{0} negate(neg5) + neg7 = f32[3]{0} negate(neg6) + neg8 = f32[3]{0} negate(neg7) + ROOT neg9 = f32[3]{0} negate(neg8) + } + + false_computation { + p0 = (f32[3]{0}) parameter(0) + gte = f32[3]{0} get-tuple-element(p0), index=0 + ROOT neg = f32[3]{0} negate(gte) + } + + ENTRY entry { + p0 = f32[3]{0} parameter(0) + p1 = pred[] parameter(1) + copy0 = f32[3]{0} copy(p0) + copy1 = f32[3]{0} copy(p0) + tuple0 = (f32[3]{0}, f32[3]{0}) tuple(copy0, copy1) + tuple1 = (f32[3]{0}) tuple(copy0) + conditional = f32[3]{0} conditional(p1, tuple0, tuple1), true_computation=true_computation, false_computation=false_computation + ROOT add1 = f32[3]{0} add(copy1, conditional) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + AssignMemorySpace(module.get()); + + if (GetParam()) { + // Make sure the copy1->add edge is in alternate memory. 
Before conditional, + // this should be evicted to default memory and neg uses the input from + // default memory. + auto copy1 = + module->GetComputationWithName("entry")->GetInstructionWithName( + "copy1"); + EXPECT_EQ(copy1->shape().layout().memory_space(), kAlternateMemorySpace); + auto add0 = module->GetComputationWithName("true_computation") + ->GetInstructionWithName("add0"); + auto add0_operand = add0->operand(1); + EXPECT_EQ(add0_operand->shape().layout().memory_space(), + kAlternateMemorySpace); + auto add1 = + module->GetComputationWithName("entry")->GetInstructionWithName("add1"); + auto add1_operand = add1->operand(0); + EXPECT_EQ(add1_operand->shape().layout().memory_space(), + kDefaultMemorySpace); + EXPECT_EQ(add1_operand->opcode(), HloOpcode::kCopyDone); + } +} + +TEST_P(MemorySpaceAssignmentTest, ConditionalMultiUseInWhile) { + absl::string_view hlo_string = R"( + HloModule CondAllocation, is_scheduled=true + + true_computation { + p0 = (f32[3]{0}) parameter(0) + gte = f32[3]{0} get-tuple-element(p0), index=0 + ROOT neg1 = f32[3]{0} negate(gte) + } + + false_computation { + p0 = (f32[3]{0}) parameter(0) + gte = f32[3]{0} get-tuple-element(p0), index=0 + ROOT neg2 = f32[3]{0} negate(gte) + } + + while_cond { + p0 = (f32[3]{0}, f32[3]{0}, pred[]) parameter(0) + ROOT gte = pred[] get-tuple-element(p0), index=2 + } + + while_body { + p0 = (f32[3]{0}, f32[3]{0}, pred[]) parameter(0) + gte0 = f32[3]{0} get-tuple-element(p0), index=0 + gte1 = f32[3]{0} get-tuple-element(p0), index=1 + gte2 = pred[] get-tuple-element(p0), index=2 + cond_tuple = (f32[3]{0}) tuple(gte0) + conditional = f32[3]{0} conditional(gte2, cond_tuple, cond_tuple), true_computation=true_computation, false_computation=false_computation + add = f32[3]{0} add(conditional, gte1) + neg0 = f32[3]{0} negate(add) + neg1 = f32[3]{0} negate(neg0) + ROOT tuple = (f32[3]{0}, f32[3]{0}, pred[]) tuple(gte0, neg1, gte2) + } + + ENTRY entry { + p0 = f32[3]{0} parameter(0) + p1 = pred[] parameter(1) + copy0 = f32[3]{0} copy(p0) + copy1 = f32[3]{0} copy(p0) + tuple = (f32[3]{0}, f32[3]{0}, pred[]) tuple(copy0, copy1, p1) + while = (f32[3]{0}, f32[3]{0}, pred[]) while(tuple), condition=while_cond, body=while_body + ROOT gte = f32[3]{0} get-tuple-element(while), index=1 + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + AssignMemorySpace(module.get()); + + if (GetParam()) { + // Make sure copy1/while{0}/cond_tuple{0} gets alternate memory allocation. + // This will force an eviction and a prefetch for while body root. 
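+    // Concretely, reading the matcher below inside-out, the first operand of
+    // the while-body root tuple should be fed by an eviction from alternate
+    // to default memory followed by a prefetch back into alternate memory,
+    // starting from the get-tuple-element of the while-body parameter.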
+ auto copy0 = + module->GetComputationWithName("entry")->GetInstructionWithName( + "copy0"); + EXPECT_EQ(copy0->shape().layout().memory_space(), kAlternateMemorySpace); + auto conditional = module->GetComputationWithName("while_body") + ->GetInstructionWithName("conditional"); + auto conditional_operand = conditional->operand(1); + EXPECT_EQ(ShapeUtil::GetSubshape(conditional_operand->shape(), {0}) + .layout() + .memory_space(), + kAlternateMemorySpace); + auto while_root = + module->GetComputationWithName("while_body")->root_instruction(); + auto while_root_operand = while_root->operand(0); + EXPECT_THAT( + while_root_operand, + op::AsyncCopy(kAlternateMemorySpace, kDefaultMemorySpace, + op::AsyncCopy(kDefaultMemorySpace, kAlternateMemorySpace, + op::GetTupleElement(op::Parameter(0))))); + } +} + +TEST_P(MemorySpaceAssignmentTest, NestedConditional) { + absl::string_view hlo_string = R"( + HloModule CondAllocation, is_scheduled=true + + true_computation2 { + p0 = (f32[3]{0}) parameter(0) + gte = f32[3]{0} get-tuple-element(p0), index=0 + ROOT neg1 = f32[3]{0} negate(gte) + } + + false_computation2 { + p0 = (f32[3]{0}) parameter(0) + gte = f32[3]{0} get-tuple-element(p0), index=0 + ROOT neg2 = f32[3]{0} negate(gte) + } + + true_computation1 { + p0 = (f32[3]{0}) parameter(0) + gte = f32[3]{0} get-tuple-element(p0), index=0 + slice = f32[1]{0} slice(gte), slice={[0:1]} + bitcast = f32[] bitcast(slice) + constant = f32[] constant(0.0) + compare = pred[] compare(bitcast, constant), direction=GT + ROOT conditional = f32[3]{0} conditional(compare, p0, p0), true_computation=true_computation2, false_computation=false_computation2 + } + + false_computation1 { + p0 = (f32[3]{0}) parameter(0) + gte = f32[3]{0} get-tuple-element(p0), index=0 + ROOT neg3 = f32[3]{0} negate(gte) + } + + + ENTRY entry { + p0 = f32[3]{0} parameter(0) + p1 = pred[] parameter(1) + copy = f32[3]{0} copy(p0) + tuple = (f32[3]{0}) tuple(copy) + ROOT conditional = f32[3]{0} conditional(p1, tuple, tuple), true_computation=true_computation1, false_computation=false_computation1 + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + AssignMemorySpace(module.get()); + + if (GetParam()) { + // Make sure alternate memory allocation gets propagated into both levels of + // conditional. 
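+    // Note that true_computation1 passes its tuple parameter p0 directly to
+    // the nested conditional, so the alternate-memory assignment made for
+    // copy in the entry computation has to survive two levels of called
+    // computations; the checks below inspect the operands of neg1 and neg2 in
+    // the innermost computations and of neg3 in false_computation1.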
+ auto copy = + module->GetComputationWithName("entry")->GetInstructionWithName("copy"); + EXPECT_EQ(copy->shape().layout().memory_space(), kAlternateMemorySpace); + auto neg1_operand = module->GetComputationWithName("true_computation2") + ->GetInstructionWithName("neg1") + ->operand(0); + auto neg2_operand = module->GetComputationWithName("false_computation2") + ->GetInstructionWithName("neg2") + ->operand(0); + auto neg3_operand = module->GetComputationWithName("false_computation1") + ->GetInstructionWithName("neg3") + ->operand(0); + EXPECT_EQ(neg1_operand->shape().layout().memory_space(), + kAlternateMemorySpace); + EXPECT_EQ(neg2_operand->shape().layout().memory_space(), + kAlternateMemorySpace); + EXPECT_EQ(neg3_operand->shape().layout().memory_space(), + kAlternateMemorySpace); + } +} + TEST_P(MemorySpaceAssignmentTest, RequestIdentifierShouldNotBeAllocatedInAlternateMem) { // Ensure that request identifier returned by Send/Recv HLOs are not allocated @@ -2133,7 +2468,8 @@ TEST_P(MemorySpaceAssignmentTest, NonEntryComputationSchedule3) { AssignMemorySpace(module.get(), -1, 5); } -TEST_P(MemorySpaceAssignmentTest, NonEntryComputationSchedule4) { +// TODO(berkin): This might be an incorrect input graph, investigate. +TEST_P(MemorySpaceAssignmentTest, DISABLED_NonEntryComputationSchedule4) { auto module = CreateNewVerifiedModule(); Shape shape = ShapeUtil::MakeShape(xla::F32, {2, 3}); Shape shape2 = ShapeUtil::MakeShape(xla::F32, {3, 3}); diff --git a/tensorflow/compiler/xla/service/memory_space_propagation.cc b/tensorflow/compiler/xla/service/memory_space_propagation.cc new file mode 100644 index 00000000000..80eb4017477 --- /dev/null +++ b/tensorflow/compiler/xla/service/memory_space_propagation.cc @@ -0,0 +1,67 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/memory_space_propagation.h" + +namespace xla { + +StatusOr MemorySpacePropagation::Run(HloModule* module) { + bool modified = false; + TF_ASSIGN_OR_RETURN(auto dataflow_analysis, + HloDataflowAnalysis::Run(*module)); + dataflow_analysis_ = std::move(dataflow_analysis); + + for (HloComputation* computation : module->MakeNonfusionComputations()) { + for (HloInstruction* instruction : computation->instructions()) { + if (instruction->opcode() == HloOpcode::kFusion) { + // Propagate the operand subshapes. + for (int operand_idx = 0; operand_idx < instruction->operand_count(); + ++operand_idx) { + modified |= + PropagateSubshapes(instruction->operand(operand_idx)->shape(), + instruction->fused_parameter(operand_idx)); + } + + // Propagate output subshapes. 
+          modified |= PropagateSubshapes(instruction->shape(),
+                                         instruction->fused_expression_root());
+      }
+    }
+  }
+  return modified;
+}
+
+bool MemorySpacePropagation::PropagateSubshapes(
+    const Shape& caller_shape, const HloInstruction* callee_instruction) const {
+  bool modified = false;
+  for (const ShapeUtil::IndexedShape& indexed_shape :
+       ShapeUtil::GetLeafShapes(caller_shape)) {
+    int64 memory_space = indexed_shape.shape.layout().memory_space();
+    const HloValue& value = dataflow_analysis_->GetUniqueValueAt(
+        callee_instruction, indexed_shape.index);
+
+    for (const HloPosition& position : value.positions()) {
+      Shape* shape = ShapeUtil::GetMutableSubshape(
+          position.instruction->mutable_shape(), position.index);
+      if (shape->layout().memory_space() != memory_space) {
+        shape->mutable_layout()->set_memory_space(memory_space);
+        modified = true;
+      }
+    }
+  }
+  return modified;
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/memory_space_propagation.h b/tensorflow/compiler/xla/service/memory_space_propagation.h
new file mode 100644
index 00000000000..65a1dfd14a6
--- /dev/null
+++ b/tensorflow/compiler/xla/service/memory_space_propagation.h
@@ -0,0 +1,46 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_MEMORY_SPACE_PROPAGATION_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_MEMORY_SPACE_PROPAGATION_H_
+
+#include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+
+namespace xla {
+
+// This is a legalization pass that propagates the memory spaces in the layout
+// of a fusion's operands and output into the fused computation.
+class MemorySpacePropagation : public HloModulePass {
+ public:
+  ~MemorySpacePropagation() override = default;
+  absl::string_view name() const override { return "memory-space-propagation"; }
+  StatusOr<bool> Run(HloModule* module) override;
+
+ private:
+  // Given the caller shape (operand or output) and its corresponding
+  // instruction in the fused computation (parameter or root), propagates the
+  // memory space to all the subshapes on the callee side. Returns true if the
+  // module is modified.
+  bool PropagateSubshapes(const Shape& caller_shape,
+                          const HloInstruction* callee_instruction) const;
+
+  std::unique_ptr<HloDataflowAnalysis> dataflow_analysis_;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_MEMORY_SPACE_PROPAGATION_H_
diff --git a/tensorflow/compiler/xla/service/memory_space_propagation_test.cc b/tensorflow/compiler/xla/service/memory_space_propagation_test.cc
new file mode 100644
index 00000000000..8d74958f6aa
--- /dev/null
+++ b/tensorflow/compiler/xla/service/memory_space_propagation_test.cc
@@ -0,0 +1,203 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/memory_space_propagation.h" + +#include "tensorflow/compiler/xla/service/hlo_parser.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/core/lib/core/status_test_util.h" + +namespace xla { +namespace { + +class MemorySpacePropagationTest : public HloTestBase { + public: + MemorySpacePropagationTest() + : HloTestBase(), + verifier_(/*layout_sensitive=*/false, /*allow_mixed_precision*/ false) { + } + + Status Verify(HloModule* module) { return verifier_.Run(module).status(); } + + private: + HloVerifier verifier_; +}; + +TEST_F(MemorySpacePropagationTest, NoMemorySpace) { + absl::string_view hlo_string = R"( + HloModule NoMemorySpace + + %fused_computation { + %param_1.3 = s32[1]{0:T(128)} parameter(1) + %constant.2 = s32[]{:T(128)} constant(-2147483648) + %pad.2 = s32[6]{0:T(128)} pad(s32[1]{0:T(128)} %param_1.3, s32[]{:T(128)} %constant.2), padding=0_5 + %param_2.3 = s32[5]{0:T(128)} parameter(2) + %pad.3 = s32[6]{0:T(128)} pad(s32[5]{0:T(128)} %param_2.3, s32[]{:T(128)} %constant.2), padding=1_0 + %maximum.1 = s32[6]{0:T(128)} maximum(s32[6]{0:T(128)} %pad.2, s32[6]{0:T(128)} %pad.3) + %param_0.1 = s32[6]{0:T(128)} parameter(0) + ROOT %add.0 = s32[6]{0:T(128)} add(s32[6]{0:T(128)} %maximum.1, s32[6]{0:T(128)} %param_0.1) + } + + ENTRY %entry { + %param0 = s32[6]{0:T(128)} parameter(0) + %param1 = s32[1]{0:T(128)} parameter(1) + %param2 = s32[5]{0:T(128)} parameter(2) + %arg0 = s32[6]{0:T(128)} copy(%param0) + %arg1 = s32[1]{0:T(128)} copy(%param1) + %arg2 = s32[5]{0:T(128)} copy(%param2) + %fusion = s32[6]{0:T(128)} fusion(s32[6]{0:T(128)} %arg0, s32[1]{0:T(128)} %arg1, s32[5]{0:T(128)} %arg2), kind=kLoop, calls=%fused_computation + ROOT %root = s32[6]{0:T(128)} copy(%fusion) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + MemorySpacePropagation memory_space_propagation; + EXPECT_FALSE(memory_space_propagation.Run(module.get()).ValueOrDie()); + TF_ASSERT_OK_AND_ASSIGN(auto ref, ParseAndReturnVerifiedModule(hlo_string)); + EXPECT_EQ(module->Hash(), ref->Hash()); +} + +TEST_F(MemorySpacePropagationTest, NonTupleOutput) { + absl::string_view hlo_string = R"( + HloModule NonTupleOutput + + %fused_computation { + %param_1.3 = s32[1]{0:T(128)} parameter(1) + %constant.2 = s32[]{:T(128)} constant(-2147483648) + %pad.2 = s32[6]{0:T(128)} pad(s32[1]{0:T(128)} %param_1.3, s32[]{:T(128)} %constant.2), padding=0_5 + %param_2.3 = s32[5]{0:T(128)} parameter(2) + %pad.3 = s32[6]{0:T(128)} pad(s32[5]{0:T(128)} %param_2.3, s32[]{:T(128)} %constant.2), padding=1_0 + %maximum.1 = s32[6]{0:T(128)} maximum(s32[6]{0:T(128)} %pad.2, s32[6]{0:T(128)} %pad.3) + %param_0.1 = s32[6]{0:T(128)} parameter(0) + ROOT %add.0 = s32[6]{0:T(128)} add(s32[6]{0:T(128)} %maximum.1, s32[6]{0:T(128)} %param_0.1) + } + + ENTRY %entry { + %param0 = s32[6]{0:T(128)} parameter(0) + %param1 = 
s32[1]{0:T(128)} parameter(1) + %param2 = s32[5]{0:T(128)} parameter(2) + %arg0 = s32[6]{0:T(128)S(1)} copy(%param0) + %arg1 = s32[1]{0:T(128)} copy(%param1) + %arg2 = s32[5]{0:T(128)S(1)} copy(%param2) + %fusion = s32[6]{0:T(128)S(1)} fusion(s32[6]{0:T(128)S(1)} %arg0, s32[1]{0:T(128)} %arg1, s32[5]{0:T(128)S(1)} %arg2), kind=kLoop, calls=%fused_computation + ROOT %root = s32[6]{0:T(128)} copy(%fusion) + } + )"; + absl::string_view expected_hlo_string = R"( + HloModule NonTupleOutput + + %fused_computation { + %param_1.3 = s32[1]{0:T(128)} parameter(1) + %constant.2 = s32[]{:T(128)} constant(-2147483648) + %pad.2 = s32[6]{0:T(128)} pad(s32[1]{0:T(128)} %param_1.3, s32[]{:T(128)} %constant.2), padding=0_5 + %param_2.3 = s32[5]{0:T(128)S(1)} parameter(2) + %pad.3 = s32[6]{0:T(128)} pad(s32[5]{0:T(128)} %param_2.3, s32[]{:T(128)} %constant.2), padding=1_0 + %maximum.1 = s32[6]{0:T(128)} maximum(s32[6]{0:T(128)} %pad.2, s32[6]{0:T(128)} %pad.3) + %param_0.1 = s32[6]{0:T(128)S(1)} parameter(0) + ROOT %add.0 = s32[6]{0:T(128)S(1)} add(s32[6]{0:T(128)} %maximum.1, s32[6]{0:T(128)} %param_0.1) + } + + ENTRY %entry { + %param0 = s32[6]{0:T(128)} parameter(0) + %param1 = s32[1]{0:T(128)} parameter(1) + %param2 = s32[5]{0:T(128)} parameter(2) + %arg0 = s32[6]{0:T(128)S(1)} copy(%param0) + %arg1 = s32[1]{0:T(128)} copy(%param1) + %arg2 = s32[5]{0:T(128)S(1)} copy(%param2) + %fusion = s32[6]{0:T(128)S(1)} fusion(s32[6]{0:T(128)S(1)} %arg0, s32[1]{0:T(128)} %arg1, s32[5]{0:T(128)S(1)} %arg2), kind=kLoop, calls=%fused_computation + ROOT %root = s32[6]{0:T(128)} copy(%fusion) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnUnverifiedModule(hlo_string)); + MemorySpacePropagation memory_space_propagation; + EXPECT_TRUE(memory_space_propagation.Run(module.get()).ValueOrDie()); + TF_EXPECT_OK(Verify(module.get())); + TF_ASSERT_OK_AND_ASSIGN(auto ref, + ParseAndReturnVerifiedModule(expected_hlo_string)); + EXPECT_EQ(module->Hash(), ref->Hash()); +} + +TEST_F(MemorySpacePropagationTest, TupleOutput) { + absl::string_view hlo_string = R"( + HloModule TupleOutput + + %fused_computation { + %param_1.3 = s32[1]{0:T(128)} parameter(1) + %constant.2 = s32[]{:T(128)} constant(-2147483648) + %pad.2 = s32[6]{0:T(128)} pad(s32[1]{0:T(128)} %param_1.3, s32[]{:T(128)} %constant.2), padding=0_5 + %param_2.3 = s32[5]{0:T(128)} parameter(2) + %pad.3 = s32[6]{0:T(128)} pad(s32[5]{0:T(128)} %param_2.3, s32[]{:T(128)} %constant.2), padding=1_0 + %maximum.1 = s32[6]{0:T(128)} maximum(s32[6]{0:T(128)} %pad.2, s32[6]{0:T(128)} %pad.3) + %param_0.1 = s32[6]{0:T(128)} parameter(0) + %add.0 = s32[6]{0:T(128)} add(s32[6]{0:T(128)} %maximum.1, s32[6]{0:T(128)} %param_0.1) + %multiply.0 = s32[6]{0:T(128)} multiply(s32[6]{0:T(128)} %maximum.1, s32[6]{0:T(128)} %param_0.1) + ROOT %tuple = (s32[6]{0:T(128)}, s32[6]{0:T(128)}) tuple(%add.0, %multiply.0) + } + + ENTRY %entry { + %param0 = s32[6]{0:T(128)} parameter(0) + %param1 = s32[1]{0:T(128)} parameter(1) + %param2 = s32[5]{0:T(128)} parameter(2) + %arg0 = s32[6]{0:T(128)S(1)} copy(%param0) + %arg1 = s32[1]{0:T(128)} copy(%param1) + %arg2 = s32[5]{0:T(128)S(1)} copy(%param2) + %fusion = (s32[6]{0:T(128)S(1)}, s32[6]{0:T(128)}) fusion(s32[6]{0:T(128)S(1)} %arg0, s32[1]{0:T(128)} %arg1, s32[5]{0:T(128)S(1)} %arg2), kind=kLoop, calls=%fused_computation + %gte0 = s32[6]{0:T(128)S(1)} get-tuple-element(%fusion), index=0 + %gte1 = s32[6]{0:T(128)} get-tuple-element(%fusion), index=1 + ROOT %root = s32[6]{0:T(128)} add(%gte0, %gte1) + } + )"; + absl::string_view 
expected_hlo_string = R"( + HloModule TupleOutput + + %fused_computation { + %param_1.3 = s32[1]{0:T(128)} parameter(1) + %constant.2 = s32[]{:T(128)} constant(-2147483648) + %pad.2 = s32[6]{0:T(128)} pad(s32[1]{0:T(128)} %param_1.3, s32[]{:T(128)} %constant.2), padding=0_5 + %param_2.3 = s32[5]{0:T(128)S(1)} parameter(2) + %pad.3 = s32[6]{0:T(128)} pad(s32[5]{0:T(128)} %param_2.3, s32[]{:T(128)} %constant.2), padding=1_0 + %maximum.1 = s32[6]{0:T(128)} maximum(s32[6]{0:T(128)} %pad.2, s32[6]{0:T(128)} %pad.3) + %param_0.1 = s32[6]{0:T(128)S(1)} parameter(0) + %add.0 = s32[6]{0:T(128)S(1)} add(s32[6]{0:T(128)} %maximum.1, s32[6]{0:T(128)} %param_0.1) + %multiply.0 = s32[6]{0:T(128)} multiply(s32[6]{0:T(128)} %maximum.1, s32[6]{0:T(128)} %param_0.1) + ROOT %tuple = (s32[6]{0:T(128)S(1)}, s32[6]{0:T(128)}) tuple(%add.0, %multiply.0) + } + + ENTRY %entry { + %param0 = s32[6]{0:T(128)} parameter(0) + %param1 = s32[1]{0:T(128)} parameter(1) + %param2 = s32[5]{0:T(128)} parameter(2) + %arg0 = s32[6]{0:T(128)S(1)} copy(%param0) + %arg1 = s32[1]{0:T(128)} copy(%param1) + %arg2 = s32[5]{0:T(128)S(1)} copy(%param2) + %fusion = (s32[6]{0:T(128)S(1)}, s32[6]{0:T(128)}) fusion(s32[6]{0:T(128)S(1)} %arg0, s32[1]{0:T(128)} %arg1, s32[5]{0:T(128)S(1)} %arg2), kind=kLoop, calls=%fused_computation + %gte0 = s32[6]{0:T(128)S(1)} get-tuple-element(%fusion), index=0 + %gte1 = s32[6]{0:T(128)} get-tuple-element(%fusion), index=1 + ROOT %root = s32[6]{0:T(128)} add(%gte0, %gte1) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnUnverifiedModule(hlo_string)); + MemorySpacePropagation memory_space_propagation; + EXPECT_TRUE(memory_space_propagation.Run(module.get()).ValueOrDie()); + TF_EXPECT_OK(Verify(module.get())); + TF_ASSERT_OK_AND_ASSIGN(auto ref, + ParseAndReturnVerifiedModule(expected_hlo_string)); + EXPECT_EQ(module->Hash(), ref->Hash()); +} + +} // namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/service/mlir_gpu/BUILD b/tensorflow/compiler/xla/service/mlir_gpu/BUILD index abeeb866e8c..07655a61074 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/BUILD +++ b/tensorflow/compiler/xla/service/mlir_gpu/BUILD @@ -5,6 +5,7 @@ load( "//tensorflow/core/platform/default:cuda_build_defs.bzl", "if_cuda_is_configured", ) +load("//tensorflow:tensorflow.bzl", "tf_cc_binary") package( default_visibility = [":friends"], @@ -58,11 +59,26 @@ cc_library( cc_library( name = "mlir_compiler", - srcs = if_cuda_is_configured(["mlir_compiler.cc"]), - hdrs = if_cuda_is_configured(["mlir_compiler.h"]), - deps = if_cuda_is_configured([ + srcs = ["mlir_compiler.cc"], + hdrs = ["mlir_compiler.h"], + deps = [ ":emission_context", + "//tensorflow/compiler/xla/service:compiler", + "//tensorflow/compiler/xla/service/gpu:target_constants", + "//tensorflow/core:stream_executor_no_cuda", + "@llvm-project//llvm:core", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:LLVMDialect", + ], +) + +cc_library( + name = "mlir_compiler_impl", + srcs = if_cuda_is_configured(["mlir_compiler_impl.cc"]), + deps = if_cuda_is_configured([ + ":mlir_compiler", ":failover_compiler", + ":emission_context", ":kernel_lowering", ":lhlo_dialect_emitter", "@com_google_absl//absl/container:flat_hash_map", @@ -76,7 +92,6 @@ cc_library( "@llvm-project//mlir:TargetNVVMIR", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla/service:buffer_assignment", - "//tensorflow/compiler/xla/service:compiler", "//tensorflow/compiler/xla/service:dump", "//tensorflow/compiler/xla/service:hlo", 
"//tensorflow/compiler/xla/service/gpu:gpu_constants", @@ -92,7 +107,6 @@ cc_library( "//tensorflow/compiler/xla/service/gpu/llvm_gpu_backend", "//tensorflow/core:cuda_libdevice_path", "//tensorflow/core:lib", - "//tensorflow/stream_executor:stream_executor_headers", "//tensorflow/stream_executor/gpu:asm_compiler", ]), alwayslink = True, # Contains compiler registration @@ -159,6 +173,7 @@ cc_library( "//tensorflow/compiler/xla:util", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/memory", + "@llvm-project//mlir:Affine", "@llvm-project//mlir:AffineToStandardTransforms", "@llvm-project//mlir:CFGTransforms", "@llvm-project//mlir:GPUDialect", @@ -170,38 +185,57 @@ cc_library( "@llvm-project//mlir:LinalgOps", "@llvm-project//mlir:LinalgToLLVM", "@llvm-project//mlir:LinalgTransforms", - "@llvm-project//mlir:LoopOps", - "@llvm-project//mlir:LoopsToGPUPass", "@llvm-project//mlir:NVVMDialect", "@llvm-project//mlir:Pass", + "@llvm-project//mlir:SCFDialect", + "@llvm-project//mlir:SCFToGPUPass", + "@llvm-project//mlir:SCFTransforms", "@llvm-project//mlir:StandardOps", + "@llvm-project//mlir:Support", "@llvm-project//mlir:Transforms", ], ) cc_library( - name = "mlir_irgen_test_base", + name = "xla_gpu_opt_lib", testonly = True, - srcs = if_cuda_is_configured(["mlir_irgen_test_base.cc"]), - hdrs = if_cuda_is_configured(["mlir_irgen_test_base.h"]), + srcs = ["xla_gpu_opt.cc"], + hdrs = ["xla_gpu_opt.h"], + tags = ["no_pip"], deps = [ - ":emission_context", ":failover_compiler", ":inject_errors_pass", ":mlir_compiler", - "//tensorflow/compiler/mlir/xla:hlo_utils", + "//tensorflow/compiler/xla:debug_options_flags", "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:status", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla/service:backend", "//tensorflow/compiler/xla/service:hlo_module_config", - "//tensorflow/compiler/xla/tests:codegen_test_base", - "//tensorflow/compiler/xla/tests:filecheck", "//tensorflow/compiler/xla/tests:verified_hlo_module", "//tensorflow/core:lib", - "//tensorflow/core:test", - "//tensorflow/core/platform:resource_loader", - "//tensorflow/core/platform:test", - "@com_google_absl//absl/memory", + "//tensorflow/stream_executor/lib", + "@com_google_absl//absl/strings", "@llvm-project//llvm:support", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", ], ) + +tf_cc_binary( + name = "xla-gpu-opt", + testonly = True, + srcs = ["xla_gpu_opt_main.cc"], + tags = ["no_pip"], + deps = [ + ":mlir_compiler", + ":xla_gpu_opt_lib", + "//tensorflow/compiler/mlir:init_mlir", + "//tensorflow/compiler/xla:status", + "//tensorflow/compiler/xla/service:gpu_plugin_mlir", + "//tensorflow/core:lib", + "@llvm-project//llvm:support", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Support", + ], +) diff --git a/tensorflow/compiler/xla/service/mlir_gpu/experimental/conv_emitter/conv_emitter.cc b/tensorflow/compiler/xla/service/mlir_gpu/experimental/conv_emitter/conv_emitter.cc index c17d686f7dc..36cf37e4044 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/experimental/conv_emitter/conv_emitter.cc +++ b/tensorflow/compiler/xla/service/mlir_gpu/experimental/conv_emitter/conv_emitter.cc @@ -34,6 +34,7 @@ limitations under the License. 
#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project #include "mlir/IR/AffineExpr.h" // from @llvm-project #include "mlir/IR/AffineMap.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/Transforms/LoopUtils.h" // from @llvm-project #include "mlir/Transforms/RegionUtils.h" // from @llvm-project @@ -45,6 +46,8 @@ namespace xla { namespace mlir_gpu { namespace { +using mlir::OpBuilder; + // Various extracted information for input shapes. struct ShapeInfo { // Buffer dimensions in the order of NCHW. @@ -93,7 +96,8 @@ ShapeInfo GetShapeInfo( } shape_info.affine_map = mlir::AffineMap::get( - /*dimCount=*/2 + spatial_dims.size(), /*symbolCount=*/0, affine_exprs); + /*dimCount=*/2 + spatial_dims.size(), /*symbolCount=*/0, affine_exprs, + builder.getContext()); shape_info.element_type = [&] { switch (shape.element_type()) { @@ -154,7 +158,7 @@ mlir::Operation* HoistAndFix(llvm::iplist::iterator begin_op, CHECK(std::next(begin_op) == end_op) << "alloc() needs to be hoisted by its own"; - mlir::OpBuilder builder(where); + OpBuilder builder(where); mlir::MemRefType type = alloc.getType(); CHECK(type.getAffineMaps().empty()); ancestor_dimensions.insert(ancestor_dimensions.end(), @@ -178,7 +182,7 @@ mlir::Operation* HoistAndFix(llvm::iplist::iterator begin_op, affine_map.operands.size(), builder.getContext()); mlir::Operation* new_op = - CloneWithNewAffineMap(owner, affine_map, mlir::OpBuilder(owner)); + CloneWithNewAffineMap(owner, affine_map, OpBuilder(owner)); SetMemRef(new_op, new_alloc); owner->replaceAllUsesWith(new_op); owner->erase(); @@ -198,13 +202,13 @@ mlir::Operation* HoistAndFix(llvm::iplist::iterator begin_op, }(); if (any_op_is_loop_variant) { - auto builder = mlir::OpBuilder(where); + auto builder = OpBuilder(where); std::vector new_loops; for (auto dim : ancestor_dimensions) { auto where = builder.create(builder.getUnknownLoc(), 0, dim); new_loops.push_back(where); - builder = where.getBodyBuilder(); + builder = OpBuilder::atBlockTerminator(where.getBody()); } for (mlir::Operation& op : llvm::make_early_inc_range(llvm::make_range(begin_op, end_op))) { @@ -244,7 +248,7 @@ StatusOr CreateNaiveMlirConv( mlir::Value input, mlir::Value filter, mlir::Value output, const ShapeInfo& input_shape_info, const ShapeInfo& filter_shape_info, const ShapeInfo& output_shape_info, const Window& window, - mlir::OpBuilder builder) { + OpBuilder builder) { CHECK(input_shape_info.element_type == builder.getF16Type()); CHECK(filter_shape_info.element_type == builder.getF16Type()); CHECK(output_shape_info.element_type == builder.getF16Type()); @@ -254,7 +258,8 @@ StatusOr CreateNaiveMlirConv( std::vector cartesian_product_loops = CreateNestedSimpleLoops(output_shape_info.nchw_dimensions, builder); - builder = cartesian_product_loops.back().getBodyBuilder(); + builder = + OpBuilder::atBlockTerminator(cartesian_product_loops.back().getBody()); mlir::AllocOp output_acc = builder.create( location, mlir::MemRefType::get({}, builder.getF32Type())); @@ -284,7 +289,7 @@ StatusOr CreateNaiveMlirConv( int num_spatial_dims = output_spatial_indvars.size(); CHECK_EQ(num_spatial_dims, filter_spatial_indvars.size()); - builder = reduction_loops.back().getBodyBuilder(); + builder = OpBuilder::atBlockTerminator(reduction_loops.back().getBody()); mlir::Value loaded_input = [&] { std::vector input_indices; @@ -315,9 +320,9 @@ StatusOr CreateNaiveMlirConv( builder.createOrFold( location, input, 
mlir::AffineMap(input_shape_info.affine_map) - .compose( - mlir::AffineMap::get(/*dimCount=*/2 + num_spatial_dims * 2, - /*symbolCount=*/0, input_indices)), + .compose(mlir::AffineMap::get( + /*dimCount=*/2 + num_spatial_dims * 2, + /*symbolCount=*/0, input_indices, builder.getContext())), input_vars), builder.getF32Type()); }(); @@ -523,7 +528,7 @@ StatusOr TransformMlirConv( StatusOr EmitConvolutionForwardAsMlir( HloInstruction* conv, absl::string_view function_name, mlir::MLIRContext* context) { - mlir::OpBuilder builder(context); + OpBuilder builder(context); const auto& dim_nums = conv->convolution_dimension_numbers(); ShapeInfo input_shape_info = diff --git a/tensorflow/compiler/xla/service/mlir_gpu/experimental/conv_emitter/conv_emitter_test.cc b/tensorflow/compiler/xla/service/mlir_gpu/experimental/conv_emitter/conv_emitter_test.cc index 56684b1f726..d5cad385324 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/experimental/conv_emitter/conv_emitter_test.cc +++ b/tensorflow/compiler/xla/service/mlir_gpu/experimental/conv_emitter/conv_emitter_test.cc @@ -18,7 +18,7 @@ limitations under the License. #include #include "llvm/Support/raw_ostream.h" -#include "mlir/Conversion/LoopToStandard/ConvertLoopToStandard.h" // from @llvm-project +#include "mlir/Conversion/SCFToStandard/SCFToStandard.h" // from @llvm-project #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h" // from @llvm-project #include "mlir/IR/Location.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project diff --git a/tensorflow/compiler/xla/service/mlir_gpu/experimental/conv_emitter/conv_emitter_transforms.cc b/tensorflow/compiler/xla/service/mlir_gpu/experimental/conv_emitter/conv_emitter_transforms.cc index 045d06c9c86..86ada25793d 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/experimental/conv_emitter/conv_emitter_transforms.cc +++ b/tensorflow/compiler/xla/service/mlir_gpu/experimental/conv_emitter/conv_emitter_transforms.cc @@ -24,6 +24,8 @@ limitations under the License. 
namespace xla { namespace mlir_gpu { +using mlir::OpBuilder; + BoundAffineMap GetBoundAffineMapFrom(mlir::Operation* op) { if (auto load = mlir::dyn_cast(op)) { return {load.getAffineMap(), @@ -40,7 +42,7 @@ BoundAffineMap GetBoundAffineMapFrom(mlir::Operation* op) { mlir::Operation* CloneWithNewAffineMap(mlir::Operation* op, BoundAffineMap new_affine, - mlir::OpBuilder builder) { + OpBuilder builder) { if (auto load = mlir::dyn_cast(op)) { return builder.create( builder.getUnknownLoc(), load.getMemRef(), new_affine.affine_map, @@ -62,20 +64,20 @@ bool IsSimpleLoop(mlir::AffineForOp loop) { } std::vector CreateNestedSimpleLoops( - absl::Span upper_bounds, mlir::OpBuilder builder) { + absl::Span upper_bounds, OpBuilder builder) { std::vector loops; loops.reserve(upper_bounds.size()); for (int64_t dim : upper_bounds) { auto loop = builder.create(builder.getUnknownLoc(), 0, dim); loops.push_back(loop); - builder = loop.getBodyBuilder(); + builder = OpBuilder::atBlockTerminator(loop.getBody()); } return loops; } void SetBoundForSimpleLoop(mlir::AffineForOp loop, mlir::AffineExpr new_bound, - mlir::OpBuilder builder) { + OpBuilder builder) { CHECK(IsSimpleLoop(loop)); loop.setUpperBoundMap(mlir::AffineMap::get( @@ -93,7 +95,7 @@ mlir::AffineForOp TileLoop(mlir::AffineForOp loop, int64_t size, CHECK(absl::c_linear_search(all_loops, target)); } - auto builder = target.getBodyBuilder(); + auto builder = OpBuilder::atBlockTerminator(target.getBody()); auto inner_loop = builder.create(builder.getUnknownLoc(), 0, size); @@ -127,8 +129,7 @@ mlir::AffineForOp TileLoop(mlir::AffineForOp loop, int64_t size, } affine_map.affine_map = affine_map.affine_map.replaceDimsAndSymbols( replacements, {}, affine_map.operands.size(), 0); - auto new_op = - CloneWithNewAffineMap(owner, affine_map, mlir::OpBuilder(owner)); + auto new_op = CloneWithNewAffineMap(owner, affine_map, OpBuilder(owner)); owner->replaceAllUsesWith(new_op); owner->erase(); } diff --git a/tensorflow/compiler/xla/service/mlir_gpu/hlo_dialect_emitter.cc b/tensorflow/compiler/xla/service/mlir_gpu/hlo_dialect_emitter.cc index 0a2c15b3b27..33550273bf5 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/hlo_dialect_emitter.cc +++ b/tensorflow/compiler/xla/service/mlir_gpu/hlo_dialect_emitter.cc @@ -58,6 +58,8 @@ StatusOr InsertMlirOp(HloOpcode opcode, OpBuilder func_builder, return {func_builder.create(loc, rets, args, attrs)}; case HloOpcode::kCeil: return {func_builder.create(loc, rets, args, attrs)}; + case HloOpcode::kComplex: + return {func_builder.create(loc, rets, args, attrs)}; case HloOpcode::kCopy: return {func_builder.create(loc, rets, args, attrs)}; case HloOpcode::kCos: @@ -66,6 +68,8 @@ StatusOr InsertMlirOp(HloOpcode opcode, OpBuilder func_builder, return {func_builder.create(loc, rets, args, attrs)}; case HloOpcode::kExp: return {func_builder.create(loc, rets, args, attrs)}; + case HloOpcode::kImag: + return {func_builder.create(loc, rets, args, attrs)}; case HloOpcode::kLog: return {func_builder.create(loc, rets, args, attrs)}; case HloOpcode::kMaximum: @@ -76,6 +80,8 @@ StatusOr InsertMlirOp(HloOpcode opcode, OpBuilder func_builder, return {func_builder.create(loc, rets, args, attrs)}; case HloOpcode::kNegate: return {func_builder.create(loc, rets, args, attrs)}; + case HloOpcode::kReal: + return {func_builder.create(loc, rets, args, attrs)}; case HloOpcode::kRemainder: return {func_builder.create(loc, rets, args, attrs)}; case HloOpcode::kRsqrt: diff --git a/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc 
b/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc index 5b684c075bb..4645b084eb6 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc +++ b/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc @@ -15,22 +15,25 @@ limitations under the License. #include "tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.h" -#include - #include "absl/memory/memory.h" #include "mlir/Conversion/AffineToStandard/AffineToStandard.h" // from @llvm-project #include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h" // from @llvm-project #include "mlir/Conversion/LinalgToLLVM/LinalgToLLVM.h" // from @llvm-project -#include "mlir/Conversion/LoopToStandard/ConvertLoopToStandard.h" // from @llvm-project -#include "mlir/Conversion/LoopsToGPU/LoopsToGPUPass.h" // from @llvm-project +#include "mlir/Conversion/SCFToGPU/SCFToGPUPass.h" // from @llvm-project +#include "mlir/Conversion/SCFToStandard/SCFToStandard.h" // from @llvm-project +#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h" // from @llvm-project #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h" // from @llvm-project +#include "mlir/Dialect/Affine/IR/AffineOps.h" // from @llvm-project #include "mlir/Dialect/GPU/GPUDialect.h" // from @llvm-project +#include "mlir/Dialect/GPU/ParallelLoopMapper.h" // from @llvm-project #include "mlir/Dialect/GPU/Passes.h" // from @llvm-project #include "mlir/Dialect/LLVMIR/LLVMDialect.h" // from @llvm-project #include "mlir/Dialect/LLVMIR/NVVMDialect.h" // from @llvm-project #include "mlir/Dialect/Linalg/IR/LinalgOps.h" // from @llvm-project #include "mlir/Dialect/Linalg/Passes.h" // from @llvm-project -#include "mlir/Dialect/LoopOps/LoopOps.h" // from @llvm-project +#include "mlir/Dialect/SCF/Passes.h" // from @llvm-project +#include "mlir/Dialect/SCF/SCF.h" // from @llvm-project +#include "mlir/Dialect/SCF/Transforms.h" // from @llvm-project #include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/BlockAndValueMapping.h" // from @llvm-project @@ -42,8 +45,11 @@ limitations under the License. #include "mlir/IR/Region.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Transforms/BufferPlacement.h" // from @llvm-project #include "mlir/Transforms/DialectConversion.h" // from @llvm-project +#include "mlir/Transforms/LoopUtils.h" // from @llvm-project #include "mlir/Transforms/Passes.h" // from @llvm-project +#include "mlir/Transforms/RegionUtils.h" // from @llvm-project #include "tensorflow/compiler/mlir/xla/ir/lhlo_ops.h" #include "tensorflow/compiler/mlir/xla/transforms/passes.h" #include "tensorflow/compiler/mlir/xla/transforms/rewriters.h" @@ -55,34 +61,6 @@ namespace { using ::mlir::xla_lhlo::FusionOp; -// Following are some small transformations that are required to clean up code -// after lowering from linalg to loops. - -// A simple pass that applies lowering of HLO to LHLO only within LHLO ops that -// contain regions with HLO ops, e.g. FusionOp, ReduceOp, SelectAndScatterOp. -// This is needed, as these ops are not closed from above and hence nested pass -// managers can not be applied. 
-struct NestedHloRegionsConverter - : public mlir::PassWrapper { - void runOnFunction() override { - auto& ctx = getContext(); - mlir::OwningRewritePatternList patterns; - mlir::ConversionTarget target(ctx); - target.addLegalDialect<::mlir::xla_lhlo::XlaLhloDialect>(); - ::mlir::xla_hlo::populateHLOToLHLOConversionPattern(&ctx, &patterns); - - getFunction().walk([&](mlir::Operation* op) { - if (op->getNumRegions() == 0) { - return; - } - if (failed(applyPartialConversion(op, target, patterns, nullptr))) { - signalPassFailure(); - } - }); - } -}; - // Replaces a FusionOp by the operations contained in its region. struct FusionOpRemover : public mlir::PassWrapper { @@ -104,84 +82,37 @@ struct FusionOpRemover } }; -// Rewrite the single-trip loops we get out of linalg into just their bodies. -// TODO(herhut): Make this a general pattern. -struct SingleTripLoopRemoval - : public mlir::PassWrapper { - void runOnFunction() override { - auto getConstantValue = [](mlir::Value value) -> llvm::Optional { - auto definingOp = value.getDefiningOp(); - if (!definingOp) return llvm::None; - auto constantOp = llvm::dyn_cast(definingOp); - if (!constantOp) return llvm::None; - auto integer = constantOp.getValue().dyn_cast(); - if (!integer) return llvm::None; - return integer.getInt(); - }; - getFunction().walk([&](mlir::loop::ForOp forOp) { - auto lower = getConstantValue(forOp.lowerBound()); - auto upper = getConstantValue(forOp.upperBound()); - auto step = getConstantValue(forOp.step()); - if (!lower || !upper || !step) return; - if ((lower.getValue() < upper.getValue()) && - (lower.getValue() + step.getValue() >= upper.getValue())) { - // This loop has a single trip, so we can move the body in front. - mlir::BlockAndValueMapping mapping; - mlir::OpBuilder b(forOp); - mapping.map(forOp.getInductionVar(), forOp.lowerBound()); - for (auto& nested_op : forOp.getBody()->without_terminator()) { - auto clone = b.clone(nested_op, mapping); - for (auto pair : - llvm::zip(nested_op.getResults(), clone->getResults())) { - mapping.map(std::get<0>(pair), std::get<1>(pair)); - } - } - forOp.erase(); - } - }); - } -}; - // Simple pass that replaces a load that immediately follows a store to the // same address with the stored value. This needs generalization. struct StoreForwardingPass - : mlir::PassWrapper { - void runOnFunction() override { - llvm::DenseMap memrefToAllocOp; - - getFunction().walk([&](mlir::LoadOp loadOp) { - auto* block = loadOp.getOperation()->getBlock(); - auto loadOpIt = std::find_if(block->rbegin(), block->rend(), - [&loadOp](mlir::Operation& other) { - return &other == loadOp.getOperation(); - }); - for (auto storeOpIt = loadOpIt; storeOpIt != block->rend(); ++storeOpIt) { - auto storeOp = llvm::dyn_cast(&*(storeOpIt)); - if (!storeOp) { - continue; - } - mlir::Operation* storeOpAlloc = - GetAllocOp(storeOp.memref(), &memrefToAllocOp); - mlir::Operation* loadOpAlloc = - GetAllocOp(loadOp.memref(), &memrefToAllocOp); - if (!storeOpAlloc || !loadOpAlloc || storeOpAlloc != loadOpAlloc) { - continue; - } - auto storeIndices = storeOp.getIndices(); - auto loadIndices = loadOp.getIndices(); - if (!std::equal(storeIndices.begin(), storeIndices.end(), - loadIndices.begin(), loadIndices.end())) { - return; - } - loadOp.replaceAllUsesWith(storeOp.getValueToStore()); - loadOp.erase(); - return; + : mlir::PassWrapper { + mlir::StoreOp findStore(mlir::Operation* op, + std::function matches) { + // Search from op upwards in the current block. 
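+    // If no matching store is found in this block and the block belongs to an
+    // scf.parallel loop, the search continues from that loop in its enclosing
+    // block (see below), so a store hoisted out of the loop can still be
+    // forwarded to a load inside it.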
+ mlir::Block* block = op->getBlock(); + auto startFromIt = + std::find_if(block->rbegin(), block->rend(), + [op](mlir::Operation& other) { return &other == op; }); + for (auto storeOpIt = startFromIt; storeOpIt != block->rend(); + ++storeOpIt) { + auto storeOp = llvm::dyn_cast(&*(storeOpIt)); + if (!storeOp || !matches(storeOp)) { + continue; } - }); - }; - // Recursively checks defining ops until finds AllocOp. Return either AllocOp - // if it is found or nullptr. + return storeOp; + } + // No store operation found. Continue search outside of the parallel + // loop if block is in a parallel loop. + if (auto parallelOp = + llvm::dyn_cast(block->getParentOp())) { + return findStore(parallelOp.getOperation(), matches); + } + return {}; + } + + // Recursively search defining ops for AllocOp. Return either AllocOp if it is + // found or nullptr. mlir::Operation* SearchAllocOp(mlir::Value memref) { mlir::Operation* defOp = memref.getDefiningOp(); while (auto subviewOp = mlir::dyn_cast_or_null(defOp)) { @@ -205,6 +136,31 @@ struct StoreForwardingPass memrefToAllocOp->insert({memref, allocOp}); return allocOp; } + + void runOnFunction() override { + llvm::DenseMap memrefToAllocOp; + + getFunction().walk([&](mlir::LoadOp loadOp) { + auto storeOp = findStore(loadOp, [&](mlir::StoreOp storeOp) { + mlir::Operation* storeOpAlloc = + GetAllocOp(storeOp.memref(), &memrefToAllocOp); + mlir::Operation* loadOpAlloc = + GetAllocOp(loadOp.memref(), &memrefToAllocOp); + return storeOpAlloc && loadOpAlloc && (storeOpAlloc == loadOpAlloc); + }); + if (!storeOp) { + return; + } + auto storeIndices = storeOp.getIndices(); + auto loadIndices = loadOp.getIndices(); + if (!std::equal(storeIndices.begin(), storeIndices.end(), + loadIndices.begin(), loadIndices.end())) { + return; + } + loadOp.replaceAllUsesWith(storeOp.getValueToStore()); + loadOp.erase(); + }); + } }; // Simple pass that removes temporary buffers that are only written to but @@ -237,69 +193,247 @@ struct DeadTempBufferRemoval return true; } - void recursiveErase(mlir::Operation* op) { + void recursiveErase(mlir::Operation* op, + llvm::SmallVectorImpl* erase_list) { for (auto result : op->getResults()) { for (auto user : llvm::make_early_inc_range(result.getUsers())) { - recursiveErase(user); + recursiveErase(user, erase_list); } } - op->erase(); + erase_list->push_back(op); } void runOnFunction() override { - llvm::SmallVector opsToErase; + llvm::SmallVector dead_ops; getFunction().walk([&](mlir::AllocOp allocOp) { if (!operationConsideredDead(allocOp)) { return; } - opsToErase.push_back(allocOp); - }); - - for (auto* op : opsToErase) { // TODO(herhut): There should be a generic helper for this. - recursiveErase(op); + recursiveErase(allocOp, &dead_ops); + }); + for (auto op : dead_ops) { + op->erase(); } } }; -void EnableIRPrinting(mlir::PassManager* passManager) { - auto enable_if_vlog_is_on = [](mlir::Pass* pass, mlir::Operation* op) { - return VLOG_IS_ON(1); - }; - passManager->enableIRPrinting(/*shouldPrintBeforePass=*/enable_if_vlog_is_on, - /*shouldPrintAfterPass=*/{}, - /*printModuleScope=*/false, - /*printAfterOnlyOnChange=*/true, llvm::dbgs()); - passManager->disableMultithreading(); -} +// TODO(herhut): Move this to MLIR core. 
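+// Moves scalar computations that are defined above a gpu.launch but only used
+// inside of its region into the launch, e.g. (illustrative IR, not taken from
+// a real module):
+//
+//   %c = constant 4 : index     // defined above the launch, used only inside
+//   gpu.launch ... {
+//     ... uses %c ...
+//   }
+//
+// After the pass, the launch body contains its own clone of %c, so the value
+// no longer has to be captured and later outlined as a kernel argument.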
+struct MoveScalarComputationsIntoGpuLaunch + : mlir::PassWrapper { + static bool isInliningBeneficiary(mlir::Operation* op) { + return llvm::isa(op) || llvm::isa(op) || + llvm::isa(op) || llvm::isa(op); + } + static bool extractBeneficiaryOps( + mlir::Operation* op, llvm::SmallVectorImpl* ops, + llvm::SetVector args) { + if (!isInliningBeneficiary(op)) { + return false; + } + + ops->push_back(op); + for (auto operand : op->getOperands()) { + // It is an existing arg, keep going. + if (args.count(operand)) { + continue; + } + mlir::Operation* definingOp = operand.getDefiningOp(); + if (!definingOp || !extractBeneficiaryOps(definingOp, ops, args)) { + return false; + } + } + return true; + } + + static void inlineOperationsIntoLaunch(mlir::gpu::LaunchOp launch) { + llvm::SetVector used_above; + mlir::getUsedValuesDefinedAbove(launch.body(), used_above); + mlir::BlockAndValueMapping inlined_map; + for (mlir::Value v : used_above) { + llvm::SmallVector ops_to_move; + mlir::Operation* definingOp = v.getDefiningOp(); + if (definingOp && + extractBeneficiaryOps(definingOp, &ops_to_move, used_above)) { + mlir::OpBuilder b(launch.body()); + for (mlir::Operation* op : llvm::reverse(ops_to_move)) { + auto result = b.clone(*op, inlined_map); + for (auto pair : llvm::zip(op->getResults(), result->getResults())) { + mlir::replaceAllUsesInRegionWith(std::get<0>(pair), + std::get<1>(pair), launch.body()); + } + inlined_map.map(op->getResults(), result->getResults()); + } + } + } + } + + void runOnFunction() override { + mlir::FuncOp fun = getFunction(); + fun.walk( + [](mlir::gpu::LaunchOp launch) { inlineOperationsIntoLaunch(launch); }); + } +}; + +// TODO(herhut): Make this a proper thing. +struct FixKernelFunctionSignatures + : mlir::PassWrapper { + void runOnFunction() override { + mlir::FuncOp func = getFunction(); + mlir::ModuleOp module = func.getParentOfType(); + getFunction().walk([&](mlir::gpu::LaunchFuncOp launchOp) { + mlir::gpu::GPUFuncOp kernel = + module.lookupSymbol(launchOp.kernel()); + // Compute a map from function arguments to kernel function operands. + mlir::BlockAndValueMapping func_to_kernel; + for (mlir::BlockArgument arg : func.getArguments()) { + for (int i = 0, e = launchOp.getNumKernelOperands(); i < e; ++i) { + if (launchOp.getKernelOperand(i) == arg) { + func_to_kernel.map(arg, kernel.getArgument(i)); + break; + } + } + } + + // Create a new kernel function with modified signature. We know that it + // will have the same signature as the original function, so just reuse it + // here. + auto gpu_module = kernel.getParentOfType(); + mlir::OpBuilder kernel_builder(gpu_module.body()); + auto new_kernel = kernel_builder.create( + kernel.getLoc(), kernel.getName(), func.getType()); + new_kernel.setAttr(mlir::gpu::GPUDialect::getKernelFuncAttrName(), + kernel_builder.getUnitAttr()); + + // Create a map from old kernel argument to new one. + mlir::BlockAndValueMapping old_kernel_to_new; + for (int i = 0, e = kernel.getNumFuncArguments(); i < e; ++i) { + mlir::Value func_arg = func.getArgument(i); + mlir::Value new_kernel_arg = new_kernel.getArgument(i); + mlir::Value old_kernel_arg = func_to_kernel.lookupOrNull(func_arg); + if (!old_kernel_arg) { + kernel.emitOpError() + << "argument " << i + << "to kernel is not an argument to the containing function"; + signalPassFailure(); + return; + } + old_kernel_to_new.map(old_kernel_arg, new_kernel_arg); + } + // Steal the body by appending the blocks and inserting a branch. 
+ kernel.body().cloneInto(&new_kernel.getBody(), old_kernel_to_new); + kernel_builder.setInsertionPointToEnd(&new_kernel.body().front()); + kernel_builder.create( + new_kernel.getLoc(), &*std::next(new_kernel.body().begin())); + // Now create a new launchOp calling the new kernel. We can just forward + // the arguments of the function to the launch, as we fixed the + // signature. + mlir::OpBuilder launch_builder(launchOp); + launch_builder.create( + launchOp.getLoc(), new_kernel, launchOp.getGridSizeOperandValues(), + launchOp.getBlockSizeOperandValues(), func.getArguments()); + // Launch does not have results, so we can just erase it. And the kernel + // also needs to go. + launchOp.erase(); + kernel.erase(); + }); + } +}; + +// Extract_element(xla_hlo_scalars_to_dimension_tensor(v_i), i) -> v_i +// +// We need to direct fusion to the inner loops. This cannot be done with +// a passmanager alone ATM, as nested pass managers require operations to +// be closed from above. +struct MapParallelLoops + : public mlir::PassWrapper { + void runOnFunction() override { + mlir::greedilyMapParallelSCFToGPU(getFunction().getBody()); + } +}; + +// We need to direct fusion to the inner loops. This cannot be done with +// a passmanager alone ATM, as nested pass managers require operations to +// be closed from above. +struct FuseInnerParallelLoops + : public mlir::PassWrapper { + void runOnFunction() override { + getFunction().walk([](mlir::scf::ParallelOp op) { + mlir::scf::naivelyFuseParallelOps(op.region()); + }); + } +}; + +// Collapse all loop dimension into the first one. +struct ParallelLoopCollapsingToFirstDim + : public mlir::PassWrapper> { + void runOnOperation() override { + mlir::Operation* module = getOperation(); + + module->walk([&](mlir::scf::ParallelOp op) { + unsigned num_loops = op.getNumLoops(); + std::vector combinedLoops; + combinedLoops.reserve(num_loops); + for (unsigned i = 0; i < num_loops; ++i) { + combinedLoops.push_back(i); + } + mlir::collapseParallelLoops(op, {combinedLoops}); + }); + } +}; } // namespace -Status LowerLHLOToGPU(mlir::ModuleOp module) { +Status LowerLHLOToGPU(mlir::ModuleOp module, + llvm::ArrayRef tile_sizes, + llvm::ArrayRef unroll_factors, + bool collapseParallelLoops) { mlir::PassManager pm(module.getContext()); - EnableIRPrinting(&pm); + applyPassManagerCLOptions(pm); - // First, lower bodies of lhlo operations that contain hlo ops. - pm.addPass(absl::make_unique()); + // We have to anticipate later unrolling in tiling to make sure that we get + // the requested tiling after unrolling. Compute the new tiling here if + // needed. + llvm::SmallVector tiling_for_unrolling; + llvm::SmallVector as_int64; + if (!unroll_factors.empty()) { + tiling_for_unrolling.reserve(tile_sizes.size()); + for (auto pair : llvm::zip(tile_sizes, unroll_factors)) { + tiling_for_unrolling.push_back(std::get<0>(pair) * std::get<1>(pair)); + as_int64.push_back(std::get<1>(pair)); + } + } else { + tiling_for_unrolling.append(tile_sizes.begin(), tile_sizes.end()); + } + + // Legalize from HLO to LHLO. + pm.addPass(::mlir::xla_hlo::createLegalizeToLhloPass()); + // Moving `AllocOp`s and inserting missing `DeallocOp`s + pm.addPass(::mlir::createBufferPlacementPass()); // Next, we can strip the outer fusion operation. pm.addPass(absl::make_unique()); - // Remove unnecessary Lhlo copies. + // Remove unnecessary LHLO copies. pm.addPass(::mlir::xla_lhlo::createLhloCopyRemovalPass()); - // Transform lhlo operations to LinAlg. + // Transform LHLO operations to LinAlg. 
   pm.addPass(::mlir::xla_lhlo::createLegalizeLhloToLinalgPass());
-  // Fuse linalg operations. This will yield a single tiled loop nest where
-  // the inner loops are single trip.
-  pm.addPass(::mlir::xla_lhlo::createLhloFuseLinalg());
+  // Fuse linalg operations.
+  // TODO(herhut): Make tiling configurable.
+  pm.addPass(::mlir::xla_lhlo::createLhloFuseLinalg(/*use_parallel_loops=*/true,
+                                                    tiling_for_unrolling));
   // Legalize reduce operations directly to GPU dialect.
   pm.addPass(::mlir::xla_lhlo::createLegalizeToGpuPass());
-  // Fuse linalg operations. This will yield a single tiled loop nest where
-  // Go from linalg to normal loops.
-  pm.addPass(::mlir::createConvertLinalgToLoopsPass());
-  // Canonicalize the code to simplify index computations.
+  // Transform the Linalg operations inside of the loop nest into parallel
+  // loops.
+  pm.addPass(::mlir::createConvertLinalgToParallelLoopsPass());
+  // Canonicalize the code to simplify index computations. This is needed so
+  // that loop bounds have the same value.
   pm.addNestedPass<::mlir::FuncOp>(::mlir::createCanonicalizerPass());
-  // The innermost loops will be single-trip.
-  pm.addPass(absl::make_unique<SingleTripLoopRemoval>());
+  pm.addNestedPass<::mlir::FuncOp>(::mlir::createCSEPass());
+  // Fuse the inner-most loops.
+  pm.addPass(absl::make_unique<FuseInnerParallelLoops>());
   // Run CSE to ensure that loads and stores to the same subview get
   // recognized as such.
   pm.addNestedPass<::mlir::FuncOp>(::mlir::createCSEPass());
@@ -307,17 +441,30 @@
   pm.addPass(absl::make_unique<StoreForwardingPass>());
   // Remove now unused temporary buffers.
   pm.addPass(absl::make_unique<DeadTempBufferRemoval>());
-  // Coalesce generated loops to have 1d loops.
-  pm.addPass(::mlir::createLoopCoalescingPass());
-  // Transform the now 1d loops to gpu launches.
-  pm.addPass(::mlir::createSimpleLoopsToGPUPass(/*numBlockDims=*/0,
-                                                /*numThreadDims=*/1));
+  if (!unroll_factors.empty()) {
+    pm.addPass(::mlir::createParallelLoopTilingPass(as_int64));
+  }
+  // Project all loop dimensions to X if necessary.
+  if (collapseParallelLoops) {
+    pm.addPass(absl::make_unique<ParallelLoopCollapsingToFirstDim>());
+  }
   // Some basic cleanup.
   pm.addNestedPass<::mlir::FuncOp>(::mlir::createCanonicalizerPass());
   pm.addNestedPass<::mlir::FuncOp>(::mlir::createCSEPass());
+  // Greedily map the remaining loop to GPU hardware dimensions.
+  pm.addPass(absl::make_unique<MapParallelLoops>());
+  // Apply the mapping.
+  pm.addPass(mlir::createParallelLoopToGpuPass());
+  // Some basic cleanup.
+  pm.addNestedPass<::mlir::FuncOp>(::mlir::createCanonicalizerPass());
+  pm.addNestedPass<::mlir::FuncOp>(::mlir::createCSEPass());
+  // Move scalar operations into the launch to ensure smaller signatures.
+  pm.addPass(absl::make_unique<MoveScalarComputationsIntoGpuLaunch>());
   // Take launches to launches with kernels.
   pm.addPass(::mlir::createGpuKernelOutliningPass());
-
+  // Make sure the kernel signature resembles the original function's
+  // signature.
+  pm.addPass(absl::make_unique<FixKernelFunctionSignatures>());
   if (failed(pm.run(module))) {
     return InternalError("Lowering to GPU kernels failed.");
   }
@@ -357,12 +504,12 @@ class LowerToNVVMPass
   }
 };
 
-}  // anonymous namespace
+}  // namespace
 
 Status LowerKernelBodiesToNVVM(mlir::ModuleOp module) {
   // We cannot verify as the signature of the kernel is rewritten.
   ::mlir::PassManager pm(module.getContext(), /*verifyPasses=*/false);
-  EnableIRPrinting(&pm);
+  applyPassManagerCLOptions(pm);
 
   // Rewrite kernel functions to LLVM IR.
auto& kernelPm = pm.nest<::mlir::gpu::GPUModuleOp>(); diff --git a/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.h b/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.h index 8a8882cab30..ab045808477 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.h +++ b/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.h @@ -23,7 +23,10 @@ limitations under the License. namespace xla { namespace mlir_gpu { -Status LowerLHLOToGPU(mlir::ModuleOp module); +Status LowerLHLOToGPU(mlir::ModuleOp module, + llvm::ArrayRef tile_sizes = {16, 64}, + llvm::ArrayRef unroll_factors = {}, + bool collapseParallelLoops = true); Status LowerKernelBodiesToNVVM(mlir::ModuleOp module); diff --git a/tensorflow/compiler/xla/service/mlir_gpu/lhlo_dialect_emitter.cc b/tensorflow/compiler/xla/service/mlir_gpu/lhlo_dialect_emitter.cc index 3c90d27587f..6e26d8556e7 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/lhlo_dialect_emitter.cc +++ b/tensorflow/compiler/xla/service/mlir_gpu/lhlo_dialect_emitter.cc @@ -77,6 +77,9 @@ Status InsertMlirOp(HloOpcode opcode, OpBuilder func_builder, Location loc, case HloOpcode::kCeil: func_builder.create(loc, rets, args, attrs); break; + case HloOpcode::kComplex: + func_builder.create(loc, rets, args, attrs); + break; case HloOpcode::kCopy: func_builder.create(loc, rets, args, attrs); break; @@ -89,6 +92,9 @@ Status InsertMlirOp(HloOpcode opcode, OpBuilder func_builder, Location loc, case HloOpcode::kExp: func_builder.create(loc, rets, args, attrs); break; + case HloOpcode::kImag: + func_builder.create(loc, rets, args, attrs); + break; case HloOpcode::kLog: func_builder.create(loc, rets, args, attrs); break; @@ -104,6 +110,9 @@ Status InsertMlirOp(HloOpcode opcode, OpBuilder func_builder, Location loc, case HloOpcode::kNegate: func_builder.create(loc, rets, args, attrs); break; + case HloOpcode::kReal: + func_builder.create(loc, rets, args, attrs); + break; case HloOpcode::kRemainder: func_builder.create(loc, rets, args, attrs); break; diff --git a/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler.cc b/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler.cc index dc33be5341c..458522f89e6 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler.cc +++ b/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler.cc @@ -17,69 +17,18 @@ limitations under the License. 
#include -#include "absl/container/flat_hash_map.h" -#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h" // from @llvm-project -#include "mlir/Dialect/GPU/GPUDialect.h" // from @llvm-project +#include "llvm/IR/Module.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" // from @llvm-project -#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project -#include "mlir/IR/Attributes.h" // from @llvm-project -#include "mlir/IR/Function.h" // from @llvm-project -#include "mlir/IR/Location.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project -#include "mlir/IR/Module.h" // from @llvm-project -#include "mlir/IR/OperationSupport.h" // from @llvm-project -#include "mlir/IR/StandardTypes.h" // from @llvm-project -#include "mlir/IR/Value.h" // from @llvm-project -#include "mlir/Support/LLVM.h" // from @llvm-project -#include "mlir/Target/NVVMIR.h" // from @llvm-project -#include "tensorflow/compiler/xla/service/buffer_assignment.h" -#include "tensorflow/compiler/xla/service/dump.h" -#include "tensorflow/compiler/xla/service/gpu/gpu_constants.h" -#include "tensorflow/compiler/xla/service/gpu/gpu_executable.h" -#include "tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.h" -#include "tensorflow/compiler/xla/service/gpu/gpu_types.h" -#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" -#include "tensorflow/compiler/xla/service/gpu/kernel_thunk.h" -#include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h" -#include "tensorflow/compiler/xla/service/gpu/nvptx_compiler.h" -#include "tensorflow/compiler/xla/service/gpu/partition_assignment.h" -#include "tensorflow/compiler/xla/service/gpu/stream_assignment.h" -#include "tensorflow/compiler/xla/service/gpu/stream_executor_util.h" #include "tensorflow/compiler/xla/service/gpu/target_constants.h" -#include "tensorflow/compiler/xla/service/gpu/thunk_schedule.h" -#include "tensorflow/compiler/xla/service/hlo_instruction.h" -#include "tensorflow/compiler/xla/service/hlo_opcode.h" -#include "tensorflow/compiler/xla/service/mlir_gpu/emission_context.h" -#include "tensorflow/compiler/xla/service/mlir_gpu/failover_compiler.h" -#include "tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.h" -#include "tensorflow/compiler/xla/service/mlir_gpu/lhlo_dialect_emitter.h" -#include "tensorflow/compiler/xla/util.h" -#include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/lib/io/path.h" -#include "tensorflow/core/platform/cuda_libdevice_path.h" -#include "tensorflow/stream_executor/gpu/asm_compiler.h" +#include "tensorflow/core/platform/stream_executor_no_cuda.h" namespace xla { namespace mlir_gpu { namespace { -using ::mlir::BlockArgument; -using ::mlir::dyn_cast; -using ::mlir::FuncOp; using ::mlir::MLIRContext; -using ::mlir::ModuleOp; -using ::mlir::OwningModuleRef; -using ::mlir::UnknownLoc; -using ::mlir::Value; -using ::mlir::gpu::LaunchFuncOp; using ::mlir::LLVM::LLVMDialect; -using ::mlir::LLVM::LLVMFuncOp; -using ::mlir::LLVM::LLVMType; -using ::xla::gpu::GpuExecutable; -using ::xla::gpu::GpuHloSchedule; -using ::xla::gpu::GpuVersion; -using ::xla::gpu::StreamAssignment; -using ::xla::gpu::ThunkSchedule; int64 ConfigureLLVMModuleAndGetPointerSize(MLIRContext* context) { LLVMDialect* dialect = context->getRegisteredDialect(); @@ -89,49 +38,6 @@ int64 ConfigureLLVMModuleAndGetPointerSize(MLIRContext* context) { return module.getDataLayout().getPointerSize(); } -// TODO(b/137624192) Share with NVPTX compiler -static std::vector CandidateCudaRoots( - const 
HloModuleConfig& config) { - return tensorflow::CandidateCudaRoots( - config.debug_options().xla_gpu_cuda_data_dir()); -} - -void PrintCantFindCudaMessage(absl::string_view msg, - const HloModuleConfig& hlo_module_config) { - LOG(WARNING) << msg; - LOG(WARNING) << "Searched for CUDA in the following directories:"; - - for (const auto& dir : CandidateCudaRoots(hlo_module_config)) { - LOG(WARNING) << " " << dir; - } - LOG(WARNING) - << "You can choose the search directory by setting xla_gpu_cuda_data_dir " - "in HloModule's DebugOptions. For most apps, setting the environment " - "variable XLA_FLAGS=--xla_gpu_cuda_data_dir=/path/to/cuda will work."; -} - -// Returns the directory containing nvvm libdevice files. -string GetLibdeviceDir(const HloModuleConfig& hlo_module_config) { - for (const string& cuda_root : CandidateCudaRoots(hlo_module_config)) { - const string libdevice_dir = - tensorflow::io::JoinPath(cuda_root, "nvvm", "libdevice"); - VLOG(2) << "Looking for libdevice at " << libdevice_dir; - if (tensorflow::Env::Default()->IsDirectory(libdevice_dir).ok()) { - VLOG(2) << "Found libdevice dir " << libdevice_dir; - return libdevice_dir; - } - } - PrintCantFindCudaMessage( - "Can't find libdevice directory ${CUDA_DIR}/nvvm/libdevice. This may " - "result in compilation or runtime failures, if the program we try to run " - "uses routines from libdevice.", - hlo_module_config); - - // GetCudaRootCandidates always includes ".", but if everything fails, we - // return it anyway. Better than returning the empty string. - return "."; -} - } // namespace MlirCompiler::MlirCompiler() @@ -141,428 +47,6 @@ se::Platform::Id MlirCompiler::PlatformId() const { return stream_executor::cuda::kCudaPlatformId; } -StatusOr> MlirCompiler::RunHloPasses( - std::unique_ptr module, se::StreamExecutor* stream_exec, - se::DeviceMemoryAllocator* device_allocator) { - // Until we find a reason to do something different, run the same passes - // that the normal GPU backend runs. - gpu::NVPTXCompiler xla_compiler; - TF_RETURN_IF_ERROR(xla_compiler.OptimizeHloModule(module.get(), stream_exec, - device_allocator)); - TF_RETURN_IF_ERROR(xla_compiler.PrepareHloModuleForIrEmitting(module.get())); - - return std::move(module); -} - -namespace { - -// TODO(b/137624192): Move this to custom call handling and share. -absl::optional CanShareBufferHint(const HloInstruction* user, - const HloInstruction* operand, - const ShapeIndex& user_index) { - if (user->opcode() == HloOpcode::kCustomCall) { - // Share the bias buffer with the parent instruction. - if (user->custom_call_target() == xla::gpu::kGemmCallTarget) { - if (user->operand_count() == 3 && user->operand(2) == operand) { - return true; - } - } - // The operand of cholesky can be shared with the first output. - if (user->custom_call_target() == xla::gpu::kCusolverCholeskyCallTarget) { - return user_index.size() == 1 && user_index[0] == 0; - } - } - return absl::nullopt; -} - -// TODO(b/137624192): Share this with nvptx backend. -GpuVersion GetGpuVersion(se::StreamExecutor* stream_exec) { - int cc_major, cc_minor; - const auto& device_description = stream_exec->GetDeviceDescription(); - if (!device_description.cuda_compute_capability(&cc_major, &cc_minor)) { - LOG(WARNING) - << "Couldn't get compute capability for device; assuming sm_20."; - cc_major = 2; - cc_minor = 0; - } - return std::make_pair(cc_major, cc_minor); -} - -// Return the constant launch bound along the "x" dimension in "dim" if all the -// other dimensions are 1. 
Return nullopt otherwise or when any of the bounds -// is not constant. -static absl::optional getLaunchBound(const mlir::gpu::KernelDim3& dim) { - auto get_constant = [](mlir::Operation* op, - mlir::StringRef name) -> absl::optional { - if (auto constant = llvm::dyn_cast_or_null(op)) { - return constant.value().cast().getInt(); - } - op->emitError() << "bound " << name << " is not constant"; - return absl::nullopt; - }; - auto y_op = dim.y.getDefiningOp(); - auto dim_y = get_constant(y_op, "y"); - if (!dim_y.has_value() || dim_y.value() != 1) { - y_op->emitError() << "bound 'y' is not constant 1"; - return absl::nullopt; - } - auto z_op = dim.z.getDefiningOp(); - auto dim_z = get_constant(z_op, "z"); - if (!dim_z.has_value() || dim_z.value() != 1) { - z_op->emitError() << "bound 'z' is not constant 1"; - return absl::nullopt; - } - return get_constant(dim.x.getDefiningOp(), "x"); -} - -namespace { - -// Indexes of a range of arguments in a GPU function. This is used to keep the -// range of arguments that correspond to a lowered kernel argument of -// (previously) memref type. -struct LaunchFuncArgument { - int kernel_argument_begin; - int kernel_argument_size; -}; - -} // end namespace - -using OperandToValueMap = - absl::flat_hash_map>; - -static StatusOr> ComputeOperandToValueMap( - OperandToValueMap* operand_to_value_map, const HloInstruction* instr, - LaunchFuncOp launchOp, LLVMFuncOp kernel) { - auto operands = instr->operands(); - std::vector ordered_operands; - bool has_failed = false; - // A memref will expand into multiple kernel operands, accumulate their number - // in order to find them later. - int cur_operand_position = 0; - - for (int kernel_index = 0; kernel_index < launchOp.getNumKernelOperands(); - ++kernel_index) { - auto launchop_operand = - launchOp.getKernelOperand(kernel_index).dyn_cast(); - if (!launchop_operand) { - launchOp.emitError("argument to kernel is not a function input"); - has_failed = true; - continue; - } - auto memref_type = - launchop_operand.getType().dyn_cast<::mlir::MemRefType>(); - if (!memref_type) { - launchOp.emitError("only memref-typed arguments are supported"); - has_failed = true; - break; - } - // host_index is the argument position to the surrounding function that - // contains the launch. This index corresponds to HLO operand indices - // by construction. - auto host_index = launchop_operand.getArgNumber(); - // The trailing argument to the outer function are the results. - auto operand = - (host_index < operands.size()) ? operands[host_index] : instr; - if (!operand_to_value_map->count(operand)) { - ordered_operands.push_back(operand); - } - // Associate the HLO operand with the argument values of the kernel - // function. 
- int num_unpacked = - mlir::MemRefDescriptor::getNumUnpackedValues(memref_type); - (*operand_to_value_map)[operand].push_back( - {cur_operand_position, num_unpacked}); - cur_operand_position += num_unpacked; - } - if (has_failed) { - return InternalError("Mapping operands to kernel arguments has failed."); - } - return ordered_operands; -} - -Status InsertBufferLoadPreduleIntoKernel( - LLVMFuncOp kernel, const OperandToValueMap& operand_to_value_map, - const std::vector& ordered_operands, - BufferAssignment* assignment, - const std::vector& buffers) { - mlir::OpBuilder builder(kernel.getBody()); - auto llvm_dialect = kernel.getContext()->getRegisteredDialect(); - auto offset_type = LLVMType::getInt64Ty(llvm_dialect); - auto ptr_type = LLVMType::getInt8PtrTy(llvm_dialect); - auto void_type = LLVMType::getVoidTy(llvm_dialect); - auto loc = kernel.getLoc(); - - auto num_original_args = kernel.getNumArguments(); - std::vector new_arg_types(buffers.size(), ptr_type); - kernel.setAttr(kernel.getTypeAttrName(), - mlir::TypeAttr::get(LLVMType::getFunctionTy( - void_type, new_arg_types, /*isVarArg=*/false))); - std::vector original_args(kernel.args_begin(), kernel.args_end()); - - std::vector as_mlir_types(new_arg_types.begin(), - new_arg_types.end()); - auto new_args = kernel.front().addArguments(as_mlir_types); - std::vector buffer_args(new_args.begin(), new_args.end()); - - for (auto operand : ordered_operands) { - TF_ASSIGN_OR_RETURN(auto slice, - assignment->GetUniqueTopLevelSlice(operand)); - auto buffer = std::find(buffers.begin(), buffers.end(), slice.allocation()); - auto index = buffer - buffers.begin(); - auto offset = builder.create( - loc, offset_type, builder.getI64IntegerAttr(slice.offset())); - auto ptr = buffer_args[index]; - - // Replace uses of function arguments pertaining to memref descriptors with - // values derived from HLO buffers. The instructions inserting these values - // into memref descriptors were already introduced during the lowering phase - // as per MLIR calling convention. - for (auto arg : operand_to_value_map.at(operand)) { - mlir::MemRefDescriptorView original( - mlir::ValueRange(original_args) - .slice(arg.kernel_argument_begin, arg.kernel_argument_size)); - - // Allocated and aligned pointers are the same. - auto casted = builder.create( - loc, original.alignedPtr().getType().cast(), - mlir::ValueRange(ptr)); - original.alignedPtr().replaceAllUsesWith(casted); - original.allocatedPtr().replaceAllUsesWith(casted); - - // Use the offset of the HLO buffer instead of the one expected in the - // function call. - original.offset().replaceAllUsesWith(offset); - - // Fill the shape. - auto shape = operand->shape(); - // Unless the operand is a scalar pointer, also fill shape and strides. - if (shape.dimensions().empty()) { - continue; - } - - // TODO(b/137624192) Pass in the descriptor to allow for dynamic shapes. - assert(shape.IsArray() && shape.is_static()); - for (auto extent : llvm::enumerate(shape.dimensions())) { - auto shape = builder.create( - loc, original.size(extent.index()).getType(), - builder.getI64IntegerAttr(extent.value())); - original.size(extent.index()).replaceAllUsesWith(shape); - } - // Finally, fill the strides. - // TODO(b/137624192): Take assigned layout into account. 
- uint64_t accumulator = 0; - for (int64_t idx = shape.rank() - 1; idx >= 0; --idx) { - if (accumulator == 0) { - accumulator = 1; - } else { - accumulator *= shape.dimensions(idx + 1); - } - auto stride = builder.create( - loc, original.stride(idx).getType(), - builder.getI64IntegerAttr(accumulator)); - original.stride(idx).replaceAllUsesWith(stride); - } - } - } - - // Now we can remove the original arguments, as they should have no more - // users. - for (int i = 0; i < num_original_args; ++i) { - kernel.front().eraseArgument(0); - } - - return Status::OK(); -} - -StatusOr> TransformKernelToXlaThunk( - FuncOp func, const HloInstruction* const instr, ModuleOp kernel_module, - BufferAssignment* assignment) { - // Find the single LaunchFuncOp and compute a mapping from operands of - // the hlo instruction to the corresponding values of the kernel - // function in the target module; - LaunchFuncOp launchOp; - auto walkResult = func.walk([&launchOp](LaunchFuncOp op) { - if (launchOp) { - op.emitError("multiple kernels for single top-level HLO"); - return mlir::WalkResult::interrupt(); - } - launchOp = op; - return mlir::WalkResult::advance(); - }); - if (walkResult.wasInterrupted()) { - return InternalError("Multiple kernels for single top-level HLO"); - } - if (!launchOp) { - // If there was no launchOp, then no kernel was generated, so the lowering - // from the LHLO ops to the GPU dialect is not implemented yet. - return Unimplemented("No kernel was generated."); - } - - auto kernel = kernel_module.lookupSymbol(launchOp.kernel()); - - // Store the assignment of operands to block arguments. Note that an operand - // might be used in multiple argument positions, hence the vector. - OperandToValueMap operand_to_value_map; - TF_ASSIGN_OR_RETURN( - auto ordered_operands, - ComputeOperandToValueMap(&operand_to_value_map, instr, launchOp, kernel)); - - // Get the required buffers to support the inputs. Use a set and vector here - // to keep the order fixed. This is mostly useful for testing. - std::unordered_set buffers_needed; - std::vector buffers; - // TODO(b/137624192) Add support for tuples. - for (auto operand : ordered_operands) { - TF_ASSIGN_OR_RETURN(auto buffer, - assignment->GetUniqueTopLevelSlice(operand)); - if (buffers_needed.insert(buffer.allocation()).second) { - buffers.push_back(buffer.allocation()); - } - } - - // TODO(b/137624192) Add support for temp buffer. - // TODO(b/137624192) Add support for constant buffers. - - // Change the signature to match what the XLA runtime expects from the - // kernel. - TF_RETURN_IF_ERROR(InsertBufferLoadPreduleIntoKernel( - kernel, operand_to_value_map, ordered_operands, assignment, buffers)); - - // Finally, create the thunk and set the launch dimensions. - auto thunk = absl::make_unique( - buffers, kernel.getName().str(), instr, - /*unroll_factor=*/1); - - // Set launch bounds. 
- mlir::gpu::KernelDim3 block = launchOp.getBlockSizeOperandValues(); - mlir::gpu::KernelDim3 grid = launchOp.getGridSizeOperandValues(); - absl::optional num_threads = getLaunchBound(block); - absl::optional num_blocks = getLaunchBound(grid); - if (!num_threads || !num_blocks) { - return Unimplemented("Unsupported launch bounds"); - } - thunk->SetLaunchDimensions(gpu::LaunchDimensions(*num_blocks, *num_threads)); - return std::move(thunk); -} - -} // namespace - -StatusOr> MlirCompiler::RunBackend( - std::unique_ptr module, se::StreamExecutor* stream_exec, - se::DeviceMemoryAllocator* device_allocator) { - // Determine the HLO schedule, which is an ordering of HLO instructions. This - // is used by buffer assignment to enable buffer reuse, and the same ordering - // must also be used to determine the thunk launch schedule. - std::unique_ptr stream_assignment = - xla::gpu::AssignStreams(*module); - TF_ASSIGN_OR_RETURN( - std::unique_ptr hlo_schedule, - GpuHloSchedule::Build(*module, *stream_assignment, pointer_size_)); - - // Run buffer analysis on the HLO graph. This analysis figures out which - // temporary buffers are required to run the computation. - TF_ASSIGN_OR_RETURN(std::unique_ptr buffer_assignment, - BufferAssigner::Run( - module.get(), hlo_schedule->ConsumeHloOrdering(), - BufferSizeBytesFunction(), - /*color_alignment=*/ - [](LogicalBuffer::Color) { - return xla::gpu::kXlaAllocatedBufferAlignBytes; - }, - /*allocate_buffers_for_constants=*/true, - /*colorer=*/BufferAssigner::DefaultColorer(), - /*must_not_live_out=*/{}, &CanShareBufferHint)); - DumpHloModuleIfEnabled(*module, *buffer_assignment, "after_optimizations"); - - EmissionContext emission_context(std::move(module)); - if (error_handler_) { - emission_context.setErrorHandler(error_handler_); - } - - OwningModuleRef mlir_module = - ModuleOp::create(UnknownLoc::get(emission_context.getContext())); - LhloDialectEmitter lhlo_emitter(&emission_context, *buffer_assignment, - stream_exec->platform(), *mlir_module); - - TF_RETURN_IF_ERROR(lhlo_emitter.EmitComputation( - *emission_context.getHloModule()->entry_computation())); - - TF_RETURN_IF_ERROR( - module_hook_.invoke(IRHook::LoweringStage::LHLO, *mlir_module)); - - TF_RETURN_IF_ERROR(LowerLHLOToGPU(*mlir_module)); - - TF_RETURN_IF_ERROR( - module_hook_.invoke(IRHook::LoweringStage::GPU, *mlir_module)); - - TF_RETURN_IF_ERROR(LowerKernelBodiesToNVVM(*mlir_module)); - - TF_RETURN_IF_ERROR( - module_hook_.invoke(IRHook::LoweringStage::LLVM, *mlir_module)); - - TF_ASSIGN_OR_RETURN(OwningModuleRef kernel_module, - ExtractKernelModule(*mlir_module)); - - auto thunk_sequence = lhlo_emitter.ConsumeThunkSequence(); - for (auto entry : lhlo_emitter.InstructionToFunctionMap()) { - TF_ASSIGN_OR_RETURN( - auto thunk, - TransformKernelToXlaThunk(entry.second, entry.first, *kernel_module, - buffer_assignment.get())); - thunk_sequence->push_back(std::move(thunk)); - } - - TF_RETURN_IF_ERROR( - module_hook_.invoke(IRHook::LoweringStage::KERNEL, *kernel_module)); - - auto llvmModule = mlir::translateModuleToNVVMIR(*kernel_module); - - if (!llvmModule) { - return InternalError("Translation to LLVM failed"); - } - - llvmModule->setModuleIdentifier(emission_context.getHloModule()->name()); - // TODO(herhut): Why is this needed and does not come from the template? 
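// The module_hook_.invoke calls above are the observation points that tests
// attach to. Sketch of installing such a hook, modeled on the usage in
// mlir_irgen_test_base.cc further below; `compiler` is assumed to be a
// MlirCompiler* obtained elsewhere, and llvm/Support/raw_ostream.h is assumed
// to be included.
std::string captured_ir;
MlirCompiler::IRHook dump_lhlo_hook{
    [&captured_ir](mlir::ModuleOp module) -> Status {
      // Print the LHLO-stage module to a string; the GPU, LLVM and KERNEL
      // stages work the same way.
      llvm::raw_string_ostream stream(captured_ir);
      module.print(stream);
      stream.flush();
      return Status::OK();
    },
    MlirCompiler::IRHook::LoweringStage::LHLO};
compiler->SetModuleHook(dump_lhlo_hook);
// ... run compilation; captured_ir now holds the LHLO-stage IR ...
compiler->RemoveModuleHook();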
- llvmModule->setDataLayout(gpu::nvptx::kDataLayout); - - const auto& config = emission_context.getHloModule()->config(); - TF_ASSIGN_OR_RETURN( - auto ptx, xla::gpu::nvptx::CompileToPtx(llvmModule.get(), - GetGpuVersion(stream_exec), - config, GetLibdeviceDir(config))); - TF_ASSIGN_OR_RETURN( - auto cubin, se::CompileGpuAsm(stream_exec->device_ordinal(), ptx.c_str(), - gpu::PtxOptsFromConfig(config))); - - auto thunk_schedule = absl::make_unique( - std::move(thunk_sequence), std::move(stream_assignment), - hlo_schedule->ThunkLaunchOrder()); - - if (DumpingEnabledForHloModule(*emission_context.getHloModule())) { - DumpToFileInDirOrStdout(*emission_context.getHloModule(), "", - "thunk_schedule", thunk_schedule->ToString()); - } - - // TODO(b/137624192): Add profiling support. - return {absl::make_unique( - ptx, cubin, GetGpuVersion(stream_exec), std::move(thunk_schedule), - emission_context.releaseHloModule(), std::move(buffer_assignment), - nullptr, nullptr)}; -} - -StatusOr>> MlirCompiler::Compile( - std::unique_ptr module_group, - std::vector> stream_execs, - se::DeviceMemoryAllocator* device_allocator) { - return Unimplemented("Not yet implemented in MLIR compiler"); -} - -StatusOr>> -MlirCompiler::CompileAheadOfTime(std::unique_ptr module_group, - const AotCompilationOptions& options) { - return Unimplemented("Not yet implemented in MLIR compiler"); -} - void MlirCompiler::SetModuleHook(IRHook module_hook) { module_hook_ = module_hook; } @@ -579,14 +63,3 @@ void MlirCompiler::RemoveErrorHandler() { error_handler_ = nullptr; } } // namespace mlir_gpu } // namespace xla - -static bool InitModule() { - xla::Compiler::RegisterCompilerFactory( - stream_executor::cuda::kCudaPlatformId, []() { - return absl::make_unique( - absl::make_unique(), - absl::make_unique()); - }); - return true; -} -static bool module_initialized = InitModule(); diff --git a/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler.h b/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler.h index 9aeef12ac28..a7b2f9446fa 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler.h +++ b/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler.h @@ -16,7 +16,6 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_MLIR_GPU_MLIR_COMPILER_H_ #define TENSORFLOW_COMPILER_XLA_SERVICE_MLIR_GPU_MLIR_COMPILER_H_ -#include "absl/container/flat_hash_map.h" #include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/IR/Module.h" // from @llvm-project #include "tensorflow/compiler/xla/service/compiler.h" @@ -27,7 +26,8 @@ namespace mlir_gpu { // A Compiler implementation that converts XLAs IR to a matching MLIR dialect, // performs all lowering on the MLIR IR and finally converts MLIR to LLVMIR for -// generation of a think suitable for XLAs runtime. +// generation of a thunk suitable for XLAs runtime. MlirCompilerImpl contains +// the implementation. 
class MlirCompiler : public Compiler { using ErrorHandler = std::function; @@ -37,30 +37,6 @@ class MlirCompiler : public Compiler { se::Platform::Id PlatformId() const override; - StatusOr> RunHloPasses( - std::unique_ptr module, se::StreamExecutor* stream_exec, - se::DeviceMemoryAllocator* device_allocator) override; - - StatusOr> RunBackend( - std::unique_ptr module, se::StreamExecutor* stream_exec, - se::DeviceMemoryAllocator* device_allocator) override; - - StatusOr>> Compile( - std::unique_ptr module_group, - std::vector> stream_execs, - se::DeviceMemoryAllocator* device_allocator) override; - - StatusOr>> - CompileAheadOfTime(std::unique_ptr module_group, - const AotCompilationOptions& options) override; - - HloCostAnalysis::ShapeSizeFunction ShapeSizeBytesFunction() const override { - int64 pointer_size = pointer_size_; - return [pointer_size](const Shape& shape) { - return ShapeUtil::ByteSizeOf(shape, pointer_size); - }; - } - struct IRHook { enum class LoweringStage { LHLO, GPU, LLVM, KERNEL }; @@ -80,7 +56,7 @@ class MlirCompiler : public Compiler { void SetErrorHandler(ErrorHandler error_handler); void RemoveErrorHandler(); - private: + protected: ::mlir::MLIRContext context_; int64 pointer_size_; IRHook module_hook_; diff --git a/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler_impl.cc b/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler_impl.cc new file mode 100644 index 00000000000..35ac3b2bf63 --- /dev/null +++ b/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler_impl.cc @@ -0,0 +1,585 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h" // from @llvm-project +#include "mlir/Dialect/GPU/GPUDialect.h" // from @llvm-project +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" // from @llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Function.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/Module.h" // from @llvm-project +#include "mlir/IR/OperationSupport.h" // from @llvm-project +#include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Target/NVVMIR.h" // from @llvm-project +#include "tensorflow/compiler/xla/service/buffer_assignment.h" +#include "tensorflow/compiler/xla/service/dump.h" +#include "tensorflow/compiler/xla/service/gpu/gpu_constants.h" +#include "tensorflow/compiler/xla/service/gpu/gpu_executable.h" +#include "tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.h" +#include "tensorflow/compiler/xla/service/gpu/gpu_types.h" +#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" +#include "tensorflow/compiler/xla/service/gpu/kernel_thunk.h" +#include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h" +#include "tensorflow/compiler/xla/service/gpu/nvptx_compiler.h" +#include "tensorflow/compiler/xla/service/gpu/partition_assignment.h" +#include "tensorflow/compiler/xla/service/gpu/stream_assignment.h" +#include "tensorflow/compiler/xla/service/gpu/stream_executor_util.h" +#include "tensorflow/compiler/xla/service/gpu/target_constants.h" +#include "tensorflow/compiler/xla/service/gpu/thunk_schedule.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/service/mlir_gpu/emission_context.h" +#include "tensorflow/compiler/xla/service/mlir_gpu/failover_compiler.h" +#include "tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.h" +#include "tensorflow/compiler/xla/service/mlir_gpu/lhlo_dialect_emitter.h" +#include "tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler.h" +#include "tensorflow/compiler/xla/util.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/io/path.h" +#include "tensorflow/core/platform/cuda_libdevice_path.h" +#include "tensorflow/stream_executor/gpu/asm_compiler.h" + +namespace xla { +namespace mlir_gpu { +namespace { + +using ::mlir::BlockArgument; +using ::mlir::dyn_cast; +using ::mlir::FuncOp; +using ::mlir::ModuleOp; +using ::mlir::OwningModuleRef; +using ::mlir::UnknownLoc; +using ::mlir::Value; +using ::mlir::gpu::LaunchFuncOp; +using ::mlir::LLVM::LLVMDialect; +using ::mlir::LLVM::LLVMFuncOp; +using ::mlir::LLVM::LLVMType; +using ::xla::gpu::GpuExecutable; +using ::xla::gpu::GpuHloSchedule; +using ::xla::gpu::GpuVersion; +using ::xla::gpu::StreamAssignment; +using ::xla::gpu::ThunkSchedule; + +// A Compiler implementation that converts XLAs IR to a matching MLIR dialect, +// performs all lowering on the MLIR IR and finally converts MLIR to LLVMIR for +// generation of a thunk suitable for XLAs runtime. 
+class MlirCompilerImpl : public MlirCompiler { + public: + StatusOr> RunHloPasses( + std::unique_ptr module, se::StreamExecutor* stream_exec, + se::DeviceMemoryAllocator* device_allocator) override; + + StatusOr> RunBackend( + std::unique_ptr module, se::StreamExecutor* stream_exec, + se::DeviceMemoryAllocator* device_allocator) override; + + StatusOr>> Compile( + std::unique_ptr module_group, + std::vector> stream_execs, + se::DeviceMemoryAllocator* device_allocator) override; + + StatusOr>> + CompileAheadOfTime(std::unique_ptr module_group, + const AotCompilationOptions& options) override; + + HloCostAnalysis::ShapeSizeFunction ShapeSizeBytesFunction() const override { + int64 pointer_size = pointer_size_; + return [pointer_size](const Shape& shape) { + return ShapeUtil::ByteSizeOf(shape, pointer_size); + }; + } +}; + +// TODO(b/137624192) Share with NVPTX compiler +static std::vector CandidateCudaRoots( + const HloModuleConfig& config) { + return tensorflow::CandidateCudaRoots( + config.debug_options().xla_gpu_cuda_data_dir()); +} + +void PrintCantFindCudaMessage(absl::string_view msg, + const HloModuleConfig& hlo_module_config) { + LOG(WARNING) << msg; + LOG(WARNING) << "Searched for CUDA in the following directories:"; + + for (const auto& dir : CandidateCudaRoots(hlo_module_config)) { + LOG(WARNING) << " " << dir; + } + LOG(WARNING) + << "You can choose the search directory by setting xla_gpu_cuda_data_dir " + "in HloModule's DebugOptions. For most apps, setting the environment " + "variable XLA_FLAGS=--xla_gpu_cuda_data_dir=/path/to/cuda will work."; +} + +// Returns the directory containing nvvm libdevice files. +std::string GetLibdeviceDir(const HloModuleConfig& hlo_module_config) { + for (const string& cuda_root : CandidateCudaRoots(hlo_module_config)) { + const std::string libdevice_dir = + tensorflow::io::JoinPath(cuda_root, "nvvm", "libdevice"); + VLOG(2) << "Looking for libdevice at " << libdevice_dir; + if (tensorflow::Env::Default()->IsDirectory(libdevice_dir).ok()) { + VLOG(2) << "Found libdevice dir " << libdevice_dir; + return libdevice_dir; + } + } + PrintCantFindCudaMessage( + "Can't find libdevice directory ${CUDA_DIR}/nvvm/libdevice. This may " + "result in compilation or runtime failures, if the program we try to run " + "uses routines from libdevice.", + hlo_module_config); + + // GetCudaRootCandidates always includes ".", but if everything fails, we + // return it anyway. Better than returning the empty string. + return "."; +} + +StatusOr> MlirCompilerImpl::RunHloPasses( + std::unique_ptr module, se::StreamExecutor* stream_exec, + se::DeviceMemoryAllocator* device_allocator) { + // Until we find a reason to do something different, run the same passes + // that the normal GPU backend runs. + gpu::NVPTXCompiler xla_compiler; + TF_RETURN_IF_ERROR(xla_compiler.OptimizeHloModule(module.get(), stream_exec, + device_allocator)); + TF_RETURN_IF_ERROR(xla_compiler.PrepareHloModuleForIrEmitting(module.get())); + + return std::move(module); +} + +// TODO(b/137624192): Move this to custom call handling and share. +absl::optional CanShareBufferHint(const HloInstruction* user, + const HloInstruction* operand, + const ShapeIndex& user_index) { + if (user->opcode() == HloOpcode::kCustomCall) { + // Share the bias buffer with the parent instruction. + if (user->custom_call_target() == xla::gpu::kGemmCallTarget) { + if (user->operand_count() == 3 && user->operand(2) == operand) { + return true; + } + } + // The operand of cholesky can be shared with the first output. 
+ if (user->custom_call_target() == xla::gpu::kCusolverCholeskyCallTarget) { + return user_index.size() == 1 && user_index[0] == 0; + } + } + return absl::nullopt; +} + +// TODO(b/137624192): Share this with nvptx backend. +GpuVersion GetGpuVersion(se::StreamExecutor* stream_exec) { + int cc_major, cc_minor; + const auto& device_description = stream_exec->GetDeviceDescription(); + if (!device_description.cuda_compute_capability(&cc_major, &cc_minor)) { + LOG(WARNING) + << "Couldn't get compute capability for device; assuming sm_20."; + cc_major = 2; + cc_minor = 0; + } + return std::make_pair(cc_major, cc_minor); +} + +// Return the constant launch bound along the "x" dimension in "dim" if all the +// other dimensions are 1. Return nullopt otherwise or when any of the bounds +// is not constant. +static absl::optional getLaunchBound(const mlir::gpu::KernelDim3& dim) { + auto get_constant = [](mlir::Operation* op, + mlir::StringRef name) -> absl::optional { + if (auto constant = llvm::dyn_cast_or_null(op)) { + return constant.value().cast().getInt(); + } + op->emitError() << "bound " << name << " is not constant"; + return absl::nullopt; + }; + auto y_op = dim.y.getDefiningOp(); + auto dim_y = get_constant(y_op, "y"); + if (!dim_y.has_value() || dim_y.value() != 1) { + y_op->emitError() << "bound 'y' is not constant 1"; + return absl::nullopt; + } + auto z_op = dim.z.getDefiningOp(); + auto dim_z = get_constant(z_op, "z"); + if (!dim_z.has_value() || dim_z.value() != 1) { + z_op->emitError() << "bound 'z' is not constant 1"; + return absl::nullopt; + } + return get_constant(dim.x.getDefiningOp(), "x"); +} + +// Indexes of a range of arguments in a GPU function. This is used to keep the +// range of arguments that correspond to a lowered kernel argument of +// (previously) memref type. +struct LaunchFuncArgument { + int kernel_argument_begin; + int kernel_argument_size; +}; + +using OperandToValueMap = + absl::flat_hash_map>; + +static StatusOr> ComputeOperandToValueMap( + OperandToValueMap* operand_to_value_map, const HloInstruction* instr, + LaunchFuncOp launchOp, LLVMFuncOp kernel) { + auto operands = instr->operands(); + std::vector ordered_operands; + bool has_failed = false; + // A memref will expand into multiple kernel operands, accumulate their number + // in order to find them later. + int cur_operand_position = 0; + + for (int kernel_index = 0; kernel_index < launchOp.getNumKernelOperands(); + ++kernel_index) { + auto launchop_operand = + launchOp.getKernelOperand(kernel_index).dyn_cast(); + if (!launchop_operand) { + launchOp.emitError("argument to kernel is not a function input"); + has_failed = true; + continue; + } + auto memref_type = + launchop_operand.getType().dyn_cast<::mlir::MemRefType>(); + if (!memref_type) { + launchOp.emitError("only memref-typed arguments are supported"); + has_failed = true; + break; + } + // host_index is the argument position to the surrounding function that + // contains the launch. This index corresponds to HLO operand indices + // by construction. + auto host_index = launchop_operand.getArgNumber(); + // The trailing argument to the outer function are the results. + auto operand = + (host_index < operands.size()) ? operands[host_index] : instr; + if (!operand_to_value_map->count(operand)) { + ordered_operands.push_back(operand); + } + // Associate the HLO operand with the argument values of the kernel + // function. 
+ int num_unpacked = + mlir::MemRefDescriptor::getNumUnpackedValues(memref_type); + (*operand_to_value_map)[operand].push_back( + {cur_operand_position, num_unpacked}); + cur_operand_position += num_unpacked; + } + if (has_failed) { + return InternalError("Mapping operands to kernel arguments has failed."); + } + return ordered_operands; +} + +Status InsertBufferLoadPreduleIntoKernel( + LLVMFuncOp kernel, const OperandToValueMap& operand_to_value_map, + const std::vector& ordered_operands, + BufferAssignment* assignment, + const std::vector& buffers) { + mlir::OpBuilder builder(kernel.getBody()); + auto llvm_dialect = kernel.getContext()->getRegisteredDialect(); + auto offset_type = LLVMType::getInt64Ty(llvm_dialect); + auto ptr_type = LLVMType::getInt8PtrTy(llvm_dialect); + auto void_type = LLVMType::getVoidTy(llvm_dialect); + auto loc = kernel.getLoc(); + + auto num_original_args = kernel.getNumArguments(); + std::vector new_arg_types(buffers.size(), ptr_type); + kernel.setAttr(kernel.getTypeAttrName(), + mlir::TypeAttr::get(LLVMType::getFunctionTy( + void_type, new_arg_types, /*isVarArg=*/false))); + std::vector original_args(kernel.args_begin(), kernel.args_end()); + + std::vector as_mlir_types(new_arg_types.begin(), + new_arg_types.end()); + auto new_args = kernel.front().addArguments(as_mlir_types); + std::vector buffer_args(new_args.begin(), new_args.end()); + + for (auto operand : ordered_operands) { + TF_ASSIGN_OR_RETURN(auto slice, + assignment->GetUniqueTopLevelSlice(operand)); + auto buffer = std::find(buffers.begin(), buffers.end(), slice.allocation()); + auto index = buffer - buffers.begin(); + auto offset = builder.create( + loc, offset_type, builder.getI64IntegerAttr(slice.offset())); + auto ptr = buffer_args[index]; + + // Replace uses of function arguments pertaining to memref descriptors with + // values derived from HLO buffers. The instructions inserting these values + // into memref descriptors were already introduced during the lowering phase + // as per MLIR calling convention. + for (auto arg : operand_to_value_map.at(operand)) { + mlir::MemRefDescriptorView original( + mlir::ValueRange(original_args) + .slice(arg.kernel_argument_begin, arg.kernel_argument_size)); + + // Allocated and aligned pointers are the same. + auto casted = builder.create( + loc, original.alignedPtr().getType().cast(), + mlir::ValueRange(ptr)); + original.alignedPtr().replaceAllUsesWith(casted); + original.allocatedPtr().replaceAllUsesWith(casted); + + // Use the offset of the HLO buffer instead of the one expected in the + // function call. + original.offset().replaceAllUsesWith(offset); + + // Fill the shape. + auto shape = operand->shape(); + // Unless the operand is a scalar pointer, also fill shape and strides. + if (shape.dimensions().empty()) { + continue; + } + + // TODO(b/137624192) Pass in the descriptor to allow for dynamic shapes. + assert(shape.IsArray() && shape.is_static()); + for (auto extent : llvm::enumerate(shape.dimensions())) { + auto shape = builder.create( + loc, original.size(extent.index()).getType(), + builder.getI64IntegerAttr(extent.value())); + original.size(extent.index()).replaceAllUsesWith(shape); + } + // Finally, fill the strides. + // TODO(b/137624192): Take assigned layout into account. 
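// For intuition on the getNumUnpackedValues bookkeeping above: under the
// standard MLIR memref-to-LLVM lowering, a statically shaped ranked memref
// argument unpacks into 3 + 2 * rank scalar kernel arguments (allocated
// pointer, aligned pointer, offset, then one size and one stride per
// dimension). Worked example, assuming that descriptor layout:
constexpr int kDescriptorHeader = 3;  // allocated ptr, aligned ptr, offset
constexpr int kRank = 2;              // e.g. a memref<2x2xf32> argument
constexpr int kNumUnpacked = kDescriptorHeader + 2 * kRank;
static_assert(kNumUnpacked == 7, "memref<2x2xf32> expands to 7 kernel args");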
+ uint64_t accumulator = 0; + for (int64_t idx = shape.rank() - 1; idx >= 0; --idx) { + if (accumulator == 0) { + accumulator = 1; + } else { + accumulator *= shape.dimensions(idx + 1); + } + auto stride = builder.create( + loc, original.stride(idx).getType(), + builder.getI64IntegerAttr(accumulator)); + original.stride(idx).replaceAllUsesWith(stride); + } + } + } + + // Now we can remove the original arguments, as they should have no more + // users. + for (int i = 0; i < num_original_args; ++i) { + kernel.front().eraseArgument(0); + } + + return Status::OK(); +} + +StatusOr> TransformKernelToXlaThunk( + FuncOp func, const HloInstruction* const instr, ModuleOp kernel_module, + BufferAssignment* assignment) { + // Find the single LaunchFuncOp and compute a mapping from operands of + // the hlo instruction to the corresponding values of the kernel + // function in the target module; + LaunchFuncOp launchOp; + auto walkResult = func.walk([&launchOp](LaunchFuncOp op) { + if (launchOp) { + op.emitError("multiple kernels for single top-level HLO"); + return mlir::WalkResult::interrupt(); + } + launchOp = op; + return mlir::WalkResult::advance(); + }); + if (walkResult.wasInterrupted()) { + return InternalError("Multiple kernels for single top-level HLO"); + } + if (!launchOp) { + // If there was no launchOp, then no kernel was generated, so the lowering + // from the LHLO ops to the GPU dialect is not implemented yet. + return Unimplemented("No kernel was generated."); + } + + auto kernel = + kernel_module.lookupSymbol(launchOp.getKernelName()); + + // Store the assignment of operands to block arguments. Note that an operand + // might be used in multiple argument positions, hence the vector. + OperandToValueMap operand_to_value_map; + TF_ASSIGN_OR_RETURN( + auto ordered_operands, + ComputeOperandToValueMap(&operand_to_value_map, instr, launchOp, kernel)); + + // Get the required buffers to support the inputs. Use a set and vector here + // to keep the order fixed. This is mostly useful for testing. + std::unordered_set buffers_needed; + std::vector buffers; + // TODO(b/137624192) Add support for tuples. + for (auto operand : ordered_operands) { + TF_ASSIGN_OR_RETURN(auto buffer, + assignment->GetUniqueTopLevelSlice(operand)); + if (buffers_needed.insert(buffer.allocation()).second) { + buffers.push_back(buffer.allocation()); + } + } + + // TODO(b/137624192) Add support for temp buffer. + // TODO(b/137624192) Add support for constant buffers. + + // Change the signature to match what the XLA runtime expects from the + // kernel. + TF_RETURN_IF_ERROR(InsertBufferLoadPreduleIntoKernel( + kernel, operand_to_value_map, ordered_operands, assignment, buffers)); + + // Finally, create the thunk and set the launch dimensions. + auto thunk = absl::make_unique( + buffers, kernel.getName().str(), instr, + /*unroll_factor=*/1); + + // Set launch bounds. 
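// The accumulator loop above fills in dense row-major strides derived from
// the static shape (ignoring the assigned layout, per the TODO). Standalone
// worked example for an f32[2,3,4] operand: each stride is the product of all
// more-minor dimension extents, giving {12, 4, 1}.
#include <array>
#include <cstdint>
std::array<uint64_t, 3> RowMajorStrides(const std::array<uint64_t, 3>& dims) {
  std::array<uint64_t, 3> strides;
  uint64_t accumulator = 1;
  for (int idx = 2; idx >= 0; --idx) {
    strides[idx] = accumulator;  // stride of dimension idx
    accumulator *= dims[idx];
  }
  return strides;  // {12, 4, 1} for dims {2, 3, 4}
}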
+ mlir::gpu::KernelDim3 block = launchOp.getBlockSizeOperandValues(); + mlir::gpu::KernelDim3 grid = launchOp.getGridSizeOperandValues(); + absl::optional num_threads = getLaunchBound(block); + absl::optional num_blocks = getLaunchBound(grid); + if (!num_threads || !num_blocks) { + return Unimplemented("Unsupported launch bounds"); + } + thunk->SetLaunchDimensions(gpu::LaunchDimensions(*num_blocks, *num_threads)); + return std::move(thunk); +} + +StatusOr> MlirCompilerImpl::RunBackend( + std::unique_ptr module, se::StreamExecutor* stream_exec, + se::DeviceMemoryAllocator* device_allocator) { + // Determine the HLO schedule, which is an ordering of HLO instructions. This + // is used by buffer assignment to enable buffer reuse, and the same ordering + // must also be used to determine the thunk launch schedule. + std::unique_ptr stream_assignment = + xla::gpu::AssignStreams(*module); + TF_ASSIGN_OR_RETURN( + std::unique_ptr hlo_schedule, + GpuHloSchedule::Build(*module, *stream_assignment, pointer_size_)); + + // Run buffer analysis on the HLO graph. This analysis figures out which + // temporary buffers are required to run the computation. + TF_ASSIGN_OR_RETURN(std::unique_ptr buffer_assignment, + BufferAssigner::Run( + module.get(), hlo_schedule->ConsumeHloOrdering(), + BufferSizeBytesFunction(), + /*color_alignment=*/ + [](LogicalBuffer::Color) { + return xla::gpu::kXlaAllocatedBufferAlignBytes; + }, + /*allocate_buffers_for_constants=*/true, + /*colorer=*/BufferAssigner::DefaultColorer(), + /*must_not_live_out=*/{}, &CanShareBufferHint)); + DumpHloModuleIfEnabled(*module, *buffer_assignment, "after_optimizations"); + + EmissionContext emission_context(std::move(module)); + if (error_handler_) { + emission_context.setErrorHandler(error_handler_); + } + + OwningModuleRef mlir_module = + ModuleOp::create(UnknownLoc::get(emission_context.getContext())); + LhloDialectEmitter lhlo_emitter(&emission_context, *buffer_assignment, + stream_exec->platform(), *mlir_module); + + TF_RETURN_IF_ERROR(lhlo_emitter.EmitComputation( + *emission_context.getHloModule()->entry_computation())); + + TF_RETURN_IF_ERROR( + module_hook_.invoke(IRHook::LoweringStage::LHLO, *mlir_module)); + + TF_RETURN_IF_ERROR(LowerLHLOToGPU(*mlir_module)); + + TF_RETURN_IF_ERROR( + module_hook_.invoke(IRHook::LoweringStage::GPU, *mlir_module)); + + TF_RETURN_IF_ERROR(LowerKernelBodiesToNVVM(*mlir_module)); + + TF_RETURN_IF_ERROR( + module_hook_.invoke(IRHook::LoweringStage::LLVM, *mlir_module)); + + TF_ASSIGN_OR_RETURN(OwningModuleRef kernel_module, + ExtractKernelModule(*mlir_module)); + + auto thunk_sequence = lhlo_emitter.ConsumeThunkSequence(); + for (auto entry : lhlo_emitter.InstructionToFunctionMap()) { + TF_ASSIGN_OR_RETURN( + auto thunk, + TransformKernelToXlaThunk(entry.second, entry.first, *kernel_module, + buffer_assignment.get())); + thunk_sequence->push_back(std::move(thunk)); + } + + TF_RETURN_IF_ERROR( + module_hook_.invoke(IRHook::LoweringStage::KERNEL, *kernel_module)); + + auto llvmModule = mlir::translateModuleToNVVMIR(*kernel_module); + + if (!llvmModule) { + return InternalError("Translation to LLVM failed"); + } + + llvmModule->setModuleIdentifier(emission_context.getHloModule()->name()); + // TODO(herhut): Why is this needed and does not come from the template? 
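// A concrete illustration of the launch-bound restriction enforced by
// getLaunchBound and consumed above: only launches whose grid and block
// 'y'/'z' bounds are the constant 1 are supported, i.e. effectively 1-D
// launches. The helper below is hypothetical and only restates that contract
// with example numbers.
//
// For grid = (1, 1, 1) and block = (4, 1, 1), e.g. a 2x2 element-wise kernel
// collapsed to one dimension, this yields LaunchDimensions(1, 4): one block
// of four threads. A block of (4, 2, 1) is rejected because 'y' is not 1.
Status SetBoundsOrFail(xla::gpu::KernelThunk* thunk,
                       absl::optional<int64> num_blocks,
                       absl::optional<int64> num_threads) {
  if (!num_threads || !num_blocks) {
    return Unimplemented("Unsupported launch bounds");
  }
  thunk->SetLaunchDimensions(gpu::LaunchDimensions(*num_blocks, *num_threads));
  return Status::OK();
}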
+ llvmModule->setDataLayout(gpu::nvptx::kDataLayout); + + const auto& config = emission_context.getHloModule()->config(); + TF_ASSIGN_OR_RETURN( + auto ptx, xla::gpu::nvptx::CompileToPtx(llvmModule.get(), + GetGpuVersion(stream_exec), + config, GetLibdeviceDir(config))); + TF_ASSIGN_OR_RETURN( + auto cubin, se::CompileGpuAsm(stream_exec->device_ordinal(), ptx.c_str(), + gpu::PtxOptsFromConfig(config))); + + auto thunk_schedule = absl::make_unique( + std::move(thunk_sequence), std::move(stream_assignment), + hlo_schedule->ThunkLaunchOrder()); + + if (DumpingEnabledForHloModule(*emission_context.getHloModule())) { + DumpToFileInDirOrStdout(*emission_context.getHloModule(), "", + "thunk_schedule", thunk_schedule->ToString()); + } + + // TODO(b/137624192): Add profiling support. + return {absl::make_unique( + ptx, cubin, GetGpuVersion(stream_exec), std::move(thunk_schedule), + emission_context.releaseHloModule(), std::move(buffer_assignment), + nullptr, nullptr)}; +} + +StatusOr>> MlirCompilerImpl::Compile( + std::unique_ptr module_group, + std::vector> stream_execs, + se::DeviceMemoryAllocator* device_allocator) { + return Unimplemented("Not yet implemented in MLIR compiler"); +} + +StatusOr>> +MlirCompilerImpl::CompileAheadOfTime( + std::unique_ptr /*module_group*/, + const AotCompilationOptions& /*options*/) { + return Unimplemented("Not yet implemented in MLIR compiler"); +} + +} // namespace +} // namespace mlir_gpu +} // namespace xla + +static bool InitModule() { + xla::Compiler::RegisterCompilerFactory( + stream_executor::cuda::kCudaPlatformId, []() { + return absl::make_unique( + absl::make_unique(), + absl::make_unique()); + }); + return true; +} +static bool module_initialized = InitModule(); diff --git a/tensorflow/compiler/xla/service/mlir_gpu/mlir_irgen_test_base.cc b/tensorflow/compiler/xla/service/mlir_gpu/mlir_irgen_test_base.cc deleted file mode 100644 index c8e01b967e7..00000000000 --- a/tensorflow/compiler/xla/service/mlir_gpu/mlir_irgen_test_base.cc +++ /dev/null @@ -1,168 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
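// The InitModule hook above registers the MLIR pipeline as the CUDA-platform
// compiler, wrapped in a FailoverCompiler so that cases the MLIR path cannot
// handle yet fall back to the regular NVPTX compiler. Hedged sketch of how a
// client then obtains it through the existing registry (PlatformWithName and
// GetForPlatform are pre-existing StreamExecutor/XLA APIs, assumed unchanged):
StatusOr<Compiler*> GetRegisteredCudaCompiler() {
  TF_ASSIGN_OR_RETURN(se::Platform* platform,
                      se::MultiPlatformManager::PlatformWithName("CUDA"));
  // Returns the FailoverCompiler instance created by the factory above.
  return Compiler::GetForPlatform(platform);
}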
-==============================================================================*/ - -#include "tensorflow/compiler/xla/service/mlir_gpu/mlir_irgen_test_base.h" - -#include -#include -#include -#include - -#include "absl/memory/memory.h" -#include "llvm/Support/raw_ostream.h" -#include "mlir/IR/Module.h" // from @llvm-project -#include "mlir/Pass/PassManager.h" // from @llvm-project -#include "tensorflow/compiler/xla/service/hlo_module_config.h" -#include "tensorflow/compiler/xla/service/mlir_gpu/failover_compiler.h" -#include "tensorflow/compiler/xla/service/mlir_gpu/inject_errors_pass.h" -#include "tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler.h" -#include "tensorflow/compiler/xla/shape_util.h" -#include "tensorflow/compiler/xla/tests/filecheck.h" -#include "tensorflow/compiler/xla/tests/verified_hlo_module.h" -#include "tensorflow/core/lib/core/status_test_util.h" -#include "tensorflow/core/platform/env.h" -#include "tensorflow/core/platform/path.h" -#include "tensorflow/core/platform/resource_loader.h" -#include "tensorflow/core/platform/test.h" - -namespace xla { -namespace mlir_gpu { - -void MlirIrGenTestBase::CompileIr(std::unique_ptr hlo_module, - const MlirCompiler::IRHook& ir_hook) { - MlirCompiler* compiler = GetMLIRCompiler(); - compiler->SetModuleHook(ir_hook); - Status status = CompileToExecutable(std::move(hlo_module)).status(); - compiler->RemoveModuleHook(); - TF_ASSERT_OK(status); -} - -void MlirIrGenTestBase::PatternMatch(const std::string& str, - const std::string& pattern_file) { - StatusOr filecheck_result = - RunFileCheckWithPatternFile(str, pattern_file); - TF_ASSERT_OK(filecheck_result.status()); - EXPECT_TRUE(filecheck_result.ValueOrDie()); -} - -string MlirIrGenTestBase::CompileIr( - std::unique_ptr hlo_module, - MlirCompiler::IRHook::LoweringStage printing_stage) { - std::string ir; - CompileIr(std::move(hlo_module), - {[&ir](mlir::ModuleOp module) -> Status { - std::string buffer_string; - llvm::raw_string_ostream ostream(buffer_string); - module.print(ostream); - ostream.flush(); - ir = buffer_string; - return Status::OK(); - }, - printing_stage}); - return ir; -} - -void MlirIrGenTestBase::CompileAndVerifyIr( - std::unique_ptr hlo_module, const std::string& pattern_file, - LoweringStage printing_stage) { - std::string ir = CompileIr(std::move(hlo_module), printing_stage); - PatternMatch(ir, pattern_file); -} - -void MlirIrGenTestBase::CompileAndVerifyIr(const std::string& hlo_text_filename, - LoweringStage printing_stage) { - std::string hlo_text_absolute_filename = - tensorflow::GetDataDependencyFilepath(hlo_text_filename); - TF_ASSERT_OK_AND_ASSIGN(auto module, - GetVerifiedHloModule(hlo_text_absolute_filename)); - CompileAndVerifyIr(std::move(module), - /*pattern_file=*/hlo_text_absolute_filename, - printing_stage); -} - -MlirCompiler::IRHook MlirIrGenTestBase::getIRHookBreakingLoweringStage( - LoweringStage breaking_stage) { - return {[](mlir::ModuleOp module) -> Status { - mlir::PassManager pm(module.getContext()); - pm.addPass(::mlir::createInjectErrorsForTestingPass()); - if (failed(pm.run(module))) { - return InternalError("InjectErrorsForTestingPass failed."); - } - return Status::OK(); - }, - breaking_stage}; -} - -StatusOr MlirIrGenTestBase::CompileAndInjectErrors( - std::unique_ptr hlo_module, LoweringStage breaking_stage) { - std::string errors; - auto error_handler = [&errors](const EmissionContext::ErrorMap& error_map, - HloModule* hlo_module) { - errors = "ERRORS FOUND: "; - for (auto& err : error_map) { - errors += "[" + 
err.first->ToString() + ": " + - absl::StrJoin(err.second, "; ") + "]"; - } - }; - - MlirCompiler* compiler = GetMLIRCompiler(); - compiler->SetModuleHook(getIRHookBreakingLoweringStage(breaking_stage)); - compiler->SetErrorHandler(error_handler); - Status status = CompileToExecutable(std::move(hlo_module)).status(); - compiler->RemoveModuleHook(); - compiler->RemoveErrorHandler(); - - if (status.ok()) { - return errors; - } - return status; -} - -void MlirIrGenTestBase::CompileAndVerifyErrors( - const std::string& hlo_text_filename, LoweringStage breaking_stage) { - std::string test_srcdir = tensorflow::testing::TensorFlowSrcRoot(); - std::string hlo_text_absolute_filename = - tensorflow::GetDataDependencyFilepath(hlo_text_filename); - TF_ASSERT_OK_AND_ASSIGN(auto module, - GetVerifiedHloModule(hlo_text_absolute_filename)); - TF_ASSERT_OK_AND_ASSIGN( - std::string errors, - CompileAndInjectErrors(std::move(module), breaking_stage)); - PatternMatch(errors, /*pattern_file=*/hlo_text_absolute_filename); -} - -StatusOr> -MlirIrGenTestBase::GetVerifiedHloModule(const std::string& hlo_text_filename) { - HloModuleConfig config; - config.set_debug_options(GetDebugOptionsForTest()); - auto module = absl::make_unique( - "Module", config, /*verifier_layout_sensitive=*/true, - /*allow_mixed_precision_in_hlo_verifier=*/false, - /*shape_size_function=*/ShapeUtil::ByteSizeOfElements); - std::string hlo_text; - TF_RETURN_IF_ERROR(tensorflow::ReadFileToString( - tensorflow::Env::Default(), hlo_text_filename, &hlo_text)); - TF_RETURN_IF_ERROR(module->ParseHloStringAndVerifyModule(hlo_text)); - return std::move(module); -} - -MlirCompiler* MlirIrGenTestBase::GetMLIRCompiler() { - // TODO(b/137624192): Remove failover once no longer in place. - auto* failover = static_cast(backend().compiler()); - return static_cast(failover->GetPrimary()); -} - -} // namespace mlir_gpu -} // namespace xla diff --git a/tensorflow/compiler/xla/service/mlir_gpu/mlir_irgen_test_base.h b/tensorflow/compiler/xla/service/mlir_gpu/mlir_irgen_test_base.h deleted file mode 100644 index 46246c0d4d6..00000000000 --- a/tensorflow/compiler/xla/service/mlir_gpu/mlir_irgen_test_base.h +++ /dev/null @@ -1,80 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_MLIR_GPU_MLIR_IRGEN_TEST_BASE_H_ -#define TENSORFLOW_COMPILER_XLA_SERVICE_MLIR_GPU_MLIR_IRGEN_TEST_BASE_H_ - -#include - -#include "tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler.h" -#include "tensorflow/compiler/xla/tests/codegen_test_base.h" - -namespace xla { -namespace mlir_gpu { - -// Tests that verify IR emitted by the CPU/GPU backend is as expected. -class MlirIrGenTestBase : public CodegenTestBase { - protected: - using LoweringStage = MlirCompiler::IRHook::LoweringStage; - - // Compiles the given HLO module to MLIR IR and verifies the IR matches the - // given pattern. 
`pattern` is in the FileCheck pattern matching syntax - // (http://llvm.org/docs/CommandGuide/FileCheck.html). - // - // This function invokes the JIT compiler. - // - // If `match_lowered_ir` is true, match the version of the IR after lowering - // steps to LLVM IR are applied; otherwise, the IR before lowering is - // matched. - void CompileAndVerifyIr(std::unique_ptr hlo_module, - const std::string& pattern_file, - LoweringStage printing_stage); - - // A thin wrapper around CompileAndVerifyIr that parses the hlo text in - // `hlo_text_filename` to create an HLO module. - void CompileAndVerifyIr(const std::string& hlo_text_filename, - LoweringStage printing_stage = LoweringStage::LHLO); - - // Adds the InjectErrorsForTestingPass to MLIRCompiler on the provided - // lowering stage, compiles the given HLO module, and returns a std::string - // representation of all the errors occurred during compiling. - StatusOr CompileAndInjectErrors(std::unique_ptr hlo_module, - LoweringStage breaking_stage); - - // Adds the InjectErrorsForTestingPass to MLIRCompiler on the provided - // lowering stage, parses and compiles `hlo_text`, and verifies that the - // std::string representation of all the errors occurred during compiling - // matches the given pattern. - void CompileAndVerifyErrors(const std::string& hlo_text_filename, - LoweringStage breaking_stage); - - private: - StatusOr> GetVerifiedHloModule( - const std::string& hlo_text_filename); - - void CompileIr(std::unique_ptr hlo_module, - const MlirCompiler::IRHook& ir_hook); - void PatternMatch(const std::string& str, const std::string& pattern_file); - std::string CompileIr(std::unique_ptr hlo_module, - LoweringStage printing_stage); - MlirCompiler::IRHook getIRHookBreakingLoweringStage( - LoweringStage breaking_stage); - MlirCompiler* GetMLIRCompiler(); -}; - -} // namespace mlir_gpu -} // namespace xla - -#endif // TENSORFLOW_COMPILER_XLA_SERVICE_MLIR_GPU_MLIR_IRGEN_TEST_BASE_H_ diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/BUILD b/tensorflow/compiler/xla/service/mlir_gpu/tests/BUILD index 20eb8a8766e..850d5f5a0cf 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/tests/BUILD +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/BUILD @@ -1,14 +1,9 @@ -# TODO(herhut): describe this package. 
- -load("//tensorflow:tensorflow.bzl", "tf_cc_test") load( "//tensorflow/core/platform:build_config_root.bzl", "tf_cuda_tests_tags", + "tf_exec_properties", ) -load( - "//tensorflow/core/platform/default:cuda_build_defs.bzl", - "if_cuda_is_configured", -) +load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests") package( default_visibility = [":friends"], @@ -22,49 +17,28 @@ package_group( ], ) -tf_cc_test( - name = "mlir_gpu_lhlo_gen_test", - srcs = if_cuda_is_configured(["mlir_gpu_lhlo_gen_test.cc"]), - data = [ - "abs.hlo", - "add.hlo", - "add_as_kernel.hlo", - "add_in_gpu_dialect.hlo", - "add_multiply.hlo", - "add_multiply_gpu.hlo", - "add_reduce.hlo", - "broadcast.hlo", - "broken_add.hlo", - "ceil.hlo", - "compare.hlo", - "concatenate.hlo", - "const.hlo", - "copy.hlo", - "copy_transpose.hlo", - "cos.hlo", - "exp.hlo", - "fused_reduce.hlo", - "iota.hlo", - "iota_add_multiply.hlo", - "log.hlo", - "neg.hlo", - "reduce_window.hlo", - "rem.hlo", - "rsqrt.hlo", - "select.hlo", - "select_and_scatter.hlo", - "sign.hlo", - "sqrt.hlo", - "tanh.hlo", +glob_lit_tests( + data = [":test_utilities"], + default_tags = tf_cuda_tests_tags() + [ + "no_pip", + "config-cuda-only", + "no_rocm", + ], + driver = "@llvm-project//mlir:run_lit.sh", + exclude = [ + # TODO(b/137624192): Reenable once we can fuse reductions. + "fused_reduce.hlo", + ], + exec_properties = tf_exec_properties({"tags": tf_cuda_tests_tags()}), + test_file_exts = ["hlo"], +) + +# Bundle together all of the test utilities that are used by tests. +filegroup( + name = "test_utilities", + testonly = True, + data = [ + "//tensorflow/compiler/xla/service/mlir_gpu:xla-gpu-opt", + "@llvm-project//llvm:FileCheck", ], - tags = tf_cuda_tests_tags() + ["no_rocm"], - deps = [ - "//tensorflow/core:test_main", - "//tensorflow/core:test", - ] + if_cuda_is_configured([ - "//tensorflow/core:lib", - "//tensorflow/compiler/xla/service:gpu_plugin_mlir", - "//tensorflow/compiler/xla/service/mlir_gpu:mlir_irgen_test_base", - "//tensorflow/stream_executor/lib", - ]), ) diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/abs.hlo b/tensorflow/compiler/xla/service/mlir_gpu/tests/abs.hlo index 6a4353d8d45..210d92d6ed2 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/tests/abs.hlo +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/abs.hlo @@ -1,3 +1,4 @@ +// RUN: xla-gpu-opt %s | FileCheck %s -dump-input-on-failure HloModule Abs ENTRY %Abs (val: f32[2,2]) -> f32[2,2] { %val = f32[2,2]{1,0} parameter(0) diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/add.hlo b/tensorflow/compiler/xla/service/mlir_gpu/tests/add.hlo index d48fcf89658..73005dc80e8 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/tests/add.hlo +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/add.hlo @@ -1,3 +1,4 @@ +// RUN: xla-gpu-opt %s | FileCheck %s -dump-input-on-failure HloModule Add ENTRY %Add (x: f32[2,2], y: f32[2,2]) -> f32[2,2] { diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/add_as_kernel.hlo b/tensorflow/compiler/xla/service/mlir_gpu/tests/add_as_kernel.hlo index c477cc99c39..3ee831fc74e 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/tests/add_as_kernel.hlo +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/add_as_kernel.hlo @@ -1,3 +1,4 @@ +// RUN: xla-gpu-opt -lowering-stage=KERNEL %s | FileCheck %s -dump-input-on-failure HloModule Add ENTRY %Add (x: f32[2,2], y: f32[2,2]) -> f32[2,2] { diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/add_in_gpu_dialect.hlo 
b/tensorflow/compiler/xla/service/mlir_gpu/tests/add_in_gpu_dialect.hlo index 208ca2799b2..af0bf743092 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/tests/add_in_gpu_dialect.hlo +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/add_in_gpu_dialect.hlo @@ -1,3 +1,4 @@ +// RUN: xla-gpu-opt -lowering-stage=GPU %s | FileCheck %s -dump-input-on-failure HloModule Add ENTRY %Add (x: f32[2,2], y: f32[2,2]) -> f32[2,2] { diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/add_multiply.hlo b/tensorflow/compiler/xla/service/mlir_gpu/tests/add_multiply.hlo index 58cba9711f3..5a972faa282 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/tests/add_multiply.hlo +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/add_multiply.hlo @@ -1,3 +1,4 @@ +// RUN: xla-gpu-opt %s | FileCheck %s -dump-input-on-failure HloModule AddMultiply ENTRY %AddMultiply (x: f32[2,2], y: f32[2,2], z: f32[2,2]) -> f32[2,2] { diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/add_multiply_gpu.hlo b/tensorflow/compiler/xla/service/mlir_gpu/tests/add_multiply_gpu.hlo index fe871c1feb6..bb32f08e69e 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/tests/add_multiply_gpu.hlo +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/add_multiply_gpu.hlo @@ -1,3 +1,4 @@ +// RUN: xla-gpu-opt -lowering-stage=GPU %s | FileCheck %s -dump-input-on-failure HloModule AddMultiply ENTRY %AddMultiply (x: f32[2,2], y: f32[2,2], z: f32[2,2]) -> f32[2,2] { @@ -19,4 +20,4 @@ ENTRY %AddMultiply (x: f32[2,2], y: f32[2,2], z: f32[2,2]) -> f32[2,2] { // CHECK: %[[V2:.*]] = load %{{.*\[}}[[CSTIDX:.*]]] // CHECK: %[[MUL:.*]] = mulf %[[ADD]], %[[V2]] // CHECK: store %[[MUL]], %{{.*\[}}[[CSTIDX:.*]]] -// CHECK-NEXT: return +// CHECK: return diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/add_reduce.hlo b/tensorflow/compiler/xla/service/mlir_gpu/tests/add_reduce.hlo index 6df8f284b72..85a7185cd50 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/tests/add_reduce.hlo +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/add_reduce.hlo @@ -1,3 +1,4 @@ +// RUN: xla-gpu-opt %s | FileCheck %s -dump-input-on-failure HloModule AddReduce %add (x: f32[], y: f32[]) -> f32[] { diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/broadcast.hlo b/tensorflow/compiler/xla/service/mlir_gpu/tests/broadcast.hlo index b0613ac96ac..7f4763ef74d 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/tests/broadcast.hlo +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/broadcast.hlo @@ -1,3 +1,4 @@ +// RUN: xla-gpu-opt %s | FileCheck %s -dump-input-on-failure HloModule Broadcast ENTRY %Broadcast (x: f32[10]) -> f32[10, 5] { diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/broken_add.hlo b/tensorflow/compiler/xla/service/mlir_gpu/tests/broken_add.hlo index b4b22f42f29..0aea08b699b 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/tests/broken_add.hlo +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/broken_add.hlo @@ -1,3 +1,4 @@ +// RUN: xla-gpu-opt -verify-errors %s | FileCheck %s -dump-input-on-failure HloModule Add ENTRY %Add (x: f32[2,2,2], y: f32[2,2,2]) -> f32[2,2,2] { diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/ceil.hlo b/tensorflow/compiler/xla/service/mlir_gpu/tests/ceil.hlo index ff4e8191da4..36699414c98 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/tests/ceil.hlo +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/ceil.hlo @@ -1,3 +1,4 @@ +// RUN: xla-gpu-opt %s | FileCheck %s -dump-input-on-failure HloModule Ceil ENTRY %Ceil (val: f32[2,2]) -> f32[2,2] { %val = f32[2,2]{1,0} 
parameter(0) diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/compare.hlo b/tensorflow/compiler/xla/service/mlir_gpu/tests/compare.hlo index a0f88efbd2f..d464db52e06 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/tests/compare.hlo +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/compare.hlo @@ -1,3 +1,4 @@ +// RUN: xla-gpu-opt %s | FileCheck %s -dump-input-on-failure HloModule Compare ENTRY %Compare (x: f32[2,2], y: f32[2,2]) -> pred[2,2] { diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/complex.hlo b/tensorflow/compiler/xla/service/mlir_gpu/tests/complex.hlo new file mode 100644 index 00000000000..974eb4e8cff --- /dev/null +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/complex.hlo @@ -0,0 +1,12 @@ +// RUN: xla-gpu-opt %s | FileCheck %s -dump-input-on-failure +HloModule Complex + +ENTRY %Complex (real: f32[2,2]{0,1}, imag: f32[2,2]{0,1}) -> c64[2,2] { + %real = f32[2,2]{0,1} parameter(0) + %imag = f32[2,2]{0,1} parameter(1) + ROOT %compl = c64[2,2]{0,1} complex(%real, %imag) +} + +// CHECK: func @complex(%[[REAL:.*]]: [[BUF_F32:.*]], %[[IMAG:.*]]: [[BUF_F32]], %[[OUT:.*]]: [[BUF_C64:.*]]) { +// CHECK: "xla_lhlo.complex"(%[[REAL]], %[[IMAG]], %[[OUT]]) : ([[BUF_F32]], [[BUF_F32]], [[BUF_C64]]) -> () +// CHECK: } diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/concatenate.hlo b/tensorflow/compiler/xla/service/mlir_gpu/tests/concatenate.hlo index e77a14d537e..dde3b739e2e 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/tests/concatenate.hlo +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/concatenate.hlo @@ -1,3 +1,4 @@ +// RUN: xla-gpu-opt %s | FileCheck %s -dump-input-on-failure HloModule Concatenate ENTRY %Concatenate (x: f32[2,3], y: f32[2,2]) -> f32[2,5] { diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/const.hlo b/tensorflow/compiler/xla/service/mlir_gpu/tests/const.hlo index 9c28b3619ac..43f0ffb809c 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/tests/const.hlo +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/const.hlo @@ -1,3 +1,4 @@ +// RUN: xla-gpu-opt %s | FileCheck %s -dump-input-on-failure HloModule Const ENTRY %Const () -> s32[100] { diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/copy.hlo b/tensorflow/compiler/xla/service/mlir_gpu/tests/copy.hlo index a729a4375b6..3cedc4c43e5 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/tests/copy.hlo +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/copy.hlo @@ -1,3 +1,4 @@ +// RUN: xla-gpu-opt %s | FileCheck %s -dump-input-on-failure HloModule Copy ENTRY %Copy (x: f32[2,4]) -> f32[2,4] { diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/copy_transpose.hlo b/tensorflow/compiler/xla/service/mlir_gpu/tests/copy_transpose.hlo index 2ad8c1b49e3..f462b6e0e69 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/tests/copy_transpose.hlo +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/copy_transpose.hlo @@ -1,3 +1,4 @@ +// RUN: xla-gpu-opt %s | FileCheck %s -dump-input-on-failure HloModule CopyTranspose ENTRY %CopyTranspose (x: f32[2,4]) -> f32[2,4]{0,1} { diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/cos.hlo b/tensorflow/compiler/xla/service/mlir_gpu/tests/cos.hlo index e10b8e72f34..80353b7b3a8 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/tests/cos.hlo +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/cos.hlo @@ -1,3 +1,4 @@ +// RUN: xla-gpu-opt %s | FileCheck %s -dump-input-on-failure HloModule Cos ENTRY %Cos (val: f32[2,2]) -> f32[2,2] { %val = f32[2,2]{1,0} parameter(0) diff --git 
a/tensorflow/compiler/xla/service/mlir_gpu/tests/exp.hlo b/tensorflow/compiler/xla/service/mlir_gpu/tests/exp.hlo index 5eec5d98b22..03eef5b2a8c 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/tests/exp.hlo +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/exp.hlo @@ -1,3 +1,4 @@ +// RUN: xla-gpu-opt %s | FileCheck %s -dump-input-on-failure HloModule Exp ENTRY %Exp (x: f32[2,2]) -> f32[2,2] { diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/fused_reduce.hlo b/tensorflow/compiler/xla/service/mlir_gpu/tests/fused_reduce.hlo index a673469977f..98b22c5b503 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/tests/fused_reduce.hlo +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/fused_reduce.hlo @@ -1,3 +1,4 @@ +// RUN: xla-gpu-opt %s | FileCheck %s -dump-input-on-failure HloModule FusedReduce %add (x: f32[], y: f32[]) -> f32[] { diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/imag.hlo b/tensorflow/compiler/xla/service/mlir_gpu/tests/imag.hlo new file mode 100644 index 00000000000..ca79c840ef8 --- /dev/null +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/imag.hlo @@ -0,0 +1,11 @@ +// RUN: xla-gpu-opt %s | FileCheck %s -dump-input-on-failure +HloModule Imag + +ENTRY %Imag (x: c64[2,2]{0,1}) -> f32[2,2] { + %x = c64[2,2]{0,1} parameter(0) + ROOT %imag = f32[2,2]{0,1} imag(%x) +} + +// CHECK: func @imag(%[[IN:.*]]: [[BUF_C64:.*]], %[[OUT:.*]]: [[BUF_F32:.*]]) { +// CHECK: "xla_lhlo.imag"(%[[IN]], %[[OUT]]) : ([[BUF_C64]], [[BUF_F32]]) -> () +// CHECK: } diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/iota.hlo b/tensorflow/compiler/xla/service/mlir_gpu/tests/iota.hlo index d622ed0e528..8d903987b78 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/tests/iota.hlo +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/iota.hlo @@ -1,3 +1,4 @@ +// RUN: xla-gpu-opt %s | FileCheck %s -dump-input-on-failure HloModule Iota ENTRY %Iota() -> s64[10, 5] { diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/iota_add_multiply.hlo b/tensorflow/compiler/xla/service/mlir_gpu/tests/iota_add_multiply.hlo deleted file mode 100644 index 89b7a43a102..00000000000 --- a/tensorflow/compiler/xla/service/mlir_gpu/tests/iota_add_multiply.hlo +++ /dev/null @@ -1,15 +0,0 @@ -HloModule AddMultiply - -ENTRY %AddMultiply (x: s32[2,2], y: s32[2,2]) -> s32[2,2] { - %x = s32[2,2]{1,0} parameter(0) - %y = s32[2,2]{1,0} parameter(1) - - %add = s32[2,2]{1,0} add(s32[2,2]{1,0} %x, s32[2,2]{1,0} %y) - %iota = s32[2, 2]{1,0} iota(), iota_dimension=0 - - ROOT %mul = s32[2,2]{1,0} multiply(s32[2,2]{1,0} %add, s32[2,2]{1,0} %iota) -} - -// CHECK-NOT: store -// CHECK: %[[RESULT:.*]] = muli %{{.*}}, %{{.*}} -// CHECK: store %[[RESULT]] diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/iota_add_subtract.hlo b/tensorflow/compiler/xla/service/mlir_gpu/tests/iota_add_subtract.hlo new file mode 100644 index 00000000000..f42a7cf7ca6 --- /dev/null +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/iota_add_subtract.hlo @@ -0,0 +1,16 @@ +// RUN: xla-gpu-opt -lowering-stage=GPU %s | FileCheck %s -dump-input-on-failure +HloModule AddSubtract + +ENTRY %AddSubtract (x: s32[2,2], y: s32[2,2]) -> s32[2,2] { + %x = s32[2,2]{1,0} parameter(0) + %y = s32[2,2]{1,0} parameter(1) + + %add = s32[2,2]{1,0} add(s32[2,2]{1,0} %x, s32[2,2]{1,0} %y) + %iota = s32[2, 2]{1,0} iota(), iota_dimension=0 + + ROOT %sub = s32[2,2]{1,0} subtract(s32[2,2]{1,0} %add, s32[2,2]{1,0} %iota) +} + +// CHECK-NOT: store +// CHECK: [[RESULT:%.*]] = subi %{{.*}}, %{{.*}} +// CHECK: store [[RESULT]] diff --git 
a/tensorflow/compiler/xla/service/mlir_gpu/tests/log.hlo b/tensorflow/compiler/xla/service/mlir_gpu/tests/log.hlo index c7e2574558a..ac73201578e 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/tests/log.hlo +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/log.hlo @@ -1,3 +1,4 @@ +// RUN: xla-gpu-opt %s | FileCheck %s -dump-input-on-failure HloModule Log ENTRY %Log (x: f32[2,2]) -> f32[2,2] { diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/mlir_gpu_lhlo_gen_test.cc b/tensorflow/compiler/xla/service/mlir_gpu/tests/mlir_gpu_lhlo_gen_test.cc deleted file mode 100644 index 3c69597fbd7..00000000000 --- a/tensorflow/compiler/xla/service/mlir_gpu/tests/mlir_gpu_lhlo_gen_test.cc +++ /dev/null @@ -1,228 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/compiler/xla/service/mlir_gpu/mlir_irgen_test_base.h" -#include "tensorflow/core/platform/path.h" - -namespace xla { -namespace mlir_gpu { - -class LhloGenTest : public MlirIrGenTestBase {}; - -TEST_F(LhloGenTest, Const) { - CompileAndVerifyIr( - /*hlo_text_filename=*/tensorflow::io::JoinPath( - "tensorflow", "compiler", "xla", "service", "mlir_gpu", "tests", - "const.hlo"), - LoweringStage::LHLO); -} - -TEST_F(LhloGenTest, BrokenAdd) { - CompileAndVerifyErrors( - /*hlo_text_filename=*/ - tensorflow::io::JoinPath("tensorflow", "compiler", "xla", "service", - "mlir_gpu", "tests", "broken_add.hlo"), - LoweringStage::LHLO); -} - -TEST_F(LhloGenTest, Add) { - CompileAndVerifyIr( - /*hlo_text_filename=*/tensorflow::io::JoinPath( - "tensorflow", "compiler", "xla", "service", "mlir_gpu", "tests", - "add.hlo")); -} - -TEST_F(LhloGenTest, Compare) { - CompileAndVerifyIr( - /*hlo_text_filename=*/tensorflow::io::JoinPath( - "tensorflow", "compiler", "xla", "service", "mlir_gpu", "tests", - "compare.hlo")); -} - -TEST_F(LhloGenTest, Copy) { - CompileAndVerifyIr( - /*hlo_text_filename=*/tensorflow::io::JoinPath( - "tensorflow", "compiler", "xla", "service", "mlir_gpu", "tests", - "copy.hlo")); -} - -TEST_F(LhloGenTest, CopyTranspose) { - CompileAndVerifyIr( - /*hlo_text_filename=*/tensorflow::io::JoinPath( - "tensorflow", "compiler", "xla", "service", "mlir_gpu", "tests", - "copy_transpose.hlo")); -} - -TEST_F(LhloGenTest, Select) { - CompileAndVerifyIr( - /*hlo_text_filename=*/tensorflow::io::JoinPath( - "tensorflow", "compiler", "xla", "service", "mlir_gpu", "tests", - "select.hlo")); -} - -TEST_F(LhloGenTest, Exp) { - CompileAndVerifyIr( - /*hlo_text_filename=*/tensorflow::io::JoinPath( - "tensorflow", "compiler", "xla", "service", "mlir_gpu", "tests", - "exp.hlo")); -} - -TEST_F(LhloGenTest, Log) { - CompileAndVerifyIr( - /*hlo_text_filename=*/tensorflow::io::JoinPath( - "tensorflow", "compiler", "xla", "service", "mlir_gpu", "tests", - "log.hlo")); -} - -TEST_F(LhloGenTest, AddInGPUDialect) { - CompileAndVerifyIr( - /*hlo_text_filename=*/ - tensorflow::io::JoinPath("tensorflow", 
"compiler", "xla", "service", - "mlir_gpu", "tests", "add_in_gpu_dialect.hlo"), - LoweringStage::GPU); -} - -// This test verifies that the kernel signature is amended correctly. The actual -// body of the generated function does not matter, it is already checked at the -// GPU level above. -TEST_F(LhloGenTest, AddAsKernel) { - CompileAndVerifyIr( - tensorflow::io::JoinPath("tensorflow", "compiler", "xla", "service", - "mlir_gpu", "tests", "add_as_kernel.hlo"), - LoweringStage::KERNEL); -} - -// TODO(b/149302060) Reenable once fusion is fixed. -TEST_F(LhloGenTest, DISABLED_AddMultiply) { - CompileAndVerifyIr(tensorflow::io::JoinPath("tensorflow", "compiler", "xla", - "service", "mlir_gpu", "tests", - "add_multiply.hlo")); -} - -// TODO(b/149302060) Reenable once fusion is fixed. -TEST_F(LhloGenTest, DISABLED_IotaAddMultiply) { - CompileAndVerifyIr( - tensorflow::io::JoinPath("tensorflow", "compiler", "xla", "service", - "mlir_gpu", "tests", "iota_add_multiply.hlo"), - LoweringStage::GPU); -} - -TEST_F(LhloGenTest, AddMultiplyGPU) { - CompileAndVerifyIr( - tensorflow::io::JoinPath("tensorflow", "compiler", "xla", "service", - "mlir_gpu", "tests", "add_multiply_gpu.hlo"), - LoweringStage::GPU); -} - -// TODO(b/137624192): Reenable once we can fuse reductions. -TEST_F(LhloGenTest, DISABLED_FusedReduce) { - CompileAndVerifyIr(tensorflow::io::JoinPath("tensorflow", "compiler", "xla", - "service", "mlir_gpu", "tests", - "fused_reduce.hlo")); -} - -TEST_F(LhloGenTest, Broadcast) { - CompileAndVerifyIr(tensorflow::io::JoinPath("tensorflow", "compiler", "xla", - "service", "mlir_gpu", "tests", - "broadcast.hlo")); -} - -TEST_F(LhloGenTest, Iota) { - CompileAndVerifyIr(tensorflow::io::JoinPath("tensorflow", "compiler", "xla", - "service", "mlir_gpu", "tests", - "iota.hlo")); -} - -TEST_F(LhloGenTest, AddReduce) { - CompileAndVerifyIr(tensorflow::io::JoinPath("tensorflow", "compiler", "xla", - "service", "mlir_gpu", "tests", - "add_reduce.hlo")); -} - -TEST_F(LhloGenTest, Abs) { - CompileAndVerifyIr(tensorflow::io::JoinPath("tensorflow", "compiler", "xla", - "service", "mlir_gpu", "tests", - "abs.hlo")); -} - -TEST_F(LhloGenTest, Ceil) { - CompileAndVerifyIr(tensorflow::io::JoinPath("tensorflow", "compiler", "xla", - "service", "mlir_gpu", "tests", - "ceil.hlo")); -} - -TEST_F(LhloGenTest, Cos) { - CompileAndVerifyIr(tensorflow::io::JoinPath("tensorflow", "compiler", "xla", - "service", "mlir_gpu", "tests", - "cos.hlo")); -} - -TEST_F(LhloGenTest, Neg) { - CompileAndVerifyIr(tensorflow::io::JoinPath("tensorflow", "compiler", "xla", - "service", "mlir_gpu", "tests", - "neg.hlo")); -} - -TEST_F(LhloGenTest, ReduceWindow) { - CompileAndVerifyIr(tensorflow::io::JoinPath("tensorflow", "compiler", "xla", - "service", "mlir_gpu", "tests", - "reduce_window.hlo")); -} - -TEST_F(LhloGenTest, Rem) { - CompileAndVerifyIr(tensorflow::io::JoinPath("tensorflow", "compiler", "xla", - "service", "mlir_gpu", "tests", - "rem.hlo")); -} - -TEST_F(LhloGenTest, Rsqrt) { - CompileAndVerifyIr(tensorflow::io::JoinPath("tensorflow", "compiler", "xla", - "service", "mlir_gpu", "tests", - "rsqrt.hlo")); -} - -TEST_F(LhloGenTest, SelectAndScatter) { - CompileAndVerifyIr(tensorflow::io::JoinPath("tensorflow", "compiler", "xla", - "service", "mlir_gpu", "tests", - "select_and_scatter.hlo")); -} - -TEST_F(LhloGenTest, Sign) { - CompileAndVerifyIr(tensorflow::io::JoinPath("tensorflow", "compiler", "xla", - "service", "mlir_gpu", "tests", - "rsqrt.hlo")); -} - -TEST_F(LhloGenTest, Sqrt) { - CompileAndVerifyIr( - 
/*hlo_text_filename=*/tensorflow::io::JoinPath( - "tensorflow", "compiler", "xla", "service", "mlir_gpu", "tests", - "sqrt.hlo")); -} - -TEST_F(LhloGenTest, Tanh) { - CompileAndVerifyIr(tensorflow::io::JoinPath("tensorflow", "compiler", "xla", - "service", "mlir_gpu", "tests", - "tanh.hlo")); -} - -TEST_F(LhloGenTest, Concatenate) { - CompileAndVerifyIr(tensorflow::io::JoinPath("tensorflow", "compiler", "xla", - "service", "mlir_gpu", "tests", - "concatenate.hlo")); -} - -} // namespace mlir_gpu -} // namespace xla diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/neg.hlo b/tensorflow/compiler/xla/service/mlir_gpu/tests/neg.hlo index e0b42c4da12..f1914030841 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/tests/neg.hlo +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/neg.hlo @@ -1,3 +1,4 @@ +// RUN: xla-gpu-opt %s | FileCheck %s -dump-input-on-failure HloModule Neg ENTRY %Neg (val: f32[2,2]) -> f32[2,2] { %val = f32[2,2]{1,0} parameter(0) diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/real.hlo b/tensorflow/compiler/xla/service/mlir_gpu/tests/real.hlo new file mode 100644 index 00000000000..cb19c392b7d --- /dev/null +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/real.hlo @@ -0,0 +1,11 @@ +// RUN: xla-gpu-opt %s | FileCheck %s -dump-input-on-failure +HloModule Real + +ENTRY %Real (x: c64[2,2]{0,1}) -> f32[2,2] { + %x = c64[2,2]{0,1} parameter(0) + ROOT %real = f32[2,2]{0,1} real(%x) +} + +// CHECK: func @real(%[[IN:.*]]: [[BUF_C64:.*]], %[[OUT:.*]]: [[BUF_F32:.*]]) { +// CHECK: "xla_lhlo.real"(%[[IN]], %[[OUT]]) : ([[BUF_C64]], [[BUF_F32]]) -> () +// CHECK: } diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/reduce_window.hlo b/tensorflow/compiler/xla/service/mlir_gpu/tests/reduce_window.hlo index 1d4786e8151..8284e054d23 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/tests/reduce_window.hlo +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/reduce_window.hlo @@ -1,3 +1,4 @@ +// RUN: xla-gpu-opt %s | FileCheck %s -dump-input-on-failure HloModule ReduceWindow %max (x: f32[], y: f32[]) -> f32[] { diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/rem.hlo b/tensorflow/compiler/xla/service/mlir_gpu/tests/rem.hlo index 441ace6ef94..f3ac9bf6529 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/tests/rem.hlo +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/rem.hlo @@ -1,3 +1,4 @@ +// RUN: xla-gpu-opt %s | FileCheck %s -dump-input-on-failure HloModule Rem ENTRY %Rem(x: f32[2,2], y: f32[2,2]) -> f32[2,2] { %x = f32[2,2]{1,0} parameter(0) diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/rsqrt.hlo b/tensorflow/compiler/xla/service/mlir_gpu/tests/rsqrt.hlo index a10f9ada92b..fb6d995a1aa 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/tests/rsqrt.hlo +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/rsqrt.hlo @@ -1,3 +1,4 @@ +// RUN: xla-gpu-opt %s | FileCheck %s -dump-input-on-failure HloModule Rsqrt ENTRY %Rsqrt (x: f32[2,2]) -> f32[2,2] { diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/select.hlo b/tensorflow/compiler/xla/service/mlir_gpu/tests/select.hlo index 0cbe8c73700..05c5ca68679 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/tests/select.hlo +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/select.hlo @@ -1,3 +1,4 @@ +// RUN: xla-gpu-opt %s | FileCheck %s -dump-input-on-failure HloModule Select ENTRY %Select (p: pred[2,2], x: f32[2,2], y: f32[2,2]) -> f32[2,2] { diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/select_and_scatter.hlo 
b/tensorflow/compiler/xla/service/mlir_gpu/tests/select_and_scatter.hlo index 21979a2815f..abc289ef83a 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/tests/select_and_scatter.hlo +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/select_and_scatter.hlo @@ -1,3 +1,4 @@ +// RUN: xla-gpu-opt %s | FileCheck %s -dump-input-on-failure HloModule SelectAndScatter %ge (x: f32[], y: f32[]) -> pred[] { diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/sign.hlo b/tensorflow/compiler/xla/service/mlir_gpu/tests/sign.hlo index a0ff329938b..0952777903b 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/tests/sign.hlo +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/sign.hlo @@ -1,3 +1,4 @@ +// RUN: xla-gpu-opt %s | FileCheck %s -dump-input-on-failure HloModule Sign ENTRY %Sign (val: f32[2,2]) -> f32[2,2] { %val = f32[2,2]{1,0} parameter(0) diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/sqrt.hlo b/tensorflow/compiler/xla/service/mlir_gpu/tests/sqrt.hlo index 95461b912a3..528b97d2765 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/tests/sqrt.hlo +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/sqrt.hlo @@ -1,3 +1,4 @@ +// RUN: xla-gpu-opt %s | FileCheck %s -dump-input-on-failure HloModule Sqrt ENTRY %Sqrt (x: f32[2,2]) -> f32[2,2] { diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/tanh.hlo b/tensorflow/compiler/xla/service/mlir_gpu/tests/tanh.hlo index d539b3002dc..bf5c6dfde6a 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/tests/tanh.hlo +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/tanh.hlo @@ -1,3 +1,4 @@ +// RUN: xla-gpu-opt %s | FileCheck %s -dump-input-on-failure HloModule Tanh ENTRY %Tanh (val: f32[2,2]) -> f32[2,2] { %val = f32[2,2]{1,0} parameter(0) diff --git a/tensorflow/compiler/xla/service/mlir_gpu/xla_gpu_opt.cc b/tensorflow/compiler/xla/service/mlir_gpu/xla_gpu_opt.cc new file mode 100644 index 00000000000..05a7b5b6bbf --- /dev/null +++ b/tensorflow/compiler/xla/service/mlir_gpu/xla_gpu_opt.cc @@ -0,0 +1,166 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/mlir_gpu/xla_gpu_opt.h" + +#include +#include + +#include "absl/strings/str_join.h" +#include "llvm/Support/raw_ostream.h" +#include "mlir/IR/Module.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "tensorflow/compiler/xla/debug_options_flags.h" +#include "tensorflow/compiler/xla/service/hlo_module_config.h" +#include "tensorflow/compiler/xla/service/mlir_gpu/failover_compiler.h" +#include "tensorflow/compiler/xla/service/mlir_gpu/inject_errors_pass.h" +#include "tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/status.h" +#include "tensorflow/compiler/xla/tests/verified_hlo_module.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/stream_executor/lib/statusor.h" + +namespace xla { +namespace mlir_gpu { + +Status XlaGpuOpt::CompileIr(std::unique_ptr hlo_module, + const MlirCompiler::IRHook& ir_hook) { + MlirCompiler* compiler = GetMLIRCompiler(); + compiler->SetModuleHook(ir_hook); + TF_ASSIGN_OR_RETURN(hlo_module, backend_->compiler()->RunHloPasses( + std::move(hlo_module), + backend_->default_stream_executor(), + /*device_allocator=*/nullptr)); + Status status = backend_->compiler() + ->RunBackend(std::move(hlo_module), + backend_->default_stream_executor(), + /*device_allocator=*/nullptr) + .status(); + compiler->RemoveModuleHook(); + return status; +} + +StatusOr XlaGpuOpt::CompileIr( + std::unique_ptr hlo_module, + MlirCompiler::IRHook::LoweringStage printing_stage) { + std::string ir; + TF_RETURN_IF_ERROR(CompileIr( + std::move(hlo_module), {[&ir](mlir::ModuleOp module) -> Status { + std::string buffer_string; + llvm::raw_string_ostream ostream(buffer_string); + module.print(ostream); + ostream.flush(); + ir = buffer_string; + return Status::OK(); + }, + printing_stage})); + return ir; +} + +Status XlaGpuOpt::CompileAndOutputIr(std::unique_ptr hlo_module, + llvm::raw_ostream& os, + LoweringStage printing_stage) { + TF_ASSIGN_OR_RETURN(std::string ir, + CompileIr(std::move(hlo_module), printing_stage)); + os << ir; + return Status::OK(); +} + +Status XlaGpuOpt::CompileAndOutputIr(const std::string& hlo_text, + llvm::raw_ostream& os, + LoweringStage printing_stage) { + TF_ASSIGN_OR_RETURN(auto module, GetVerifiedHloModule(hlo_text)); + return CompileAndOutputIr(std::move(module), os, printing_stage); +} + +MlirCompiler::IRHook XlaGpuOpt::GetIRHookBreakingLoweringStage( + LoweringStage breaking_stage) { + return {[](mlir::ModuleOp module) -> Status { + mlir::PassManager pm(module.getContext()); + pm.addPass(::mlir::createInjectErrorsForTestingPass()); + if (failed(pm.run(module))) { + return InternalError("InjectErrorsForTestingPass failed."); + } + return Status::OK(); + }, + breaking_stage}; +} + +StatusOr XlaGpuOpt::CompileAndInjectErrors( + std::unique_ptr hlo_module, LoweringStage breaking_stage) { + std::string errors; + auto error_handler = [&errors](const EmissionContext::ErrorMap& error_map, + HloModule* hlo_module) { + errors = "ERRORS FOUND: "; + for (auto& err : error_map) { + errors += "[" + err.first->ToString() + ": " + + absl::StrJoin(err.second, "; ") + "]"; + } + }; + + MlirCompiler* compiler = GetMLIRCompiler(); + compiler->SetModuleHook(GetIRHookBreakingLoweringStage(breaking_stage)); + compiler->SetErrorHandler(error_handler); + TF_ASSIGN_OR_RETURN( + hlo_module, 
compiler->RunHloPasses(std::move(hlo_module), + backend_->default_stream_executor(), + /*device_allocator=*/nullptr)); + Status status = compiler + ->RunBackend(std::move(hlo_module), + backend_->default_stream_executor(), + /*device_allocator=*/nullptr) + .status(); + compiler->RemoveModuleHook(); + compiler->RemoveErrorHandler(); + if (status.ok()) { + return errors; + } + return status; +} + +Status XlaGpuOpt::CompileAndExpectErrors(const std::string& hlo_text, + llvm::raw_ostream& os, + LoweringStage breaking_stage) { + TF_ASSIGN_OR_RETURN(auto module, GetVerifiedHloModule(hlo_text)); + TF_ASSIGN_OR_RETURN( + std::string errors, + CompileAndInjectErrors(std::move(module), breaking_stage)); + os << errors; + return Status::OK(); +} + +StatusOr> XlaGpuOpt::GetVerifiedHloModule( + const std::string& hlo_text) { + HloModuleConfig config; + auto debug_options = GetDebugOptionsFromFlags(); + debug_options.add_xla_disable_hlo_passes("constant_folding"); + config.set_debug_options(debug_options); + auto module = absl::make_unique( + "Module", config, /*verifier_layout_sensitive=*/true, + /*allow_mixed_precision_in_hlo_verifier=*/false, + /*shape_size_function=*/ShapeUtil::ByteSizeOfElements); + TF_RETURN_IF_ERROR(module->ParseHloStringAndVerifyModule(hlo_text)); + return std::move(module); +} + +MlirCompiler* XlaGpuOpt::GetMLIRCompiler() { + // TODO(b/137624192): Remove failover once no longer in place. + auto* failover = static_cast(backend_->compiler()); + return static_cast(failover->GetPrimary()); +} + +} // namespace mlir_gpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/mlir_gpu/xla_gpu_opt.h b/tensorflow/compiler/xla/service/mlir_gpu/xla_gpu_opt.h new file mode 100644 index 00000000000..6a46f921417 --- /dev/null +++ b/tensorflow/compiler/xla/service/mlir_gpu/xla_gpu_opt.h @@ -0,0 +1,76 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_MLIR_GPU_XLA_GPU_OPT_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_MLIR_GPU_XLA_GPU_OPT_H_ + +#include +#include + +#include "llvm/Support/raw_ostream.h" +#include "tensorflow/compiler/xla/service/backend.h" +#include "tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler.h" +#include "tensorflow/compiler/xla/status.h" +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/tests/verified_hlo_module.h" + +namespace xla { +namespace mlir_gpu { + +// Prints the IR created by the MLIR GPU backend at a certain lowering stage. +class XlaGpuOpt { + public: + using LoweringStage = MlirCompiler::IRHook::LoweringStage; + XlaGpuOpt() { + backend_ = std::move(Backend::CreateDefaultBackend().ValueOrDie()); + } + + // Compiles the HLO module given in 'hlo_text' to a GpuExecutable and prints + // the IR at the lowering stage 'printing_stage' to the 'os' stream. + // + // This function invokes the JIT compiler. 
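A minimal usage sketch of this entry point, assuming the default backend created in the constructor above is available on the machine and that the HLO text parses; a caller could capture the LHLO-stage IR into a string like this (illustrative only, not taken from the patch):

    #include "llvm/Support/raw_ostream.h"
    #include "tensorflow/compiler/xla/service/mlir_gpu/xla_gpu_opt.h"

    std::string DumpLhloForHlo(const std::string& hlo_text) {
      std::string ir;
      llvm::raw_string_ostream os(ir);
      xla::mlir_gpu::XlaGpuOpt opt;
      xla::Status status = opt.CompileAndOutputIr(
          hlo_text, os, xla::mlir_gpu::XlaGpuOpt::LoweringStage::LHLO);
      if (!status.ok()) {
        return "compilation failed: " + status.error_message();
      }
      return os.str();
    }

The xla-gpu-opt driver further below does essentially the same thing, reading the HLO text from a file and writing the printed IR to its -o output. The remaining declarations of the header continue below.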
+ Status CompileAndOutputIr(const std::string& hlo_text, llvm::raw_ostream& os, + LoweringStage printing_stage = LoweringStage::LHLO); + + // Adds the InjectErrorsForTestingPass to MLIRCompiler on the provided + // lowering stage 'breaking_stage', parses and compiles `hlo_text`, and prints + // the resulting errors to the 'os' stream. + Status CompileAndExpectErrors(const std::string& hlo_text, + llvm::raw_ostream& os, + LoweringStage breaking_stage); + + private: + std::unique_ptr backend_; + StatusOr> GetVerifiedHloModule( + const std::string& hlo_text_filename); + + Status CompileAndOutputIr(std::unique_ptr hlo_module, + llvm::raw_ostream& os, + LoweringStage printing_stage); + Status CompileIr(std::unique_ptr hlo_module, + const MlirCompiler::IRHook& ir_hook); + StatusOr CompileIr(std::unique_ptr hlo_module, + LoweringStage printing_stage); + MlirCompiler::IRHook GetIRHookBreakingLoweringStage( + LoweringStage breaking_stage); + StatusOr CompileAndInjectErrors( + std::unique_ptr hlo_module, LoweringStage breaking_stage); + MlirCompiler* GetMLIRCompiler(); +}; + +} // namespace mlir_gpu +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_MLIR_GPU_XLA_GPU_OPT_H_ diff --git a/tensorflow/compiler/xla/service/mlir_gpu/xla_gpu_opt_main.cc b/tensorflow/compiler/xla/service/mlir_gpu/xla_gpu_opt_main.cc new file mode 100644 index 00000000000..f60eea6aead --- /dev/null +++ b/tensorflow/compiler/xla/service/mlir_gpu/xla_gpu_opt_main.cc @@ -0,0 +1,90 @@ +/* Copyright 2020 Google Inc. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include + +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/ToolOutputFile.h" +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Support/FileUtilities.h" // from @llvm-project +#include "tensorflow/compiler/mlir/init_mlir.h" +#include "tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler.h" +#include "tensorflow/compiler/xla/service/mlir_gpu/xla_gpu_opt.h" +#include "tensorflow/compiler/xla/status.h" +#include "tensorflow/core/platform/logging.h" + +// NOLINTNEXTLINE +static llvm::cl::opt input_filename(llvm::cl::Positional, + llvm::cl::desc(""), + llvm::cl::init("-")); + +// NOLINTNEXTLINE +static llvm::cl::opt output_filename( + "o", llvm::cl::desc("Output filename"), llvm::cl::value_desc("filename"), + llvm::cl::init("-")); + +// NOLINTNEXTLINE +static llvm::cl::opt verify_errors( + "verify-errors", + llvm::cl::desc("Whether we expect errors which should be verified"), + llvm::cl::init(false)); + +static llvm::cl::opt + // NOLINTNEXTLINE + lowering_stage( + "lowering-stage", + llvm::cl::desc( + "The lowering stage up to which the compiler will be run"), + llvm::cl::values( + clEnumValN(xla::mlir_gpu::MlirCompiler::IRHook::LoweringStage::LHLO, + "LHLO", "LHLO"), + clEnumValN(xla::mlir_gpu::MlirCompiler::IRHook::LoweringStage::GPU, + "GPU", "GPU"), + clEnumValN(xla::mlir_gpu::MlirCompiler::IRHook::LoweringStage::LLVM, + "LLVM", "LLVM"), + clEnumValN( + xla::mlir_gpu::MlirCompiler::IRHook::LoweringStage::KERNEL, + "KERNEL", "Kernel")), + llvm::cl::init( + xla::mlir_gpu::MlirCompiler::IRHook::LoweringStage::LHLO)); + +int main(int argc, char **argv) { + tensorflow::InitMlir y(&argc, &argv); + mlir::registerPassManagerCLOptions(); + + llvm::cl::ParseCommandLineOptions(argc, argv, + "XLA GPU modular optimizer driver\n"); + + // Set up the input file. + std::string error_message; + auto file = mlir::openInputFile(input_filename, &error_message); + QCHECK(file) << error_message; + + auto output = mlir::openOutputFile(output_filename, &error_message); + QCHECK(output) << error_message; + + xla::mlir_gpu::XlaGpuOpt opt; + xla::Status status = + verify_errors ? opt.CompileAndExpectErrors(file->getBuffer().str(), + output->os(), lowering_stage) + : opt.CompileAndOutputIr(file->getBuffer().str(), + output->os(), lowering_stage); + if (!status.ok()) { + LOG(ERROR) << status.error_message(); + return 1; + } + output->keep(); + return 0; +} diff --git a/tensorflow/compiler/xla/service/rng_bit_generator_expander.cc b/tensorflow/compiler/xla/service/rng_bit_generator_expander.cc index 24565746b4a..52901df5bf1 100644 --- a/tensorflow/compiler/xla/service/rng_bit_generator_expander.cc +++ b/tensorflow/compiler/xla/service/rng_bit_generator_expander.cc @@ -30,6 +30,23 @@ limitations under the License. 
#include "tensorflow/stream_executor/lib/statusor.h" namespace xla { +namespace { + +XlaOp GetPhiloxStateOp(XlaOp input_state, const Shape& state_shape) { + if (state_shape.dimensions(0) >= 3) { + return Slice(input_state, {1}, {3}, {1}); + } + return Rev(input_state, {0}); +} + +XlaOp GetPhiloxOutputStateOp(XlaOp output_state, const Shape& state_shape) { + if (state_shape.dimensions(0) < 3) { + output_state = Slice(output_state, {0}, {1}, {1}); + } + return output_state; +} + +} // namespace bool RngBitGeneratorExpander::InstructionMatchesPattern( HloInstruction* instruction) { @@ -48,24 +65,22 @@ StatusOr RngBitGeneratorExpander::GetGeneratorComputation( XlaBuilder builder("rng"); XlaOp state_param = Parameter(&builder, 0, state_shape, "state"); XlaOp key_op = Reshape(Slice(state_param, {0}, {1}, {1}), {}); - XlaOp state_op; - - BitGeneratorTy generator = nullptr; + RngOutput output; switch (algorithm) { case RandomAlgorithm::RNG_THREE_FRY: - generator = ThreeFryBitGenerator; - state_op = Slice(state_param, {1}, {2}, {1}); + output = ThreeFryBitGenerator(key_op, Slice(state_param, {1}, {2}, {1}), + data_shape); break; case RandomAlgorithm::RNG_PHILOX: - generator = PhiloxBitGenerator; - state_op = Slice(state_param, {1}, {3}, {1}); + output = PhiloxBitGenerator( + key_op, GetPhiloxStateOp(state_param, state_shape), data_shape); + output.state = GetPhiloxOutputStateOp(output.state, state_shape); break; default: return Unimplemented("Unsupported random algorthm: %s", RandomAlgorithm_Name(algorithm)); } - RngOutput output = generator(key_op, state_op, data_shape); XlaOp final_state = ConcatInDim(&builder, {Reshape(key_op, {1}), output.state}, 0); Tuple(&builder, {final_state, output.value}); diff --git a/tensorflow/compiler/xla/service/root_instruction_sinker.cc b/tensorflow/compiler/xla/service/root_instruction_sinker.cc new file mode 100644 index 00000000000..bee703b85e5 --- /dev/null +++ b/tensorflow/compiler/xla/service/root_instruction_sinker.cc @@ -0,0 +1,73 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/root_instruction_sinker.h" + +#include "tensorflow/compiler/xla/service/tuple_util.h" +namespace xla { + +namespace { + +// Sinks the root of the given computation for tuple root types. +void SinkTupleRoot(HloComputation* computation) { + HloInstruction* root = computation->root_instruction(); + CHECK(root->shape().IsTuple()); + HloInstruction* new_root = TupleUtil::Duplicate(root); + // Add the new instructions to the schedule. + HloInstructionSequence& sequence = + computation->parent()->schedule().GetOrCreateSequence(computation); + for (HloInstruction* operand : new_root->operands()) { + sequence.push_back(operand); + } + sequence.push_back(new_root); + computation->set_root_instruction(new_root); +} + +// Sinks the root of the given computation for not-tuple root types. 
+void SinkNontupleRoot(HloComputation* computation) { + HloInstruction* root = computation->root_instruction(); + CHECK(!root->shape().IsTuple()); + HloInstruction* new_root = computation->AddInstruction( + HloInstruction::CreateBitcast(root->shape(), root)); + HloInstructionSequence& sequence = + computation->parent()->schedule().GetOrCreateSequence(computation); + sequence.push_back(new_root); + computation->set_root_instruction(new_root); +} + +} // namespace + +StatusOr RootInstructionSinker::Run(HloModule* module) { + TF_RET_CHECK(module->has_schedule()); + + bool modified = false; + for (HloComputation* computation : module->MakeNonfusionComputations()) { + HloInstructionSequence& sequence = + module->schedule().GetOrCreateSequence(computation); + if (computation->root_instruction() == + sequence.instructions().at(sequence.size() - 1)) { + continue; + } + if (computation->root_instruction()->shape().IsTuple()) { + SinkTupleRoot(computation); + } else { + SinkNontupleRoot(computation); + } + modified = true; + } + return modified; +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/service/root_instruction_sinker.h b/tensorflow/compiler/xla/service/root_instruction_sinker.h new file mode 100644 index 00000000000..d4d08870699 --- /dev/null +++ b/tensorflow/compiler/xla/service/root_instruction_sinker.h @@ -0,0 +1,41 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_ROOT_INSTRUCTION_SINKER_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_ROOT_INSTRUCTION_SINKER_H_ + +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_pass_interface.h" + +namespace xla { + +// Given a scheduled HLO module, this pass sinks the ROOT of the instruction to +// the bottom of the non-fusion computations. To avoid dependency violations of +// moving the ROOT instruction, it creates a new ROOT instruction that looks +// like the following: +// - For tuple ROOT type: +// new_root = tuple(gte(old_root), gte(old_root), ...) +// - For non-tuple ROOT type: +// new_root = bitcast(old_root) +class RootInstructionSinker : public HloModulePass { + public: + ~RootInstructionSinker() override = default; + absl::string_view name() const override { return "root-instruction-sinker"; } + StatusOr Run(HloModule* module) override; +}; + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_ROOT_INSTRUCTION_SINKER_H_ diff --git a/tensorflow/compiler/xla/service/root_instruction_sinker_test.cc b/tensorflow/compiler/xla/service/root_instruction_sinker_test.cc new file mode 100644 index 00000000000..8a03a92b88a --- /dev/null +++ b/tensorflow/compiler/xla/service/root_instruction_sinker_test.cc @@ -0,0 +1,170 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
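To show where the new pass could sit in a compilation flow, here is a sketch that assumes the module has already been scheduled (Run() enforces this via TF_RET_CHECK(module->has_schedule())); the pass can simply be appended to a post-scheduling pipeline:

    #include "tensorflow/compiler/xla/service/hlo_pass_pipeline.h"
    #include "tensorflow/compiler/xla/service/root_instruction_sinker.h"
    #include "tensorflow/compiler/xla/statusor.h"

    xla::StatusOr<bool> SinkRoots(xla::HloModule* module) {
      // The pass only rewrites non-fusion computations whose root is not
      // already the last instruction in their schedule sequence.
      xla::HloPassPipeline pipeline("sink-root-instructions");
      pipeline.AddPass<xla::RootInstructionSinker>();
      return pipeline.Run(module);
    }

The unit tests that follow exercise the same behavior by invoking RootInstructionSinker::Run directly on parsed, scheduled modules.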
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/root_instruction_sinker.h" + +#include "tensorflow/compiler/xla/service/hlo_matchers.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" + +namespace xla { +namespace { + +namespace op = xla::testing::opcode_matchers; + +using RootInstructionSinkerTest = HloTestBase; + +TEST_F(RootInstructionSinkerTest, TupleNoChange) { + // ROOTS are already sunk, no change performed to the module. + absl::string_view hlo_string = R"( + HloModule While, is_scheduled=true + While.body { + loop_var.1 = (s32[], s32[3]{0}) parameter(0) + get-tuple-element.1 = s32[] get-tuple-element(loop_var.1), index=0 + constant.1 = s32[] constant(1) + add = s32[] add(get-tuple-element.1, constant.1) + get-tuple-element.2 = s32[3]{0} get-tuple-element(loop_var.1), index=1 + multiply = s32[3]{0} multiply(get-tuple-element.2, get-tuple-element.2) + ROOT tuple = (s32[], s32[3]{0}) tuple(add, multiply) + } + While.condition { + loop_var.2 = (s32[], s32[3]{0}) parameter(0) + get-tuple-element.3 = s32[] get-tuple-element(loop_var.2), index=0 + constant.2 = s32[] constant(100) + ROOT less-than = pred[] compare(get-tuple-element.3, constant.2), direction=LT + } + ENTRY While { + constant.3 = s32[] constant(42) + constant.4 = s32[3]{0} constant({0, 1, 2}) + tuple.1 = (s32[], s32[3]{0}) tuple(constant.3, constant.4) + ROOT while = (s32[], s32[3]{0}) while(tuple.1), condition= + While.condition, body=While.body + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + auto while_body = + module->entry_computation()->root_instruction()->while_body(); + int num_body_instructions = while_body->instruction_count(); + RootInstructionSinker sinker; + EXPECT_FALSE(sinker.Run(module.get()).ValueOrDie()); + EXPECT_EQ(module->entry_computation() + ->root_instruction() + ->while_body() + ->instruction_count(), + num_body_instructions); +} + +TEST_F(RootInstructionSinkerTest, Tuple) { + // Sink tuple return type. 
+ absl::string_view hlo_string = R"( + HloModule While, is_scheduled=true + While.body { + loop_var.1 = (s32[], s32[3]{0}) parameter(0) + get-tuple-element.1 = s32[] get-tuple-element(loop_var.1), index=0 + constant.1 = s32[] constant(1) + add = s32[] add(get-tuple-element.1, constant.1) + get-tuple-element.2 = s32[3]{0} get-tuple-element(loop_var.1), index=1 + multiply = s32[3]{0} multiply(get-tuple-element.2, get-tuple-element.2) + ROOT tuple = (s32[], s32[3]{0}) tuple(add, multiply) + after-all = token[] after-all() + send = (s32[3]{0}, u32[], token[]) send(multiply, after-all), channel_id=1 + send-done = token[] send-done(send), channel_id=1 + } + While.condition { + loop_var.2 = (s32[], s32[3]{0}) parameter(0) + get-tuple-element.3 = s32[] get-tuple-element(loop_var.2), index=0 + constant.2 = s32[] constant(100) + ROOT less-than = pred[] compare(get-tuple-element.3, constant.2), direction=LT + } + ENTRY While { + constant.3 = s32[] constant(42) + constant.4 = s32[3]{0} constant({0, 1, 2}) + tuple.1 = (s32[], s32[3]{0}) tuple(constant.3, constant.4) + ROOT while = (s32[], s32[3]{0}) while(tuple.1), condition= + While.condition, body=While.body + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + RootInstructionSinker sinker; + EXPECT_TRUE(sinker.Run(module.get()).ValueOrDie()); + auto while_body = + module->entry_computation()->root_instruction()->while_body(); + const auto& sequence = module->schedule().sequence(while_body); + EXPECT_EQ(sequence.instructions().at(sequence.size() - 1), + while_body->root_instruction()); + EXPECT_THAT(while_body->root_instruction(), + op::Tuple(op::GetTupleElement(op::Tuple()), + op::GetTupleElement(op::Tuple()))); +} + +TEST_F(RootInstructionSinkerTest, NontupleNoChange) { + // ROOTS are already sunk, no change performed to the module. + absl::string_view hlo_string = R"( + HloModule Call, is_scheduled=true + Call { + param = s32[3]{0} parameter(0) + ROOT multiply = s32[3]{0} multiply(param, param) + } + ENTRY While { + constant.4 = s32[3]{0} constant({0, 1, 2}) + ROOT call = s32[3]{0} call(constant.4), to_apply=Call + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + auto called_computation = + module->entry_computation()->root_instruction()->called_computations()[0]; + int num_instructions = called_computation->instruction_count(); + RootInstructionSinker sinker; + EXPECT_FALSE(sinker.Run(module.get()).ValueOrDie()); + EXPECT_EQ(module->entry_computation() + ->root_instruction() + ->called_computations()[0] + ->instruction_count(), + num_instructions); +} + +TEST_F(RootInstructionSinkerTest, Nontuple) { + // Sink a non-tuple return type. 
+ absl::string_view hlo_string = R"( + HloModule Call, is_scheduled=true + Call { + param = s32[3]{0} parameter(0) + ROOT multiply = s32[3]{0} multiply(param, param) + after-all = token[] after-all() + send = (s32[3]{0}, u32[], token[]) send(multiply, after-all), channel_id=1 + send-done = token[] send-done(send), channel_id=1 + } + ENTRY While { + constant.4 = s32[3]{0} constant({0, 1, 2}) + ROOT call = s32[3]{0} call(constant.4), to_apply=Call + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + RootInstructionSinker sinker; + EXPECT_TRUE(sinker.Run(module.get()).ValueOrDie()); + auto called_computation = + module->entry_computation()->root_instruction()->called_computations()[0]; + const auto& sequence = module->schedule().sequence(called_computation); + EXPECT_EQ(sequence.instructions().at(sequence.size() - 1), + called_computation->root_instruction()); + EXPECT_THAT(called_computation->root_instruction(), + op::Bitcast(op::Multiply())); +} + +} // namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc index e12e1577211..2ed5e709d81 100644 --- a/tensorflow/compiler/xla/service/service.cc +++ b/tensorflow/compiler/xla/service/service.cc @@ -313,7 +313,10 @@ StatusOr> Service::CreateModuleConfig( if (execution_options->num_partitions() > 0) { config->set_num_partitions(execution_options->num_partitions()); } + config->set_use_spmd_partitioning( + execution_options->use_spmd_partitioning()); config->set_seed(execution_options->seed()); + config->set_launch_id(execution_options->launch_id()); config->set_debug_options(execution_options->debug_options()); } else { config->set_replica_count(options_.number_of_replicas()); diff --git a/tensorflow/compiler/xla/service/shape_inference.cc b/tensorflow/compiler/xla/service/shape_inference.cc index 3b8c2f41ef1..0ea7912c95c 100644 --- a/tensorflow/compiler/xla/service/shape_inference.cc +++ b/tensorflow/compiler/xla/service/shape_inference.cc @@ -257,6 +257,7 @@ StatusOr InferWindowOutputShape(const Shape& base_shape, case HloOpcode::kLog1p: case HloOpcode::kRsqrt: case HloOpcode::kSqrt: + case HloOpcode::kCbrt: case HloOpcode::kTanh: if (!ShapeUtil::ElementIsFloating(shape) && !ShapeUtil::ElementIsComplex(shape)) { @@ -1998,6 +1999,17 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, return a; } +/* static */ StatusOr ShapeInference::InferAllGatherShape( + const Shape& operand_shape, int64 all_gather_dimension, int64 shard_count) { + TF_RET_CHECK(all_gather_dimension >= 0); + TF_RET_CHECK(all_gather_dimension < operand_shape.rank()); + TF_RET_CHECK(shard_count > 0); + auto shape = operand_shape; + shape.set_dimensions(all_gather_dimension, + shard_count * shape.dimensions(all_gather_dimension)); + return shape; +} + /* static */ StatusOr ShapeInference::InferAllReduceShape( absl::Span operand_shapes) { for (const Shape* operand_shape : operand_shapes) { @@ -2596,7 +2608,18 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, VLOG(2) << StrFormat("update_sizes[%d] = %d", dim, update_dim_size); } - return operand_shape; + auto result_shape = operand_shape; + + // If any of the operand shape and update shape is dynamic, update the result + // dimension to dynamic. 
+ for (int64 i = 0; i < update_shape.rank(); ++i) { + if (update_shape.is_dynamic_dimension(i) || + operand_shape.is_dynamic_dimension(i)) { + result_shape.set_dynamic_dimension(i, true); + } + } + + return result_shape; } /*static */ StatusOr ShapeInference::InferReverseShape( diff --git a/tensorflow/compiler/xla/service/shape_inference.h b/tensorflow/compiler/xla/service/shape_inference.h index 2e96a77aa22..2cb5930d098 100644 --- a/tensorflow/compiler/xla/service/shape_inference.h +++ b/tensorflow/compiler/xla/service/shape_inference.h @@ -123,6 +123,12 @@ class ShapeInference { // Infers the shape produced by the given triangular solve operation. static StatusOr InferCholeskyShape(const Shape& a); + // Infers the shape produced by an all-gather with the given operand shape, + // concat dimension, and shard count. + static StatusOr InferAllGatherShape(const Shape& operand_shape, + int64 all_gather_dimension, + int64 shard_count); + // Infers the shape produced by a cross replica sum with the given operand // shapes. static StatusOr InferAllReduceShape( diff --git a/tensorflow/compiler/xla/service/shaped_buffer.h b/tensorflow/compiler/xla/service/shaped_buffer.h index a1872330648..b7a67b4e66e 100644 --- a/tensorflow/compiler/xla/service/shaped_buffer.h +++ b/tensorflow/compiler/xla/service/shaped_buffer.h @@ -22,6 +22,7 @@ limitations under the License. #include "absl/types/span.h" #include "tensorflow/compiler/xla/shape_tree.h" +#include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" @@ -93,6 +94,18 @@ class ShapedBuffer { buffers_.replace_shape_ptr(&on_device_shape_); } + // Reset the shape of this shaped buffer and underlying buffer structure. + // + // Precondition: EqualStructure(this->on_device_shape_, on_device_shape). + void set_shapes(const Shape& on_host_shape, const Shape& on_device_shape) { + CHECK(ShapeUtil::EqualStructure(on_device_shape, on_device_shape_)) + << "Structures are not the same. new: " << on_device_shape + << ", old: " << on_device_shape_; + on_host_shape_ = on_host_shape; + on_device_shape_ = on_device_shape; + buffers_.replace_shape_ptr(&on_device_shape_); + } + // Returns the underlying ShapeTree containing all the device addresses in the // ShapedBuffer. const ShapeTree& buffers() const { return buffers_; } diff --git a/tensorflow/compiler/xla/service/spmd/BUILD b/tensorflow/compiler/xla/service/spmd/BUILD new file mode 100644 index 00000000000..5be6a04f934 --- /dev/null +++ b/tensorflow/compiler/xla/service/spmd/BUILD @@ -0,0 +1,69 @@ +# Description: SPMD partitioning pass. 
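To make the new all-gather shape rule concrete, a small worked sketch follows; the shapes are chosen purely for illustration:

    #include "tensorflow/compiler/xla/service/shape_inference.h"
    #include "tensorflow/compiler/xla/shape_util.h"

    xla::Shape AllGatherShapeExample() {
      // An f32[4,8] operand gathered along dimension 1 across 4 shards becomes
      // f32[4,32]: the gather dimension is multiplied by the shard count.
      xla::Shape operand = xla::ShapeUtil::MakeShape(xla::F32, {4, 8});
      return xla::ShapeInference::InferAllGatherShape(
                 operand, /*all_gather_dimension=*/1, /*shard_count=*/4)
          .ValueOrDie();
    }

The spmd_partitioner library introduced below lists shape_inference among its dependencies, so sharded rewrites can reuse rules like this one.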
+ +load("//tensorflow:tensorflow.bzl", "tf_cc_test") + +package( + default_visibility = [":friends"], + licenses = ["notice"], # Apache 2.0 +) + +package_group( + name = "friends", + includes = [ + "//tensorflow/compiler/xla:friends", + ], +) + +cc_library( + name = "spmd_partitioner", + srcs = [ + "spmd_partitioner.cc", + "spmd_partitioner_util.cc", + ], + hdrs = [ + "spmd_partitioner.h", + "spmd_partitioner_util.h", + ], + deps = [ + "//tensorflow/compiler/xla:comparison_util", + "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:protobuf_util", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla:window_util", + "//tensorflow/compiler/xla:xla_data_proto_cc", + "//tensorflow/compiler/xla/client/lib:comparators", + "//tensorflow/compiler/xla/service:flatten_call_graph", + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_casting_utils", + "//tensorflow/compiler/xla/service:hlo_cse", + "//tensorflow/compiler/xla/service:hlo_dce", + "//tensorflow/compiler/xla/service:hlo_pass", + "//tensorflow/compiler/xla/service:hlo_pass_pipeline", + "//tensorflow/compiler/xla/service:hlo_query", + "//tensorflow/compiler/xla/service:hlo_sharding_util", + "//tensorflow/compiler/xla/service:shape_inference", + "//tensorflow/compiler/xla/service:tuple_simplifier", + "//tensorflow/core/platform:numbers", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:optional", + ], +) + +tf_cc_test( + name = "spmd_partitioner_test", + srcs = ["spmd_partitioner_test.cc"], + deps = [ + ":spmd_partitioner", + "//tensorflow/compiler/xla:xla_data_proto_cc", + "//tensorflow/compiler/xla/service:hlo_matchers", + "//tensorflow/compiler/xla/service:hlo_parser", + "//tensorflow/compiler/xla/service:hlo_pass_pipeline", + "//tensorflow/compiler/xla/service:hlo_verifier", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", + "//tensorflow/core:test", + ], +) diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc b/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc new file mode 100644 index 00000000000..b857c8bdbe6 --- /dev/null +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc @@ -0,0 +1,4655 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/spmd/spmd_partitioner.h" + +#include + +#include +#include +#include +#include + +#include "absl/algorithm/container.h" +#include "absl/memory/memory.h" +#include "absl/strings/str_cat.h" +#include "absl/types/optional.h" +#include "tensorflow/compiler/xla/client/lib/comparators.h" +#include "tensorflow/compiler/xla/comparison_util.h" +#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/protobuf_util.h" +#include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" +#include "tensorflow/compiler/xla/service/flatten_call_graph.h" +#include "tensorflow/compiler/xla/service/hlo_casting_utils.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_cse.h" +#include "tensorflow/compiler/xla/service/hlo_dce.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_instructions.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/service/hlo_pass_pipeline.h" +#include "tensorflow/compiler/xla/service/hlo_query.h" +#include "tensorflow/compiler/xla/service/hlo_sharding.h" +#include "tensorflow/compiler/xla/service/hlo_sharding_util.h" +#include "tensorflow/compiler/xla/service/shape_inference.h" +#include "tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.h" +#include "tensorflow/compiler/xla/service/tuple_simplifier.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/util.h" +#include "tensorflow/compiler/xla/window_util.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/platform/numbers.h" + +namespace xla { +namespace spmd { + +string SpmdLogger::MakeReport() { + string report; + absl::StrAppend(&report, + "\n\n***** SPMD memory during transformation *****\n"); + + std::sort(entries_.begin(), entries_.end(), + [](auto const& entry0, auto const& entry1) { + return entry0.first > entry1.first; + }); + for (int64 i = 0; + i < std::min(report_instruction_count_, entries_.size()); ++i) { + absl::StrAppend( + &report, "\n ", + tensorflow::strings::HumanReadableNumBytes(entries_[i].first), " : ", + entries_[i].second, "\n"); + } + + return report; +} + +void SpmdLogger::RegisterLogEntry(HloInstruction* hlo, + const std::vector& group) { + string report = hlo->ToString(); + int64 max_value = -1; + for (HloInstruction* inst : group) { + if (inst->shape().IsTuple()) { + continue; + } + max_value = + std::max(max_value, ShapeUtil::ByteSizeOf(inst->shape(), 4)); + absl::StrAppend(&report, " * ", inst->ToString(), "\n"); + } + entries_.push_back(std::make_pair(max_value, report)); +} + +/* static */ string SpmdLogger::ReportBeforePartition( + const HloModule& module, int64 report_instruction_count) { + string report; + absl::StrAppend(&report, + "\n\n***** SPMD memory usage before partition *****\n"); + absl::StrAppend(&report, "\n ** Replicated instructions\n"); + absl::StrAppend(&report, ReportMemoryUsage( + module, + [](const HloInstruction* hlo) { + return !hlo->has_sharding() || + hlo->sharding().IsReplicated(); + }, + report_instruction_count)); + absl::StrAppend(&report, "\n ** All instructions\n"); + absl::StrAppend(&report, + ReportMemoryUsage( + module, [](const HloInstruction* hlo) { return true; }, + report_instruction_count)); + return report; +} + +/* static */ string SpmdLogger::ReportAfterPartition( + 
const HloModule& module, int64 report_instruction_count) { + string report; + absl::StrAppend(&report, + "\n\n***** SPMD memory usage after partition *****\n"); + absl::StrAppend(&report, + ReportMemoryUsage( + module, [](const HloInstruction* hlo) { return true; }, + report_instruction_count)); + return report; +} + +template +/* static */ string SpmdLogger::ReportMemoryUsage( + const HloModule& module, const F& filter, int64 report_instruction_count) { + string report; + std::vector instructions; + instructions.reserve(module.instruction_count()); + + for (auto computation : module.computations()) { + if (computation->IsFusionComputation()) { + continue; + } + for (auto hlo : computation->instructions()) { + if (hlo->shape().IsTuple() || + ShapeUtil::IsEffectiveScalar(hlo->shape())) { + continue; + } + if (filter(hlo)) { + instructions.push_back(hlo); + } + } + } + + const auto add_report = [&](std::vector* insts) { + std::sort(insts->begin(), insts->end(), + [](const HloInstruction* inst0, const HloInstruction* inst1) { + return ShapeUtil::ByteSizeOf(inst0->shape()) > + ShapeUtil::ByteSizeOf(inst1->shape()); + }); + for (int64 i = 0; + i < std::min(report_instruction_count, insts->size()); ++i) { + absl::StrAppend(&report, " ", + tensorflow::strings::HumanReadableNumBytes( + ShapeUtil::ByteSizeOf((*insts)[i]->shape())), + " : ", (*insts)[i]->ToString(), "\n"); + } + }; + + add_report(&instructions); + return report; +} + +namespace { + +// Returns the replica group configuration where each replica belongs to its own +// group. +std::vector CreateReplicaGroups(int64 num_replicas) { + std::vector groups(num_replicas); + for (int64 i = 0; i < num_replicas; ++i) { + groups[i].add_replica_ids(i); + } + return groups; +} + +bool CanReshardWithAllToAll(const HloSharding& source, + const HloSharding& target) { + return UniqueTiledDim(source) && UniqueTiledDim(target) && + UniqueTiledDim(source) != UniqueTiledDim(target); +} + +bool CanReshardWithCollectivePermute(const HloSharding& source, + const HloSharding& target) { + return UniqueTiledDim(source) && UniqueTiledDim(target) && + UniqueTiledDim(source) == UniqueTiledDim(target) && source != target; +} + +// Clears all sharding attributes from instructions in the module. This must be +// called only after all SPMD transformation is complete. +Status ClearShardingAttributes(HloModule* module) { + for (HloComputation* computation : module->computations()) { + for (HloInstruction* hlo : computation->instructions()) { + // Keep sharding annotation on Infeed and entry parameters since they're + // used by HloReplicationAnalysis later (for ArCrsCombiner). 
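CreateReplicaGroups above puts every replica into its own singleton group, which keeps the partitioner's cross-partition collectives from combining data across replicas. A small stand-in using plain containers instead of the ReplicaGroup proto (illustrative sketch only):

#include <cstdint>
#include <iostream>
#include <vector>

// Each replica id goes into its own group: {{0}, {1}, ..., {n-1}}.
std::vector<std::vector<int64_t>> MakeSingletonReplicaGroups(int64_t num_replicas) {
  std::vector<std::vector<int64_t>> groups(num_replicas);
  for (int64_t i = 0; i < num_replicas; ++i) groups[i].push_back(i);
  return groups;
}

int main() {
  for (const auto& g : MakeSingletonReplicaGroups(4)) std::cout << "{" << g[0] << "} ";
  std::cout << "\n";  // prints: {0} {1} {2} {3}
}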
+ if (hlo->opcode() == HloOpcode::kInfeed) { + continue; + } + if (hlo->opcode() == HloOpcode::kParameter && + computation == module->entry_computation()) { + continue; + } + hlo->clear_sharding(); + } + } + return Status::OK(); +} + +} // namespace + +HloInstruction* SpmdBuilder::AddInstruction( + std::unique_ptr instruction) { + HloInstruction* hlo = + HloComputation::Builder::AddInstruction(std::move(instruction)); + if (visiting_hlo_) { + instructions_[visiting_hlo_].push_back(hlo); + } + return hlo; +} + +PartitionedHlo PartitionedHlo::Reshard(const HloSharding& target) { + auto& cache = state_.reshard_cache->per_hlo_cache[hlo()].reshard_cache; + for (auto& entry : cache) { + if (entry.first == target) { + return entry.second; + } + } + cache.emplace_back(target, ReshardNoCache(target)); + state_.reshard_cache->per_hlo_cache[cache.back().second.hlo()] + .reshard_cache.emplace_back(sharding(), *this); + return cache.back().second; +} + +PartitionedHlo PartitionedHlo::ReshardNoCache(const HloSharding& target) { + VLOG(2) << "Resharding " << hlo_->ToString() << " from " + << hlo_->sharding().ToString() << " to " << target.ToString(); + const Shape& shape = hlo_->shape(); + CHECK(shape.IsTuple() || !target.IsTuple()); + + // Tuple shape instructions may have non-tuple sharding, which means that the + // same sharding applies to all the leaves. + if (shape.IsTuple() && !target.IsTuple()) { + return Reshard(target.GetTupleSharding(shape).ValueOrDie()); + } + + // For a tuple shape, recursively apply Reshard to all the leaves and return + // a tuple instruction. + if (shape.IsTuple()) { + std::vector elements; + for (int64 i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) { + auto subshape = ShapeUtil::GetTupleElementShape(shape, i); + auto element = state_.b->AddInstruction( + HloInstruction::CreateGetTupleElement(subshape, hlo(), i)); + element->set_sharding(sharding().GetSubSharding(shape, {i})); + elements.push_back( + PartitionedHlo( + element, ShapeUtil::GetTupleElementShape(base_shape_, i), state_) + .Reshard(target.GetSubSharding(shape, {i})) + .hlo()); + } + auto tuple = + state_.b->AddInstruction(HloInstruction::CreateTuple(elements)); + tuple->set_sharding(target); + return PartitionedHlo(tuple, base_shape_, state_); + } + + if (sharding() == target) { + return *this; + } + + if (shape.element_type() == TOKEN) { + return *this; + } + + if (CanReshardWithCollectivePermute(sharding(), target)) { + return ReshardWithCollectivePermute(target); + } + + if (CanReshardWithAllToAll(sharding(), target)) { + return ReshardWithAllToAll(target); + } + + // If not replicated yet, first replicate and then reshard to use one of the + // two implementations below. + if (!sharding().IsReplicated()) { + return Replicate().Reshard(target); + } + + // 'Replicated' to 'SingleDevice'. + if (target.IsTileMaximal()) { + auto copy = state_.b->AddInstruction( + HloInstruction::CreateUnary(hlo_->shape(), HloOpcode::kCopy, hlo_)); + copy->set_sharding(target); + return PartitionedHlo(copy, base_shape_, state_); + } + + // 'Replicated' to 'Tiled'. 
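PartitionedHlo::Reshard above keeps a small per-HLO cache of (target sharding, resharded value) pairs and also records the reverse entry so resharding back is a cache hit. The sketch below shows only the forward lookup side of that pattern, with strings standing in for HloSharding and PartitionedHlo (assumed stand-ins, not the real types):

#include <iostream>
#include <string>
#include <utility>
#include <vector>

struct ReshardCache {
  // A linear scan is enough: the number of distinct targets per value is tiny.
  std::vector<std::pair<std::string, std::string>> entries;  // (target, result)
};

std::string Reshard(const std::string& value, const std::string& target,
                    ReshardCache* cache) {
  for (const auto& e : cache->entries)
    if (e.first == target) return e.second;          // cached resharding
  std::string result = value + "->" + target;        // stand-in for real resharding
  cache->entries.emplace_back(target, result);
  return result;
}

int main() {
  ReshardCache cache;
  std::cout << Reshard("x@{devices=[2]}", "replicated", &cache) << "\n";
  std::cout << Reshard("x@{devices=[2]}", "replicated", &cache) << "\n";  // hit
}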
+ auto padded_hlo = + PadBaseShapeBeforeUnevenTiledSharding(hlo_, target, state_.b); + auto shard_shape = MakePartitionedShape(shape, target); + auto slice = state_.b->AddInstruction(HloInstruction::CreateDynamicSlice( + shard_shape, padded_hlo, + MakePartitionOffsets(shape, target, state_.partition_id, state_.b), + shard_shape.dimensions())); + slice->set_sharding(target); + return PartitionedHlo(slice, base_shape_, state_); +} + +PartitionedHlo PartitionedHlo::PadWithValue(HloInstruction* pad_value) const { + const HloSharding& sharding = hlo_->sharding(); + const Shape& shape = hlo_->shape(); + CHECK(!shape.IsTuple() && shape.element_type() != TOKEN); + if (sharding.IsReplicated() || EvenlyPartitions(base_shape_, sharding)) { + return *this; + } + CHECK(!sharding.IsTileMaximal()); + auto index_shape = ShapeUtil::ChangeElementType(shape, S32); + auto mask_shape = ShapeUtil::ChangeElementType(index_shape, PRED); + auto get_mask_for_dim = [&](int64 dim, HloInstruction* start_index) { + // Comparison: iota + start_index < valid_size + auto iota = + state_.b->AddInstruction(HloInstruction::CreateIota(index_shape, dim)); + auto broadcast_start_index = state_.b->AddInstruction( + HloInstruction::CreateBroadcast(index_shape, start_index, {})); + auto index_in_full_shape = + state_.b->AddInstruction(HloInstruction::CreateBinary( + index_shape, HloOpcode::kAdd, iota, broadcast_start_index)); + auto valid_size = state_.b->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR0(base_shape_.dimensions(dim)))); + auto broadcast_valid_size = state_.b->AddInstruction( + HloInstruction::CreateBroadcast(index_shape, valid_size, {})); + return state_.b->AddInstruction(HloInstruction::CreateCompare( + mask_shape, index_in_full_shape, broadcast_valid_size, + ComparisonDirection::kLt)); + }; + + HloInstruction* mask = nullptr; + auto offsets = MakePartitionOffsets(base_shape_, sharding, + state_.partition_id, state_.b); + for (int64 i = 0; i < shape.rank(); ++i) { + if (base_shape_.dimensions(i) % sharding.tile_assignment().dim(i) == 0) { + continue; + } + if (mask == nullptr) { + mask = get_mask_for_dim(i, offsets[i]); + } else { + mask = state_.b->AddInstruction( + HloInstruction::CreateBinary(mask->shape(), HloOpcode::kAnd, mask, + get_mask_for_dim(i, offsets[i]))); + } + } + + if (mask == nullptr) { + return *this; + } + + auto broadcast_pad_value = state_.b->AddInstruction( + HloInstruction::CreateBroadcast(shape, pad_value, {})); + auto result = state_.b->AddInstruction(HloInstruction::CreateTernary( + shape, HloOpcode::kSelect, mask, hlo_, broadcast_pad_value)); + result->set_sharding(sharding); + return PartitionedHlo(result, base_shape_, state_); +} + +absl::optional +PartitionedHlo::ReshardAsWindowedInput(const Window& window, + const HloSharding& target, + HloInstruction* pad_value, + bool mask_invalid_region) { + auto& cache = state_.reshard_cache->per_hlo_cache[hlo()].window_reshard_cache; + for (auto& entry : cache) { + if (std::get<0>(entry) == target && + protobuf_util::ProtobufEquals(std::get<1>(entry), window)) { + return std::get<2>(entry); + } + } + auto update_cache = [&](WindowedInputShardReturnValue result) { + cache.emplace_back(target, window, std::move(result)); + return std::get<2>(cache.back()); + }; + VLOG(2) << "ReshardAsWindowedInput()\n" + << "\twindow:" << window_util::ToString(window) + << "\ttarget sharding:" << target.ToString(); + + CHECK(!target.IsTileMaximal()); + auto partition_ordinals = + MakeTiledPartitionOrdinals(target, state_.partition_id, 
state_.b); + auto shard_shape = base_shape_; + + std::vector start_on_padded_calculations( + base_shape_.rank()); + std::vector limit_on_padded_calculations( + base_shape_.rank()); + std::vector dynamic_slice_offset_on_output( + base_shape_.rank(), nullptr); + + Window shard_window = window; + auto padded_shape = base_shape_; + std::vector offsets_on_padded_shape(base_shape_.rank()); + std::vector per_shard_window_counts(base_shape_.rank()); + std::vector explicit_left_padding(base_shape_.rank()); + for (int64 i = 0; i < base_shape_.rank(); ++i) { + // Do not pad non-partitioned dimensions. + int64 shard_count = target.tile_assignment().dim(i); + if (shard_count == 1) { + offsets_on_padded_shape[i] = state_.b->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::Zero(S32))); + continue; + } + const auto& wd = window.dimensions(i); + if (wd.window_dilation() != 1) { + // TODO(yuanzx): Support window dilation. + VLOG(2) << "Failed to reshard window operand due to window dilation"; + return absl::nullopt; + } + int64 full_size = + base_shape_.dimensions(i) + + (wd.base_dilation() - 1) * (base_shape_.dimensions(i) - 1) + + wd.padding_high() + wd.padding_low(); + if (full_size < wd.size()) { + VLOG(2) << "Failed to reshard window operand because the window size is " + "larger than padded base size"; + return absl::nullopt; + } + int64 window_count = (full_size - wd.size()) / wd.stride() + 1; + per_shard_window_counts[i] = CeilOfRatio(window_count, shard_count); + if (wd.stride() != 1 && + (wd.stride() * per_shard_window_counts[i]) % wd.base_dilation() != 0) { + // TODO(yuanzx): Support this case. + VLOG(2) << "Failed to reshard window operand due to non-trivial dilation"; + return absl::nullopt; + } + + // We use explicit padding for full dilations, then use padding_low and + // padding_high on the sharded op for the remaining. padding_low and + // padding_high are now given initial values, which will be later updated if + // dilation is not 1. + auto swd = shard_window.mutable_dimensions(i); + explicit_left_padding[i] = wd.padding_low() / wd.base_dilation(); + swd->set_padding_low(wd.padding_low() % wd.base_dilation()); + swd->set_padding_high(0); + + // Calculation for the first element needed on the 'padded-but-not-dilated' + // shape. The start on the dilated shape could be a hole, so we add + // wd.base_dilation() - 1 to the constant term to skip the leading holes. + start_on_padded_calculations[i] = MultiplyAddDivideOffsetCalculation( + wd.stride() * per_shard_window_counts[i], + wd.base_dilation() - 1 - swd->padding_low(), wd.base_dilation()); + int64 dilated_shard_size = + wd.stride() * (per_shard_window_counts[i] - 1) + wd.size(); + limit_on_padded_calculations[i] = MultiplyAddDivideOffsetCalculation( + wd.stride() * per_shard_window_counts[i], + dilated_shard_size + wd.base_dilation() - 1 - swd->padding_low(), + wd.base_dilation()); + + offsets_on_padded_shape[i] = start_on_padded_calculations[i].Calculate( + partition_ordinals[i], state_.b); + + auto shard_size_function = + limit_on_padded_calculations[i] - start_on_padded_calculations[i]; + int64 max_shard_size = shard_size_function.MaxInRange(0, shard_count); + shard_shape.set_dimensions(i, max_shard_size); + padded_shape.set_dimensions( + i, limit_on_padded_calculations[i].Calculate(shard_count - 1)); + + // For base dilation, calculate the needed padding_low and padding_high, as + // well as the offset for the output if a dynamic slice is needed after the + // sharded op. 
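The windowed resharding above leans on MultiplyAddDivideOffsetCalculation, which, going by its name and by how it is constructed here, evaluates (multiplier * partition_ordinal + offset) / divisor with integer division. A hedged sketch of that arithmetic together with the per-shard window count, using made-up window parameters:

#include <cstdint>
#include <iostream>

// f(ordinal) = (multiplier * ordinal + offset) / divisor, integer division.
int64_t MultiplyAddDivide(int64_t multiplier, int64_t offset, int64_t divisor,
                          int64_t ordinal) {
  return (multiplier * ordinal + offset) / divisor;
}

int64_t CeilOfRatio(int64_t a, int64_t b) { return (a + b - 1) / b; }

int main() {
  // Assumed example: 13 windows of a strided reduce-window split over 4 shards.
  int64_t window_count = 13, shard_count = 4, stride = 2;
  int64_t per_shard_windows = CeilOfRatio(window_count, shard_count);  // 4
  // First input element needed by shard 2, with no base dilation and no low
  // padding: start(ordinal) = stride * per_shard_windows * ordinal.
  std::cout << per_shard_windows << " "
            << MultiplyAddDivide(stride * per_shard_windows, 0, 1, 2) << "\n";
  // prints: 4 16
}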
+ if (wd.base_dilation() != 1) { + // Returns the offset of a shard's first valid element in the dilated + // shard. + auto get_first_valid_element_offset_on_dilated_shard = + [&](int64 shard_ordinal) { + return start_on_padded_calculations[i].Calculate(shard_ordinal) * + wd.base_dilation() + + swd->padding_low() - + wd.stride() * per_shard_window_counts[i] * shard_ordinal; + }; + CHECK_EQ(get_first_valid_element_offset_on_dilated_shard(0), + swd->padding_low()); + + // Determine swd->padding_high. + for (int64 shard_ordinal = 0; shard_ordinal < shard_count; + ++shard_ordinal) { + int64 wanted_limit_on_dilated_shard = + wd.stride() * (per_shard_window_counts[i] - 1) + wd.size(); + int64 actual_limit_on_dilated_shard_without_pad_high = + get_first_valid_element_offset_on_dilated_shard(shard_ordinal) + + (max_shard_size - 1) * wd.base_dilation() + 1; + swd->set_padding_high(std::max( + swd->padding_high(), + wanted_limit_on_dilated_shard - + actual_limit_on_dilated_shard_without_pad_high)); + } + + // Determine swd->padding_low and output dynamic slice index. + if (wd.stride() == 1) { + int64 max_pad_low = get_first_valid_element_offset_on_dilated_shard(0); + bool all_same = true; + for (int64 shard_ordinal = 1; shard_ordinal < shard_count; + ++shard_ordinal) { + int64 start = + get_first_valid_element_offset_on_dilated_shard(shard_ordinal); + if (start != swd->padding_low()) { + all_same = false; + } + max_pad_low = std::max(max_pad_low, start); + } + if (!all_same) { + auto start_on_padded_input = + start_on_padded_calculations[i].Calculate(partition_ordinals[i], + state_.b); + // We will calculate + // max_pad_low - (first_window - required_first_window) + // which equals + // required_first_window - (first_window - max_pad_low) + auto first_window_minus_max_pad_low = + MultiplyAddDivideOffsetCalculation( + wd.base_dilation(), swd->padding_low() - max_pad_low, 1) + .Calculate(start_on_padded_input, state_.b); + auto required_first_window = + MultiplyAddDivideOffsetCalculation(per_shard_window_counts[i], 0, + 1) + .Calculate(partition_ordinals[i], state_.b); + dynamic_slice_offset_on_output[i] = + state_.b->AddInstruction(HloInstruction::CreateBinary( + required_first_window->shape(), HloOpcode::kSubtract, + required_first_window, first_window_minus_max_pad_low)); + } + swd->set_padding_low(max_pad_low); + } else { + CHECK_EQ( + (wd.stride() * per_shard_window_counts[i]) % wd.base_dilation(), 0) + << "General base dilation not yet implemented."; + // padding_low on all shards should equal the initially assigned + // swd->padding_low(), i.e., the padding_low() on the original window. + } + } + } + + // Returns the output dynamic slice offset when needed, and absl::nullopt + // otherwise. + auto get_dynamic_slice_offset_on_output_if_needed = + [&]() -> absl::optional> { + if (absl::c_all_of( + dynamic_slice_offset_on_output, + [](HloInstruction* offset) { return offset == nullptr; })) { + return absl::nullopt; + } + auto zero = state_.b->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::Zero(S32))); + for (int64 i = 0; i < dynamic_slice_offset_on_output.size(); ++i) { + if (dynamic_slice_offset_on_output[i] == nullptr) { + dynamic_slice_offset_on_output[i] = zero; + } + } + return dynamic_slice_offset_on_output; + }; + + // If the currrent HLO is replicated, pad then slice. 
+ if (sharding().IsReplicated()) { + PaddingConfig padding_config; + for (int64 i = 0; i < base_shape_.rank(); ++i) { + auto padding_config_dim = padding_config.add_dimensions(); + padding_config_dim->set_interior_padding(0); + // Do not pad non-partitioned dimensions. + if (target.tile_assignment().dim(i) == 1) { + padding_config_dim->set_edge_padding_low(0); + padding_config_dim->set_edge_padding_high(0); + continue; + } + padding_config_dim->set_edge_padding_low(explicit_left_padding[i]); + padding_config_dim->set_edge_padding_high(padded_shape.dimensions(i) - + explicit_left_padding[i] - + base_shape_.dimensions(i)); + } + auto padded_hlo = ShapeUtil::Compatible(padded_shape, base_shape_) + ? hlo_ + : state_.b->AddInstruction(HloInstruction::CreatePad( + padded_shape, hlo_, pad_value, padding_config)); + auto sharded_input = + state_.b->AddInstruction(HloInstruction::CreateDynamicSlice( + shard_shape, padded_hlo, offsets_on_padded_shape, + shard_shape.dimensions())); + return update_cache(WindowedInputShardReturnValue{ + sharded_input, shard_window, + get_dynamic_slice_offset_on_output_if_needed()}); + } + + if (target != sharding()) { + return Replicate().ReshardAsWindowedInput(window, target, pad_value); + } + + // Halo exchange. + HloInstruction* visiting_hlo = hlo_; + auto original_shard_shape = MakePartitionedShape(base_shape_, target); + + std::vector left_halo_size_functions(base_shape_.rank()); + std::vector right_halo_size_functions(base_shape_.rank()); + // TODO(yuanzx): We are concatenating on each sharded dimension one at time, + // and in the second dimension (and beyond) we create halos by slicing the + // concat in the previous dimension, which is not optimal. We should generate + // halos only concating slices, instead of slicing concats. + for (int dim = 0; dim < base_shape_.rank(); ++dim) { + int64 shard_count = target.tile_assignment().dim(dim); + if (shard_count == 1) { + continue; + } + int64 input_shard_size = + CeilOfRatio(base_shape_.dimensions(dim), shard_count); + + // Left halo. The size of the halo is derived by subtracting the first read + // element offset of the i'th partition from the limit of the (i-1)'th + // partition. + MultiplyAddDivideOffsetCalculation shard_limit_of_previous_on_padded( + input_shard_size, explicit_left_padding[dim], 1); + left_halo_size_functions[dim] = + shard_limit_of_previous_on_padded - start_on_padded_calculations[dim]; + + // Right halo. 
+ MultiplyAddDivideOffsetCalculation shard_start_of_next_on_padded( + input_shard_size, input_shard_size + explicit_left_padding[dim], 1); + right_halo_size_functions[dim] = + limit_on_padded_calculations[dim] - shard_start_of_next_on_padded; + + auto resharded = ExchangeHaloAndGetValidData( + visiting_hlo, base_shape_, left_halo_size_functions[dim], + right_halo_size_functions[dim], explicit_left_padding[dim], + padded_shape.dimensions(dim), shard_shape.dimensions(dim), dim, target, + offsets_on_padded_shape[dim], pad_value, partition_ordinals[dim], + state_.collective_ops_creator, state_.next_channel_id, state_.b, + mask_invalid_region); + if (!resharded) { + VLOG(1) << "ReshardAsWindowedInput failed without replicate first: halo " + "is beyond the neighbor."; + return Replicate().ReshardAsWindowedInput(window, target, pad_value); + } + visiting_hlo = *resharded; + } + return update_cache(WindowedInputShardReturnValue{ + visiting_hlo, shard_window, + get_dynamic_slice_offset_on_output_if_needed()}); +} + +PartitionedHlo PartitionedHlo::Replicate() { + const HloSharding& sharding = hlo_->sharding(); + const Shape& shape = hlo_->shape(); + CHECK(!shape.IsTuple() && shape.element_type() != TOKEN); + + if (sharding.IsReplicated()) { + return *this; + } + auto& cache = state_.reshard_cache->per_hlo_cache[hlo()].reshard_cache; + for (auto& entry : cache) { + if (entry.first.IsReplicated()) { + return entry.second; + } + } + auto update_cache = [&](PartitionedHlo resharded) { + state_.reshard_cache->per_hlo_cache[resharded.hlo()] + .reshard_cache.emplace_back(sharding, *this); + cache.emplace_back(HloSharding::Replicate(), std::move(resharded)); + return cache.back().second; + }; + // 'Single Device' to 'Repliated'. + if (sharding.IsTileMaximal()) { + return update_cache(Broadcast()); + } + + // 'Tiled' to 'Replicated'. 
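The halo sizes above fall out of simple interval arithmetic: the left halo is the previous shard's data limit minus the first element this shard reads, and the right halo is the last element it reads (exclusive) minus the next shard's data start; at the edges the halo is covered by explicit padding rather than a neighbor. A 1-D sketch with an assumed window (size 3, stride 1, padding 1 on both sides, input of 16 tiled over 4 shards):

#include <cstdint>
#include <iostream>

int main() {
  // Assumed example: window size 3, stride 1, low/high padding 1, input size 16
  // tiled over 4 shards of 4 elements each. Each shard computes 4 windows.
  const int64_t shard_size = 4, window = 3, left_pad = 1;
  const int64_t windows_per_shard = 4;                         // ceil(16 / 4)
  const int64_t read_size = windows_per_shard - 1 + window;    // 6 elements read
  for (int64_t i = 0; i < 4; ++i) {
    int64_t read_start = windows_per_shard * i;                // on the padded input
    int64_t owned_start = shard_size * i + left_pad;
    int64_t left_halo = owned_start - read_start;              // from shard i-1
    int64_t right_halo = (read_start + read_size) - (owned_start + shard_size);
    std::cout << "shard " << i << ": left=" << left_halo
              << " right=" << right_halo << "\n";  // each prints left=1 right=1
  }
}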
+ Shape padded_base_shape = shape; + for (int64 i = 0; i < padded_base_shape.rank(); ++i) { + padded_base_shape.set_dimensions( + i, shape.dimensions(i) * sharding.tile_assignment().dim(i)); + } + auto zero = state_.b->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::Zero(shape.element_type()))); + auto zero_bcast = state_.b->AddInstruction( + HloInstruction::CreateBroadcast(padded_base_shape, zero, {})); + auto dus = state_.b->AddInstruction(HloInstruction::CreateDynamicUpdateSlice( + padded_base_shape, zero_bcast, hlo_, + MakePartitionOffsets(padded_base_shape, sharding, state_.partition_id, + state_.b))); + HloComputation* reduction = + MakeBinaryAdd(shape.element_type(), state_.module); + + auto all_reduce = + state_.collective_ops_creator.create_cross_partition_all_reduce( + state_.b, dus, reduction, NewChannel()); + HloInstruction* result = all_reduce; + if (!ShapeUtil::Compatible(base_shape_, padded_base_shape)) { + std::vector start_indices(shape.rank(), 0); + std::vector strides(shape.rank(), 1); + result = state_.b->AddInstruction(HloInstruction::CreateSlice( + base_shape_, result, start_indices, base_shape_.dimensions(), strides)); + } + result->set_sharding(HloSharding::Replicate()); + return update_cache(PartitionedHlo(result, base_shape_, state_)); +} + +PartitionedHlo PartitionedHlo::Broadcast() const { + const Shape& shape = hlo_->shape(); + const HloSharding& sharding = hlo_->sharding(); + CHECK(sharding.HasUniqueDevice()); + CHECK(!shape.IsTuple() && shape.element_type() != TOKEN); + + auto src_core_id = state_.b->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR0(sharding.GetUniqueDevice()))); + Shape bcast_shape = ShapeUtil::ChangeElementType(shape, PRED); + auto is_src_core = state_.b->AddInstruction(HloInstruction::CreateBroadcast( + bcast_shape, + state_.b->AddInstruction(HloInstruction::CreateCompare( + ShapeUtil::MakeShape(PRED, {}), state_.partition_id, src_core_id, + ComparisonDirection::kEq)), + {})); + + auto zero = state_.b->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::Zero(shape.element_type()))); + auto zero_bcast = state_.b->AddInstruction( + HloInstruction::CreateBroadcast(shape, zero, {})); + auto operand = state_.b->AddInstruction(HloInstruction::CreateTernary( + shape, HloOpcode::kSelect, is_src_core, hlo(), zero_bcast)); + HloComputation* reduction = + MakeBinaryAdd(shape.element_type(), state_.module); + + auto result = state_.collective_ops_creator.create_cross_partition_all_reduce( + state_.b, operand, reduction, NewChannel()); + result->set_sharding(HloSharding::Replicate()); + return PartitionedHlo(result, base_shape_, state_); +} + +PartitionedHlo PartitionedHlo::ReshardWithAllToAll( + const HloSharding& target) const { + int64 partition_count = sharding().tile_assignment().num_elements(); + absl::optional input_partition_dim = UniqueTiledDim(sharding()); + absl::optional output_partition_dim = UniqueTiledDim(target); + CHECK(input_partition_dim.has_value()); + CHECK(output_partition_dim.has_value()); + + // If the device order is different in the target, fix the order with + // ReshardWithCollectivePermute. 
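Replicate() above turns a tiled value into a replicated one by scattering each partition's shard into a zero buffer at that partition's offset and then summing across partitions with an all-reduce; because every other position holds zero, the sum is effectively a concatenation. A tiny host-side simulation of that idea (illustrative only, plain arrays instead of HLOs):

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // Full value [1..8] tiled 4 ways: partition p owns elements [2p, 2p+2).
  const int64_t n = 8, partitions = 4, shard = 2;
  std::vector<int64_t> all_reduced(n, 0);
  for (int64_t p = 0; p < partitions; ++p) {
    std::vector<int64_t> contribution(n, 0);      // broadcast of zero
    for (int64_t i = 0; i < shard; ++i)           // dynamic-update-slice of own shard
      contribution[p * shard + i] = p * shard + i + 1;
    for (int64_t i = 0; i < n; ++i) all_reduced[i] += contribution[i];  // all-reduce
  }
  for (int64_t v : all_reduced) std::cout << v << " ";  // prints: 1 2 3 4 5 6 7 8
  std::cout << "\n";
}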
+ auto input_tile_fixed_device_order = target.tile_assignment(); + input_tile_fixed_device_order.Reshape( + sharding().tile_assignment().dimensions()); + auto input_sharding_fixed_device_order = + HloSharding::Tile(input_tile_fixed_device_order); + if (input_sharding_fixed_device_order != sharding()) { + auto fixed_order = + ReshardWithCollectivePermute(input_sharding_fixed_device_order); + return fixed_order.ReshardWithAllToAll(target); + } + + auto padded_hlo = + PadBaseShapeBeforeUnevenTiledSharding(hlo_, target, state_.b); + + // The order of ids in the group must follow the target sharding. + std::vector groups(1); + for (int64 device : target.tile_assignment()) { + groups[0].add_replica_ids(device); + } + + HloInstruction* result = nullptr; + + // Split along the split dimension (output_partition_dim) of the all-to-all + // output. + std::vector dimensions; + for (int64 i = 0; i < base_shape_.rank(); ++i) { + if (i == *output_partition_dim) { + dimensions.push_back(partition_count); + dimensions.push_back(padded_hlo->shape().dimensions(i) / partition_count); + } else { + dimensions.push_back(padded_hlo->shape().dimensions(i)); + } + } + auto reshape = state_.b->AddInstruction(HloInstruction::CreateReshape( + ShapeUtil::MakeShape(base_shape_.element_type(), dimensions), + padded_hlo)); + // After the reshape, it is guaranteed to have at least 3 dimensions. + auto all_to_all = + state_.collective_ops_creator.create_cross_partition_all_to_all( + state_.b, {reshape}, groups, (*state_.next_channel_id)++, + output_partition_dim); + + // Reorder the split dimension of the reshape to be located in front of the + // input partition dimension, so the two dimensions can be combined. + int64 new_input_partition_dim = (*output_partition_dim < *input_partition_dim) + ? *input_partition_dim + 1 + : *input_partition_dim; + std::vector permutation; + for (int64 i = 0; i < all_to_all->shape().rank(); ++i) { + if (i == *output_partition_dim) { + continue; + } + if (i == new_input_partition_dim) { + permutation.push_back(*output_partition_dim); + } + permutation.push_back(i); + } + auto transpose = state_.b->AddInstruction(HloInstruction::CreateTranspose( + ShapeInference::InferTransposeShape(all_to_all->shape(), permutation) + .ValueOrDie(), + all_to_all, permutation)); + + // Combine the split dimension and the input partition dimension. 
+ auto new_shape = ShapeInference::InferAllToAllShape( + padded_hlo->shape(), *output_partition_dim, + *input_partition_dim, partition_count) + .ValueOrDie(); + result = state_.b->AddInstruction( + HloInstruction::CreateReshape(new_shape, transpose)); + + const Shape result_shape = MakePartitionedShape(base_shape_, target); + if (result_shape != result->shape()) { + result = state_.b->AddInstruction(HloInstruction::CreateSlice( + result_shape, result, std::vector(result_shape.rank(), 0), + result_shape.dimensions(), std::vector(result_shape.rank(), 1))); + } + result->set_sharding(target); + return PartitionedHlo(result, base_shape_, state_); +} + +PartitionedHlo PartitionedHlo::ReshardWithCollectivePermute( + const HloSharding& target) const { + CHECK(CanReshardWithCollectivePermute(sharding(), target)); + std::vector> src_dst_pairs; + sharding().tile_assignment().Each( + [&](absl::Span indices, int64 src_device) { + int64 dst_device = target.tile_assignment()(indices); + if (dst_device != src_device) { + src_dst_pairs.emplace_back(src_device, dst_device); + } + }); + auto cp = + state_.collective_ops_creator.create_cross_partition_collective_permute( + state_.b, hlo(), src_dst_pairs, (*state_.next_channel_id)++); + cp->set_sharding(target); + return PartitionedHlo(cp, base_shape_, state_); +} + +SpmdPartitioningVisitor::SpmdPartitioningVisitor( + HloComputation* computation, int64 num_partitions, int64 num_replicas, + const SPMDCollectiveOpsCreator& collective_ops_creator, + int64* next_channel_id, SpmdLogger* logger, SpmdPartitionerOptions options, + SpmdPartitioner* partitioner) + : changed_(false), + module_(computation->parent()), + num_partitions_(num_partitions), + num_replicas_(num_replicas), + collective_ops_creator_(collective_ops_creator), + next_channel_id_(next_channel_id), + b_(SpmdBuilder(computation->name() + "_spmd", /*hlo=*/nullptr)), + partition_id_(collective_ops_creator_.create_partition_id(&b_)), + logger_(logger), + options_(std::move(options)), + partitioner_(partitioner) {} + +Status SpmdPartitioningVisitor::DefaultAction(HloInstruction* hlo) { + if (hlo->HasSideEffect()) { + return Unimplemented("Side-effect ops cannot be replicated: %s", + hlo->ToString()); + } + + if (hlo->IsElementwise() && hlo->operand_count() > 0) { + return HandleElementwise(hlo); + } + + if (!hlo->sharding().IsTileMaximal()) { + VLOG(1) << "Not partitioned in SPMD mode (DefaultAction):" + << hlo->ToString(); + for (int64 i = 0; i < hlo->operand_count(); ++i) { + VLOG(1) << " operand " << i + << " sharding:" << hlo->operand(i)->sharding().ToString(); + } + } + + // If the instruction cannot be partitioned, replicate the instruction unless + // the instruction has side-effect. 
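ReshardWithAllToAll above only moves data; the subtle part is the shape bookkeeping around the all-to-all (split the target dimension, exchange, transpose the split next to the old partition dimension, merge). A hedged walk-through of the local shapes for an assumed [8,8] value on 4 partitions, resharded from tiling on dim 0 to tiling on dim 1:

#include <cstdint>
#include <iostream>
#include <vector>

void Print(const char* tag, const std::vector<int64_t>& dims) {
  std::cout << tag << ": [";
  for (size_t i = 0; i < dims.size(); ++i) std::cout << (i ? "," : "") << dims[i];
  std::cout << "]\n";
}

int main() {
  const int64_t partitions = 4;
  std::vector<int64_t> local = {2, 8};  // [8,8] tiled 4 ways on dim 0
  Print("local input", local);
  // 1) Split the target dimension (dim 1) into (partitions, per_partition).
  std::vector<int64_t> split = {2, partitions, 8 / partitions};       // [2,4,2]
  Print("after reshape", split);
  // 2) All-to-all over the split dimension: shape unchanged, data exchanged.
  // 3) Transpose the split dimension next to the old partition dimension.
  std::vector<int64_t> transposed = {partitions, 2, 8 / partitions};  // [4,2,2]
  Print("after transpose", transposed);
  // 4) Merge the split dimension into the old partition dimension.
  std::vector<int64_t> merged = {partitions * 2, 8 / partitions};     // [8,2]
  Print("local output", merged);  // the shard shape for [8,8] tiled on dim 1
}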
+ std::vector new_operands; + for (HloInstruction* operand : hlo->operands()) { + new_operands.push_back( + GetPartitionedHlo(operand).Reshard(HloSharding::Replicate()).hlo()); + } + auto clone = + b_.AddInstruction(hlo->CloneWithNewOperands(hlo->shape(), new_operands)); + clone->set_sharding(HloSharding::Replicate()); + clone->set_metadata(hlo->metadata()); + SetPartitionedHlo(hlo, + PartitionedHlo(clone, hlo->shape(), MakePartitioningState()) + .Reshard(hlo->sharding())); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::Preprocess(HloInstruction* hlo) { + visiting_hlo_ = hlo; + b_.set_visiting_hlo(hlo); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::Postprocess(HloInstruction* hlo) { + logger_->RegisterLogEntry(GetPartitionedHlo(hlo).hlo(), + b_.derived_instructions(hlo)); + visiting_hlo_ = nullptr; + b_.set_visiting_hlo(nullptr); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleElementwise(HloInstruction* hlo) { + std::vector new_operands; + for (HloInstruction* operand : hlo->operands()) { + new_operands.push_back( + GetPartitionedHlo(operand).Reshard(hlo->sharding()).hlo()); + } + SetPartitionedHlo(hlo, [&] { + return b_.AddInstruction(hlo->CloneWithNewOperands( + MakePartitionedShape(hlo->shape(), hlo->sharding()), new_operands)); + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleConcatenate(HloInstruction* hlo) { + const HloSharding& sharding = hlo->sharding(); + if (sharding.IsTileMaximal()) { + return DefaultAction(hlo); + } + + const Shape shard_shape = MakePartitionedShape(hlo->shape(), hlo->sharding()); + const int64 dimension = hlo->concatenate_dimension(); + if (sharding.tile_assignment().dim(dimension) == 1) { + std::vector new_operands; + for (HloInstruction* operand : hlo->operands()) { + new_operands.push_back( + GetPartitionedHlo(operand).Reshard(sharding).hlo()); + } + SetPartitionedHlo(hlo, [&] { + return b_.AddInstruction( + hlo->CloneWithNewOperands(shard_shape, new_operands)); + }); + return Status::OK(); + } + + // If the concatenate dimension is along one of the partitioned dimensions, + // allocate the full output shape, each partition updates its owned region, + // all-reduce across partitions, and then slice its output region. + + // We currently don't support subgroup all-reduce along partitions, so more + // than 1 partitioned dimensions is not supported. + if (sharding.tile_assignment().dim(dimension) != num_partitions_) { + return DefaultAction(hlo); + } + + // temp_output_shape is the output shape where the concatenate dimension + // is changed to the full (and padded to shard count) dimension size. + auto temp_output_shape = MakePartitionedShape(hlo->shape(), sharding); + temp_output_shape.set_dimensions( + dimension, temp_output_shape.dimensions(dimension) * + sharding.tile_assignment().dim(dimension)); + auto temp_output = CreateZero(temp_output_shape, &b_); + + // Offset of each operand along the concatenate dimension. 
+ int64 offset = 0; + for (HloInstruction* operand : hlo->operands()) { + auto spmd_operand = GetPartitionedHlo(operand).Reshard(sharding).hlo(); + std::vector start_indices( + hlo->shape().rank(), b_.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::Zero(S32)))); + start_indices[dimension] = + MultiplyAddDivideOffsetCalculation( + spmd_operand->shape().dimensions(dimension), offset, 1) + .Calculate(MakeTiledPartitionOrdinals(sharding, partition_id_, + &b_)[dimension], + &b_); + temp_output = b_.AddInstruction(HloInstruction::CreateDynamicUpdateSlice( + temp_output_shape, temp_output, spmd_operand, start_indices)); + offset += operand->shape().dimensions(dimension); + } + auto all_reduce = collective_ops_creator_.create_cross_partition_all_reduce( + &b_, temp_output, MakeBinaryAdd(hlo->shape().element_type(), module_), + NewChannel()); + SetPartitionedHlo(hlo, [&] { + auto start_indices = + MakeTiledPartitionOrdinals(hlo->sharding(), partition_id_, &b_); + start_indices[dimension] = MultiplyAddDivideOffsetCalculation( + shard_shape.dimensions(dimension), 0, 1) + .Calculate(start_indices[dimension], &b_); + return b_.AddInstruction(HloInstruction::CreateDynamicSlice( + shard_shape, all_reduce, start_indices, shard_shape.dimensions())); + }); + + return Status::OK(); +} + +// If partitioning in the operand only happens in dimensions in passthrough +// dimensions (offset dimensions in the gather output (or scatter update) that +// have the same size as the operand), returns the corresponding output (or +// update) sharding by passing through the input sharding. +absl::optional PassthroughOperandToGatherOutputOrScatterUpdate( + const PartitionedHlo& operand, const Shape& update_or_gather_shape, + absl::Span collapsed_or_inserted_dims, + absl::Span index_map, + absl::Span offset_or_window_dims, + absl::Span slice_size) { + if (operand.sharding().IsTileMaximal()) { + return operand.sharding(); + } + std::vector passthrough_tile(update_or_gather_shape.rank(), 1); + int64 collapsed = 0; + for (int64 i = 0; i < operand.base_shape().rank(); ++i) { + int64 dim_partitions = operand.sharding().tile_assignment().dim(i); + if (absl::c_linear_search(collapsed_or_inserted_dims, i) || + absl::c_linear_search(index_map, i)) { + if (dim_partitions > 1) { + return absl::nullopt; + } + collapsed++; + continue; + } + if (slice_size[i] != operand.base_shape().dimensions(i) && + dim_partitions > 1) { + return absl::nullopt; + } + int64 offset_dim = offset_or_window_dims[i - collapsed]; + if (i - collapsed > 0 && + offset_dim < offset_or_window_dims[i - collapsed - 1]) { + // Output offsets are transposed, we do not support this case. + return absl::nullopt; + } + passthrough_tile[offset_dim] = dim_partitions; + } + Array tile_assignment = operand.sharding().tile_assignment(); + tile_assignment.Reshape(passthrough_tile); + return HloSharding::Tile(tile_assignment); +} + +// Returns whether partitioning in the operand only happens in dimensions with +// gather/scatter slice size 1. 
+bool GatherScatterOperandPartitionedOnlyOnTrivialSliceDims( + const PartitionedHlo& operand, absl::Span index_map, + absl::Span slice_size, int64 num_partitions) { + if (operand.sharding().IsTileMaximal()) { + return false; + } + int64 trivial_slice_dims_partitions = 1; + for (int64 dim : index_map) { + if (slice_size[dim] == 1) { + trivial_slice_dims_partitions *= + operand.sharding().tile_assignment().dim(dim); + } + } + return trivial_slice_dims_partitions == num_partitions; +} + +// Returns the min and max for the indices (replicated) in a scatter/gather +// which has the operand partitioned on trivial slice dimensions (slice size 1). +std::pair +IndexBoundsForGatherScatterOperandPartitionedOnTrivialSliceDims( + const PartitionedHlo& operand, const PartitionedHlo& replicated_indices, + HloInstruction* partition_id, absl::Span index_map, + int64 index_vector_dim, SpmdBuilder* b) { + auto operand_offsets = MakePartitionOffsets( + operand.base_shape(), operand.sharding(), partition_id, b); + // Find the per-dimension index bounds. + std::vector min_indices; + std::vector max_indices; + for (int64 i = 0; i < index_map.size(); ++i) { + int64 dim = index_map[i]; + int64 partitions = operand.sharding().tile_assignment().dim(dim); + if (partitions == 1) { + min_indices.push_back(CreateR0WithType( + replicated_indices.base_shape().element_type(), 0, b)); + max_indices.push_back(CreateR0WithType( + replicated_indices.base_shape().element_type(), + operand.base_shape().dimensions(dim), b)); + continue; + } + auto offset = operand_offsets[dim]; + if (offset->shape().element_type() != + replicated_indices.base_shape().element_type()) { + offset = b->AddInstruction(HloInstruction::CreateConvert( + ShapeUtil::MakeShape(replicated_indices.base_shape().element_type(), + {}), + offset)); + } + min_indices.push_back(offset); + auto partition_size_minus_1 = + CreateR0WithType(replicated_indices.base_shape().element_type(), + operand.hlo()->shape().dimensions(dim) - 1, b); + max_indices.push_back(b->AddInstruction(HloInstruction::CreateBinary( + offset->shape(), HloOpcode::kAdd, offset, partition_size_minus_1))); + } + // Broadcast the index bounds to the same shape as the indices. + HloInstruction* broadcast_min; + HloInstruction* broadcast_max; + if (index_vector_dim < replicated_indices.base_shape().rank()) { + // The index vector is an R1, we need to reshape individual bounds to + // [1], and concat them if there are more than one. 
+ for (int64 i = 0; i < min_indices.size(); ++i) { + min_indices[i] = b->AddInstruction(HloInstruction::CreateReshape( + ShapeUtil::MakeShape(min_indices[i]->shape().element_type(), {1}), + min_indices[i])); + max_indices[i] = b->AddInstruction(HloInstruction::CreateReshape( + ShapeUtil::MakeShape(max_indices[i]->shape().element_type(), {1}), + max_indices[i])); + } + int64 slice_dims = max_indices.size(); + if (slice_dims > 1) { + min_indices[0] = b->AddInstruction(HloInstruction::CreateConcatenate( + ShapeUtil::MakeShape(min_indices[0]->shape().element_type(), + {slice_dims}), + min_indices, 0)); + max_indices[0] = b->AddInstruction(HloInstruction::CreateConcatenate( + min_indices[0]->shape(), max_indices, 0)); + } + broadcast_min = b->AddInstruction(HloInstruction::CreateBroadcast( + replicated_indices.base_shape(), min_indices[0], {index_vector_dim})); + broadcast_max = b->AddInstruction(HloInstruction::CreateBroadcast( + replicated_indices.base_shape(), max_indices[0], {index_vector_dim})); + } else { + CHECK_EQ(max_indices.size(), 1); + broadcast_min = b->AddInstruction(HloInstruction::CreateBroadcast( + replicated_indices.base_shape(), min_indices[0], {})); + broadcast_max = b->AddInstruction(HloInstruction::CreateBroadcast( + replicated_indices.base_shape(), max_indices[0], {})); + } + return {broadcast_min, broadcast_max}; +} + +Status SpmdPartitioningVisitor::HandleScatter(HloInstruction* hlo) { + auto scatter = Cast(hlo); + auto dnums = scatter->scatter_dimension_numbers(); + auto operand = GetPartitionedHlo(scatter->operand(0)); + auto indices = GetPartitionedHlo(scatter->operand(1)); + auto updates = GetPartitionedHlo(scatter->operand(2)); + std::vector slice_size(operand.base_shape().rank(), 1); + int64 num_update_window_dims = 0; + for (int64 i = 0; i < operand.base_shape().rank(); ++i) { + if (absl::c_linear_search(dnums.inserted_window_dims(), i)) { + continue; + } + slice_size[i] = updates.base_shape().dimensions( + dnums.update_window_dims(num_update_window_dims++)); + } + std::vector inserted_window_dims(dnums.inserted_window_dims().begin(), + dnums.inserted_window_dims().end()); + std::vector scatter_dims_to_operand_dims( + dnums.scatter_dims_to_operand_dims().begin(), + dnums.scatter_dims_to_operand_dims().end()); + std::vector update_window_dims(dnums.update_window_dims().begin(), + dnums.update_window_dims().end()); + if (!operand.sharding().IsTileMaximal()) { + auto maybe_passthrough = PassthroughOperandToGatherOutputOrScatterUpdate( + operand, updates.base_shape(), inserted_window_dims, + scatter_dims_to_operand_dims, update_window_dims, slice_size); + // Handle pass through cases if we can use compatible sharding for update. 
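For the trivially-sliced operand case above, each partition only applies the scatter/gather rows whose (replicated) indices fall inside its own slice of the operand: the lower bound is the partition's offset, the upper bound is offset + local size - 1, and in-range indices are shifted into local coordinates; out-of-range indices become out-of-bounds accesses that the scatter semantics ignore. A small sketch with assumed sizes (operand dimension 12 split over 4 partitions):

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  const int64_t dim_size = 12, partitions = 4, local = dim_size / partitions;  // 3
  const int64_t partition_id = 2;
  const int64_t min_index = partition_id * local;   // 6 (this partition's offset)
  const int64_t max_index = min_index + local - 1;  // 8
  for (int64_t index : std::vector<int64_t>{4, 7, 11}) {
    if (index < min_index || index > max_index) {
      // Another partition owns this row; locally it maps out of bounds and the
      // update has no effect.
      std::cout << index << " -> skipped\n";
    } else {
      std::cout << index << " -> local " << (index - min_index) << "\n";  // 7 -> 1
    }
  }
}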
+ if (maybe_passthrough.has_value()) { + indices = indices.Reshard(HloSharding::Replicate()); + updates = updates.Reshard(*maybe_passthrough); + auto pscatter = b_.AddInstruction(HloInstruction::CreateScatter( + operand.hlo()->shape(), operand.hlo(), indices.hlo(), updates.hlo(), + scatter->to_apply(), dnums, scatter->indices_are_sorted(), + scatter->unique_indices())); + pscatter->set_sharding(*maybe_passthrough); + SetPartitionedHlo(hlo, [&]() { + return PartitionedHlo(pscatter, hlo->shape(), MakePartitioningState()) + .Reshard(hlo->sharding()) + .hlo(); + }); + return Status::OK(); + } + if (GatherScatterOperandPartitionedOnlyOnTrivialSliceDims( + operand, scatter_dims_to_operand_dims, slice_size, + num_partitions_) && + ShapeUtil::ByteSizeOf(updates.base_shape()) < + ShapeUtil::ByteSizeOf(scatter->shape())) { + // Operand is sharded on trivial slice dims (update slice size 1). We can + // adjust the indices on each partition by subtracting the offsets. Then + // we execute a scatter on full updated indices, and out-of-bound accesses + // will have no effect on the result as guaranteed by the scatter + // semantics. + indices = indices.Reshard(HloSharding::Replicate()); + updates = updates.Reshard(HloSharding::Replicate()); + HloInstruction* indices_min; + HloInstruction* indices_max_unused; + std::tie(indices_min, indices_max_unused) = + IndexBoundsForGatherScatterOperandPartitionedOnTrivialSliceDims( + operand, indices, partition_id_, scatter_dims_to_operand_dims, + dnums.index_vector_dim(), &b_); + auto adjusted_indices = b_.AddInstruction(HloInstruction::CreateBinary( + indices.hlo()->shape(), HloOpcode::kSubtract, indices.hlo(), + indices_min)); + auto pscatter = b_.AddInstruction(HloInstruction::CreateScatter( + operand.hlo()->shape(), operand.hlo(), adjusted_indices, + updates.hlo(), scatter->to_apply(), dnums, + scatter->indices_are_sorted(), scatter->unique_indices())); + pscatter->set_sharding(operand.sharding()); + SetPartitionedHlo(hlo, [&]() { + return PartitionedHlo(pscatter, hlo->shape(), MakePartitioningState()) + .Reshard(hlo->sharding()) + .hlo(); + }); + return Status::OK(); + } + } + return DefaultAction(hlo); +} + +Status SpmdPartitioningVisitor::HandleSlice(HloInstruction* hlo) { + const HloSharding& sharding = hlo->sharding(); + if (sharding.IsTileMaximal()) { + return DefaultAction(hlo); + } + + auto operand = GetPartitionedHlo(hlo->operand(0)).Reshard(sharding); + + // Create a window config to represent the slice. 
+ Window window; + for (int64 i = 0; i < hlo->shape().rank(); ++i) { + WindowDimension* dim = window.add_dimensions(); + dim->set_size(1); + dim->set_stride(hlo->slice_strides(i)); + dim->set_window_dilation(1); + dim->set_window_reversal(false); + dim->set_padding_low(-hlo->slice_starts(i)); + dim->set_padding_high(hlo->slice_limits(i) - + hlo->operand(0)->shape().dimensions(i)); + dim->set_base_dilation(1); + } + + auto reshard_operand = operand.ReshardAsWindowedInput( + window, sharding, + CreateZero(ShapeUtil::MakeShape(hlo->shape().element_type(), {}), &b_), + /*mask_invalid_region=*/false); + if (!reshard_operand.has_value()) { + return DefaultAction(hlo); + } + TF_RET_CHECK(!reshard_operand->dynamic_slice_index_on_output.has_value()); + const Shape& operand_shape = reshard_operand->sharded_input->shape(); + + std::vector start_indices = hlo->slice_starts(); + std::vector limit_indices = hlo->slice_limits(); + std::vector strides = hlo->slice_strides(); + bool need_slice = false; + for (int64 i = 0; i < hlo->shape().rank(); ++i) { + auto dim = reshard_operand->shard_window.dimensions(i); + start_indices[i] = -dim.padding_low(); + limit_indices[i] = operand_shape.dimensions(i) + dim.padding_high(); + if (start_indices[i] != 0 || strides[i] != 1 || + limit_indices[i] != operand_shape.dimensions(i)) { + need_slice = true; + } + } + + SetPartitionedHlo(hlo, [&] { + if (need_slice) { + auto shard_shape = MakePartitionedShape(hlo->shape(), sharding); + return b_.AddInstruction(HloInstruction::CreateSlice( + shard_shape, reshard_operand->sharded_input, start_indices, + limit_indices, strides)); + } + return reshard_operand->sharded_input; + }); + + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleSort(HloInstruction* hlo) { + HloSharding sharding = hlo->sharding(); + if (hlo->shape().IsTuple()) { + // Check that all elements are sharded in the same way. + if (hlo->shape().tuple_shapes_size() == 0) { + return DefaultAction(hlo); + } + sharding = hlo->sharding().GetSubSharding(hlo->shape(), {0}); + for (int64 i = 1; i < hlo->operand_count(); ++i) { + if (sharding != hlo->sharding().GetSubSharding(hlo->shape(), {i})) { + return DefaultAction(hlo); + } + } + } + if (sharding.IsTileMaximal()) { + return DefaultAction(hlo); + } + for (int64 dim : hlo->dimensions()) { + if (sharding.tile_assignment().dim(dim) > 1) { + return DefaultAction(hlo); + } + } + // Reshard operands to the same as the output. + std::vector new_operands; + for (HloInstruction* operand : hlo->operands()) { + new_operands.push_back(GetPartitionedHlo(operand).Reshard(sharding).hlo()); + } + SetPartitionedHlo(hlo, [&] { + return b_.AddInstruction(hlo->CloneWithNewOperands( + MakePartitionedShape(hlo->shape(), hlo->sharding()), new_operands)); + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleCustomCall(HloInstruction* hlo) { + if (hlo->custom_call_target() == "SPMDFullToShardShape") { + // This op switches from auto partitioning to manual partitioning. 
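HandleSlice above encodes the slice as a size-1 window whose negative low/high padding trims the unwanted prefix and suffix, which is what lets it reuse ReshardAsWindowedInput. A sketch of that encoding for an assumed slice [2:9] with stride 1 of a length-12 dimension:

#include <cstdint>
#include <iostream>

int main() {
  const int64_t operand_size = 12, slice_start = 2, slice_limit = 9, stride = 1;
  // Window encoding used for slices: size-1 window, negative padding trims.
  const int64_t window_size = 1;
  const int64_t padding_low = -slice_start;                        // -2
  const int64_t padding_high = slice_limit - operand_size;         // -3
  const int64_t padded = operand_size + padding_low + padding_high;  // 7
  const int64_t window_count = (padded - window_size) / stride + 1;  // 7 outputs
  std::cout << "padding_low=" << padding_low << " padding_high=" << padding_high
            << " outputs=" << window_count << "\n";
  // The slice [2:9] indeed produces 7 elements.
}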
+ auto input_partitioned = GetPartitionedHlo(hlo->operand(0)); + if (!EvenlyPartitions(hlo->shape(), input_partitioned.sharding())) { + input_partitioned = input_partitioned.PadWithValue( + CreateR0WithType(hlo->shape().element_type(), 0, &b_)); + } + auto input = input_partitioned.hlo(); + CHECK(hlo->sharding().IsReplicated()); + CHECK(ShapeUtil::Compatible(input->shape(), hlo->shape())); + auto copy = b_.AddInstruction( + HloInstruction::CreateUnary(input->shape(), HloOpcode::kCopy, input)); + SetPartitionedHlo(hlo, [&] { return copy; }); + return Status::OK(); + } + if (hlo->custom_call_target() == "SPMDShardToFullShape") { + // This op switches from manual partitioning to auto partitioning. + auto input = GetPartitionedHlo(hlo->operand(0)).hlo(); + CHECK(input->sharding().IsReplicated()); + auto copy = b_.AddInstruction( + HloInstruction::CreateUnary(input->shape(), HloOpcode::kCopy, input)); + CHECK(ShapeUtil::Compatible( + copy->shape(), MakePartitionedShape(hlo->shape(), hlo->sharding()))); + SetPartitionedHlo(hlo, [&] { return copy; }); + return Status::OK(); + } + if (hlo->custom_call_target() != "TopK") { + return DefaultAction(hlo); + } + + if (!hlo->operand(0)->has_sharding()) { + return DefaultAction(hlo); + } + + const HloSharding& sharding = hlo->operand(0)->sharding(); + if (sharding.IsTileMaximal() || sharding.IsReplicated()) { + return DefaultAction(hlo); + } + + const int64 sort_dim = 1; + const int64 shard_count = sharding.tile_assignment().dim(sort_dim); + + if (shard_count <= 1) { + return DefaultAction(hlo); + } + + const int64 input_size = hlo->operand(0)->shape().dimensions(sort_dim); + const int64 batch_size = hlo->shape().tuple_shapes(0).dimensions(0); + const int64 k = hlo->shape().tuple_shapes(0).dimensions(sort_dim); + const int64 per_partition_size = CeilOfRatio(input_size, shard_count); + + if (k >= per_partition_size) { + return DefaultAction(hlo); + } + + auto input = hlo->operand(0); + const auto element_type = input->shape().element_type(); + + // Pad input with minimal value. + auto min_value = b_.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::MinValue(element_type))); + // TODO(wangtao): add test to see if -NaN < -Inf in BF16. + if (element_type == F32) { + auto float_pad_value = std::numeric_limits::quiet_NaN(); + min_value = b_.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR0(-float_pad_value))); + } + auto partitioned_input = GetPartitionedHlo(input).PadWithValue(min_value); + + // Each partition needs to do TopK separately, thus the base shape + // becomes [batch_size, k * shard_count]. + const Shape replicated_shape = ShapeUtil::MakeTupleShape( + {ShapeUtil::MakeShape(hlo->operand(0)->shape().element_type(), + {batch_size, k * shard_count}), + ShapeUtil::MakeShape(S32, {batch_size, k * shard_count})}); + auto custom_call_sharding = + sharding.GetTupleSharding(replicated_shape).ValueOrDie(); + auto shard_shape = + MakePartitionedShape(replicated_shape, custom_call_sharding); + auto topk = b_.AddInstruction( + hlo->CloneWithNewOperands(shard_shape, {partitioned_input.hlo()})); + topk->set_sharding(custom_call_sharding); + // Partition customcall. + PartitionedHlo partitioned_topk(topk, replicated_shape, + MakePartitioningState()); + topk = partitioned_topk.hlo(); + + // Get value from TopK. + HloInstruction* value_gte = + b_.AddInstruction(HloInstruction::CreateGetTupleElement( + topk->shape().tuple_shapes(0), topk, 0)); + value_gte->set_sharding(sharding); + // Partition GetTupleElement of value. 
+ PartitionedHlo value_partitioned_gte( + value_gte, partitioned_topk.base_shape().tuple_shapes(0), + MakePartitioningState()); + // Reshard value to be replicated. + auto replicated_value_gte = + value_partitioned_gte.Reshard(HloSharding::Replicate()).hlo(); + + // Get index from TopK. + HloInstruction* index_gte = + b_.AddInstruction(HloInstruction::CreateGetTupleElement( + topk->shape().tuple_shapes(1), topk, 1)); + auto partition_id_s32 = b_.AddInstruction(HloInstruction::CreateConvert( + ShapeUtil::MakeShape(S32, partition_id_->shape().dimensions()), + partition_id_)); + // Add per partition offset to index, index returned from CustomCall always + // starts from 0. + auto index_offset = b_.AddInstruction(HloInstruction::CreateBroadcast( + index_gte->shape(), + b_.AddInstruction(HloInstruction::CreateBinary( + partition_id_s32->shape(), HloOpcode::kMultiply, partition_id_s32, + b_.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR0(per_partition_size))))), + {})); + index_gte = b_.AddInstruction(HloInstruction::CreateBinary( + index_offset->shape(), HloOpcode::kAdd, index_gte, index_offset)); + index_gte->set_sharding(sharding); + // Parttion GetTupleElement of index. + PartitionedHlo index_partitioned_gte( + index_gte, partitioned_topk.base_shape().tuple_shapes(1), + MakePartitioningState()); + // Reshard index to be replicated. + auto replicated_index_gte = + index_partitioned_gte.Reshard(HloSharding::Replicate()).hlo(); + + // Creates replicated sort to do TopK, the input is value and index pairs + // from all the partitions. The reason to use Sort instead of CustomCall TopK + // is CustomCall only takes value as input. There will be an extra Gather + // to get the correct index if CustomCall is used here. + + // Create comparator for the sort. + XlaBuilder b("Sort.Compare"); + XlaComputation comparator = CreateScalarComparisonComputation( + "compare-value-and-index", {input->shape().element_type(), S32}, {Gt, Lt}, + &b); + TF_ASSIGN_OR_RETURN(ProgramShape program_shape, comparator.GetProgramShape()); + HloModuleConfig config(program_shape); + TF_ASSIGN_OR_RETURN(auto new_module, + HloModule::CreateFromProto(comparator.proto(), config)); + HloCloneContext context(module_); + auto compare_computation = + module_->DeepCloneComputation(new_module->entry_computation(), &context); + auto sort = b_.AddInstruction(HloInstruction::CreateSort( + replicated_shape, sort_dim, {replicated_value_gte, replicated_index_gte}, + compare_computation, true)); + sort->set_sharding( + HloSharding::Replicate().GetTupleSharding(sort->shape()).ValueOrDie()); + PartitionedHlo replicated_sort(sort, replicated_shape, + MakePartitioningState()); + + // Slice value and index from top-k for output. + HloInstruction* sort_value_gte = + b_.AddInstruction(HloInstruction::CreateGetTupleElement( + replicated_sort.hlo()->shape().tuple_shapes(0), replicated_sort.hlo(), + 0)); + HloInstruction* sort_index_gte = + b_.AddInstruction(HloInstruction::CreateGetTupleElement( + replicated_sort.hlo()->shape().tuple_shapes(1), replicated_sort.hlo(), + 1)); + const Shape& hlo_shape = sort_value_gte->shape(); + auto hlo_dims = hlo_shape.dimensions(); + std::vector start_indices(hlo_shape.dimensions_size(), 0); + std::vector limit_indices(hlo_dims.begin(), hlo_dims.end()); + std::vector strides(hlo_shape.dimensions_size(), sort_dim); + limit_indices[sort_dim] = k; + auto output_shape = hlo_shape; + output_shape.set_dimensions(sort_dim, k); + // Slice value from final sort. 
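The per-partition TopK above returns indices relative to each partition's slice, so they are shifted into global coordinates by adding partition_id * per_partition_size before the replicated merge sort. A tiny sketch of that offset arithmetic under assumed sizes:

#include <cstdint>
#include <iostream>

int main() {
  const int64_t input_size = 1000, shard_count = 4;
  const int64_t per_partition_size =
      (input_size + shard_count - 1) / shard_count;  // 250
  const int64_t partition_id = 2, local_index = 7;
  // The index returned by the local TopK starts at 0; make it global.
  std::cout << local_index + partition_id * per_partition_size << "\n";  // 507
}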
+ HloInstruction* slice_sort_value = + b_.AddInstruction(HloInstruction::CreateSlice( + output_shape, sort_value_gte, start_indices, limit_indices, strides)); + // Slice index from final sort. + auto index_output_shape = sort_index_gte->shape(); + index_output_shape.set_dimensions(sort_dim, k); + HloInstruction* slice_index_value = b_.AddInstruction( + HloInstruction::CreateSlice(index_output_shape, sort_index_gte, + start_indices, limit_indices, strides)); + auto create_tuple = b_.AddInstruction( + HloInstruction::CreateTuple({slice_sort_value, slice_index_value})); + create_tuple->set_sharding(HloSharding::Replicate()); + + SetPartitionedHlo(hlo, PartitionedHlo(create_tuple, create_tuple->shape(), + MakePartitioningState()) + .Reshard(hlo->sharding())); + + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleTranspose(HloInstruction* hlo) { + const HloSharding& sharding = hlo->sharding(); + if (sharding.IsTileMaximal()) { + return DefaultAction(hlo); + } + + std::vector inverse_dimensions(hlo->shape().rank()); + for (int64 i = 0; i < hlo->shape().rank(); ++i) { + inverse_dimensions[hlo->dimensions(i)] = i; + } + auto desired_operand_sharding = + hlo_sharding_util::TransposeSharding(sharding, inverse_dimensions); + + auto operand = GetPartitionedHlo(hlo->operand(0)) + .Reshard(desired_operand_sharding) + .hlo(); + SetPartitionedHlo(hlo, [&] { + return b_.AddInstruction(hlo->CloneWithNewOperands( + MakePartitionedShape(hlo->shape(), hlo->sharding()), {operand})); + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleReshape(HloInstruction* hlo) { + const HloSharding& sharding = hlo->sharding(); + if (sharding.IsTileMaximal()) { + return DefaultAction(hlo); + } + + auto operand = GetPartitionedHlo(hlo->operand(0)); + // The output shape is the source and the operand shape is the target to get + // the aligned sharding for the operand. + auto desired_operand_sharding = hlo_sharding_util::ReshapeSharding( + hlo->shape(), hlo->operand(0)->shape(), hlo->sharding()); + if (desired_operand_sharding.has_value()) { + auto operand_hlo = operand.Reshard(*desired_operand_sharding).hlo(); + SetPartitionedHlo(hlo, [&] { + return b_.AddInstruction(hlo->CloneWithNewOperands( + MakePartitionedShape(hlo->shape(), hlo->sharding()), {operand_hlo})); + }); + return Status::OK(); + } + + // Try use halo exchange for certain split-dim/merge-dims cases. + // ReshapeSharding failed in these cases probably due to uneven partitioning, + // where halo exchange could help. Specifically we check the following + // conditions to detect supported cases: + // 1) Both input and output are partitioned on one dimension. + // 2) The combined size of dimensions before the partitioned dimension are the + // same on input and output. This means we don't need to consider the major + // dimensions. + // 3) Let A = the input size on the partitioned dimension, and + // B = the output size on the partitioned dimension; then + // either A % B == 0 (split dim) or B % A == 0 (merge dims). + auto maybe_input_sharded_dim = UniqueTiledDim(operand.sharding()); + auto maybe_output_sharded_dim = UniqueTiledDim(sharding); + if (!maybe_input_sharded_dim || !maybe_output_sharded_dim) { + return DefaultAction(hlo); + } + int64 input_sharded_dim = *maybe_input_sharded_dim; + int64 output_sharded_dim = *maybe_output_sharded_dim; + // Check that the major dims before the sharded dim have the same total size + // for input and output. 
+ int64 input_major_dims_size = 1; + for (int64 i = 0; i < input_sharded_dim; ++i) { + input_major_dims_size *= operand.base_shape().dimensions(i); + } + int64 output_major_dims_size = 1; + for (int64 i = 0; i < output_sharded_dim; ++i) { + output_major_dims_size *= hlo->shape().dimensions(i); + } + if (input_major_dims_size != output_major_dims_size) { + return DefaultAction(hlo); + } + // Fix potential device ordering mismatch in tile assignment. + Array new_input_tile_assignment = sharding.tile_assignment(); + new_input_tile_assignment.Reshape( + operand.sharding().tile_assignment().dimensions()); + operand = operand.Reshard(HloSharding::Tile(new_input_tile_assignment)); + + int64 input_dim_size = operand.base_shape().dimensions(input_sharded_dim); + int64 output_dim_size = hlo->shape().dimensions(output_sharded_dim); + auto input_shard_shape = + MakePartitionedShape(operand.base_shape(), operand.sharding()); + auto output_shard_shape = MakePartitionedShape(hlo->shape(), sharding); + if (input_dim_size % output_dim_size == 0) { + // Split dim. + int64 split_factor = input_dim_size / output_dim_size; + int64 output_shard_size = output_shard_shape.dimensions(output_sharded_dim); + // Use halo exchange to fix misaligned data. + Window window; + for (int64 i = 0; i < hlo->shape().rank(); ++i) { + WindowDimension* dim = window.add_dimensions(); + dim->set_size(1); + dim->set_stride(1); + dim->set_window_dilation(1); + dim->set_window_reversal(false); + dim->set_base_dilation(1); + dim->set_padding_low(0); + if (i == input_sharded_dim) { + dim->set_padding_high(output_shard_size * split_factor * + num_partitions_ - + input_dim_size); + } else { + dim->set_padding_high(0); + } + } + + auto reshard_operand = operand.ReshardAsWindowedInput( + window, operand.sharding(), + CreateZero(ShapeUtil::MakeShape(hlo->shape().element_type(), {}), &b_), + /*mask_invalid_region=*/false); + if (!reshard_operand.has_value()) { + return DefaultAction(hlo); + } + TF_RET_CHECK(!reshard_operand->dynamic_slice_index_on_output.has_value()); + CHECK_EQ( + reshard_operand->sharded_input->shape().dimensions(input_sharded_dim), + output_shard_size * split_factor); + SetPartitionedHlo(hlo, [&] { + // Do a local reshape. + return b_.AddInstruction(HloInstruction::CreateReshape( + output_shard_shape, reshard_operand->sharded_input)); + }); + return Status::OK(); + } else if (output_dim_size % input_dim_size == 0) { + // Merge dims. + int64 merge_factor = output_dim_size / input_dim_size; + // First reshape locally. (The sharded dimension could include padded data.) + auto tmp_shard_shape = output_shard_shape; + tmp_shard_shape.set_dimensions( + output_sharded_dim, + input_shard_shape.dimensions(input_sharded_dim) * merge_factor); + auto tmp_reshape = b_.AddInstruction( + HloInstruction::CreateReshape(tmp_shard_shape, operand.hlo())); + tmp_reshape->set_metadata(hlo->metadata()); + tmp_reshape->set_sharding(hlo->sharding()); + auto tmp_full_shape = tmp_shard_shape; + tmp_full_shape.set_dimensions( + output_sharded_dim, + tmp_shard_shape.dimensions(output_sharded_dim) * num_partitions_); + auto tmp_output = + PartitionedHlo(tmp_reshape, tmp_full_shape, MakePartitioningState()); + + // Use halo exchange to fix misaligned data. 
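A worked example of the padding arithmetic used in the split-dim and merge-dims paths, with hypothetical sizes (two partitions, [12] reshaped to [3, 4] and back); the assertions just restate the formulas above:

#include <cassert>
#include <cstdint>

int main() {
  const int64_t num_partitions = 2;

  // Split dim: input_dim_size = 12, output_dim_size = 3, split_factor = 4.
  {
    const int64_t input_dim_size = 12, output_dim_size = 3;
    const int64_t split_factor = input_dim_size / output_dim_size;  // 4
    const int64_t output_shard_size =
        (output_dim_size + num_partitions - 1) / num_partitions;    // 2
    // padding_high added to the input so every shard holds exactly
    // output_shard_size * split_factor elements before the local reshape.
    const int64_t padding_high =
        output_shard_size * split_factor * num_partitions - input_dim_size;
    assert(padding_high == 4);  // each shard reshapes [8] -> [2, 4]
  }

  // Merge dims: input_dim_size = 3, output_dim_size = 12, merge_factor = 4.
  {
    const int64_t input_dim_size = 3, output_dim_size = 12;
    const int64_t merge_factor = output_dim_size / input_dim_size;  // 4
    const int64_t input_shard_size =
        (input_dim_size + num_partitions - 1) / num_partitions;     // 2
    // Local reshape first: each shard becomes input_shard_size * merge_factor.
    const int64_t tmp_shard_size = input_shard_size * merge_factor; // 8
    // Negative padding_high: the halo exchange trims the excess (padded)
    // elements so the final shards cover exactly output_dim_size elements.
    const int64_t padding_high =
        output_dim_size - tmp_shard_size * num_partitions;
    assert(padding_high == -4);
  }
  return 0;
}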
+ Window window; + for (int64 i = 0; i < tmp_shard_shape.rank(); ++i) { + WindowDimension* dim = window.add_dimensions(); + dim->set_size(1); + dim->set_stride(1); + dim->set_window_dilation(1); + dim->set_window_reversal(false); + dim->set_base_dilation(1); + dim->set_padding_low(0); + if (i == output_sharded_dim) { + dim->set_padding_high(output_dim_size - + tmp_shard_shape.dimensions(output_sharded_dim) * + num_partitions_); + } else { + dim->set_padding_high(0); + } + } + + auto reshard_output = tmp_output.ReshardAsWindowedInput( + window, sharding, + CreateZero(ShapeUtil::MakeShape(hlo->shape().element_type(), {}), &b_), + /*mask_invalid_region=*/false); + if (!reshard_output.has_value()) { + return DefaultAction(hlo); + } + TF_RET_CHECK(!reshard_output->dynamic_slice_index_on_output.has_value()); + CHECK_EQ( + reshard_output->sharded_input->shape().dimensions(output_sharded_dim), + output_shard_shape.dimensions(output_sharded_dim)); + SetPartitionedHlo(hlo, [&] { return reshard_output->sharded_input; }); + return Status::OK(); + } + return DefaultAction(hlo); +} + +Status SpmdPartitioningVisitor::HandleIota(HloInstruction* hlo) { + const HloSharding& sharding = hlo->sharding(); + if (sharding.IsTileMaximal()) { + return DefaultAction(hlo); + } + + SetPartitionedHlo(hlo, [&] { + int64 dimension = Cast(hlo)->iota_dimension(); + auto iota = b_.AddInstruction(HloInstruction::CreateIota( + MakePartitionedShape(hlo->shape(), sharding), dimension)); + + if (sharding.tile_assignment().dim(dimension) > 1) { + auto partition_ordinals = + MakeTiledPartitionOrdinals(sharding, partition_id_, &b_); + auto multiplier = b_.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR0(iota->shape().dimensions(dimension)))); + auto offset = b_.AddInstruction(HloInstruction::CreateBinary( + ShapeUtil::MakeShape(S32, {}), HloOpcode::kMultiply, + partition_ordinals[dimension], multiplier)); + if (iota->shape().element_type() != S32) { + offset = b_.AddInstruction(HloInstruction::CreateConvert( + ShapeUtil::MakeShape(iota->shape().element_type(), {}), offset)); + } + auto broadcast = b_.AddInstruction( + HloInstruction::CreateBroadcast(iota->shape(), offset, {})); + return b_.AddInstruction(HloInstruction::CreateBinary( + iota->shape(), HloOpcode::kAdd, iota, broadcast)); + } + + return iota; + }); + + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleSingleDevice(const HloInstruction* hlo) { + TF_RET_CHECK(hlo->sharding().HasUniqueDevice()); + int64 device = hlo->sharding().GetUniqueDevice(); + const HloSharding sharding = HloSharding::AssignDevice(device); + + std::vector operands; + std::vector operand_shapes; + for (const HloInstruction* operand : hlo->operands()) { + operands.push_back(GetPartitionedHlo(operand).Reshard(sharding).hlo()); + operand_shapes.push_back(operand->shape()); + } + auto operand = b_.AddInstruction(HloInstruction::CreateTuple(operands)); + auto operand_shape = ShapeUtil::MakeTupleShape(operand_shapes); + + auto on_device = b_.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(device))); + auto pred = b_.AddInstruction(HloInstruction::CreateCompare( + ShapeUtil::MakeShape(PRED, {}), partition_id_, on_device, + ComparisonDirection::kEq)); + + SpmdBuilder true_b("true_computation", visiting_hlo_); + HloComputation* true_computation; + { + auto param = true_b.AddInstruction(HloInstruction::CreateParameter( + /*parameter_number=*/0, operand_shape, "true_branch_param")); + std::vector new_operands; + for (int64 i = 0; i < operands.size(); 
++i) { + new_operands.push_back(true_b.AddInstruction( + HloInstruction::CreateGetTupleElement(operand_shapes[i], param, i))); + } + auto root = true_b.AddInstruction( + hlo->CloneWithNewOperands(hlo->shape(), new_operands)); + true_computation = module_->AddEmbeddedComputation(true_b.Build(root)); + } + + SpmdBuilder false_b("false_computation", visiting_hlo_); + HloComputation* false_computation; + { + false_b.AddInstruction(HloInstruction::CreateParameter( + /*parameter_number=*/0, operand_shape, "false_branch_param")); + auto root = CreateZero(hlo->shape(), &false_b); + false_computation = module_->AddEmbeddedComputation(false_b.Build(root)); + } + + SetPartitionedHlo(hlo, [&]() { + return b_.AddInstruction(HloInstruction::CreateConditional( + hlo->shape(), pred, operand, true_computation, operand, + false_computation)); + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleAllReduce(HloInstruction* hlo) { + if (hlo->IsCrossReplicaAllReduce() && hlo->operand_count() == 1) { + return HandleElementwise(hlo); + } + return DefaultAction(hlo); +} + +Status SpmdPartitioningVisitor::HandleBroadcast(HloInstruction* hlo) { + if (hlo->sharding().IsTileMaximal()) { + return DefaultAction(hlo); + } + + auto& operand = GetPartitionedHlo(hlo->operand(0)); + + // Tiled output. + std::vector wanted_input_tile_size(operand.base_shape().rank()); + std::vector sharded_new_dims; + for (int64 i = 0; i < operand.base_shape().rank(); ++i) { + wanted_input_tile_size[i] = + hlo->sharding().tile_assignment().dim(hlo->dimensions(i)); + } + for (int64 i = 0; i < hlo->shape().rank(); ++i) { + if (!absl::c_linear_search(hlo->dimensions(), i) && + hlo->sharding().tile_assignment().dim(i) > 1) { + sharded_new_dims.push_back(i); + } + } + if (sharded_new_dims.empty()) { + // The new dimensions are replicated, so that we can do the adjustment on + // the input. + Array wanted_input_tile_assignment(wanted_input_tile_size); + wanted_input_tile_assignment.Each( + [&](absl::Span indices, int64* val) { + std::vector indices_in_broadcast(hlo->shape().rank(), 0); + for (int64 i = 0; i < operand.base_shape().rank(); ++i) { + indices_in_broadcast[hlo->dimensions(i)] = indices[i]; + } + *val = hlo->sharding().tile_assignment()(indices_in_broadcast); + }); + SetPartitionedHlo(hlo, [&] { + return b_.AddInstruction(hlo->CloneWithNewOperands( + MakePartitionedShape(hlo->shape(), hlo->sharding()), + {operand.Reshard(HloSharding::Tile(wanted_input_tile_assignment)) + .hlo()})); + }); + } else { + auto input = operand.Reshard(HloSharding::Replicate()).hlo(); + // We pad and shard the input first, then broadcast to the final shard + // shape. + auto output_offsets = + MakePartitionOffsets(hlo->shape(), hlo->sharding(), partition_id_, &b_); + std::vector input_offsets(operand.base_shape().rank()); + auto output_shard_shape = + MakePartitionedShape(hlo->shape(), hlo->sharding()); + auto input_shard_shape = input->shape(); + auto padded_input_shape = input->shape(); + for (int64 i = 0; i < input_offsets.size(); ++i) { + input_offsets[i] = output_offsets[hlo->dimensions(i)]; + input_shard_shape.set_dimensions( + i, output_shard_shape.dimensions(hlo->dimensions(i))); + padded_input_shape.set_dimensions( + i, hlo->sharding().tile_assignment().dim(hlo->dimensions(i)) * + input_shard_shape.dimensions(i)); + } + auto padded_input = PadToShape(input, padded_input_shape, &b_); + auto input_shard = + ShapeUtil::Compatible(input_shard_shape, padded_input->shape()) + ? 
padded_input + : b_.AddInstruction(HloInstruction::CreateDynamicSlice( + input_shard_shape, padded_input, input_offsets, + input_shard_shape.dimensions())); + SetPartitionedHlo(hlo, [&] { + return b_.AddInstruction( + hlo->CloneWithNewOperands(output_shard_shape, {input_shard})); + }); + } + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleConstant(HloInstruction* hlo) { + const Literal& literal = hlo->literal(); + if (literal.shape().IsTuple() || + (!hlo->sharding().IsTileMaximal() && + (!EvenlyPartitions(hlo->shape(), hlo->sharding()) || + !literal.IsAllFirst()))) { + return DefaultAction(hlo); + } + + SetPartitionedHlo(hlo, [&]() { + auto shard_shape = MakePartitionedShape(hlo->shape(), hlo->sharding()); + std::vector start_indices(hlo->shape().rank(), 0); + auto constant = b_.AddInstruction(HloInstruction::CreateConstant( + literal.Slice(start_indices, shard_shape.dimensions()))); + *constant->mutable_shape() = shard_shape; + return constant; + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleDynamicSlice(HloInstruction* hlo) { + if (hlo->sharding().IsTileMaximal()) { + return DefaultAction(hlo); + } + for (int64 i = 0; i < hlo->shape().rank(); ++i) { + if (hlo->sharding().tile_assignment().dim(i) != 1 && + (hlo->dynamic_slice_sizes()[i] != hlo->shape().dimensions(i) || + !hlo->operand(i + 1)->IsConstant() || + !hlo->operand(i + 1)->literal().IsZero({}))) { + // We currently do not partition the sliced dimensions. + return DefaultAction(hlo); + } + } + std::vector new_indices(hlo->shape().rank()); + auto new_input = + GetPartitionedHlo(hlo->operand(0)).Reshard(hlo->sharding()).hlo(); + for (int64 i = 0; i < new_indices.size(); ++i) { + // Replicate the indices. + new_indices[i] = GetPartitionedHlo(hlo->operand(i + 1)) + .Reshard(HloSharding::Replicate()) + .hlo(); + } + SetPartitionedHlo(hlo, [&]() { + auto partitioned_shape = + MakePartitionedShape(hlo->shape(), hlo->sharding()); + return b_.AddInstruction(HloInstruction::CreateDynamicSlice( + partitioned_shape, new_input, new_indices, + partitioned_shape.dimensions())); + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleDynamicUpdateSlice(HloInstruction* hlo) { + if (hlo->sharding().IsTileMaximal()) { + return DefaultAction(hlo); + } + for (int64 i = 0; i < hlo->shape().rank(); ++i) { + if (hlo->sharding().tile_assignment().dim(i) != 1 && + (hlo->operand(1)->shape().dimensions(i) != hlo->shape().dimensions(i) || + !hlo->operand(i + 2)->IsConstant() || + !hlo->operand(i + 2)->literal().IsZero({}))) { + // We currently do not partition the sliced dimensions. + return DefaultAction(hlo); + } + } + std::vector new_indices(hlo->shape().rank()); + auto new_input = + GetPartitionedHlo(hlo->operand(0)).Reshard(hlo->sharding()).hlo(); + auto new_update = + GetPartitionedHlo(hlo->operand(1)).Reshard(hlo->sharding()).hlo(); + for (int64 i = 0; i < new_indices.size(); ++i) { + // Replicate the indices. 
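As a rough sketch of the per-dimension condition checked above (hypothetical struct and values; the real check reads the HLO operands directly): a dimension may be tiled only if the slice leaves it untouched, so the dynamic-slice can be applied shard-locally.

#include <cstdint>
#include <iostream>
#include <vector>

struct DimInfo {
  int64_t tiles;             // sharding.tile_assignment().dim(i)
  int64_t full_size;         // hlo->shape().dimensions(i)
  int64_t slice_size;        // hlo->dynamic_slice_sizes()[i]
  bool start_is_const_zero;  // operand(i + 1) is the constant 0
};

bool CanPartitionDynamicSlice(const std::vector<DimInfo>& dims) {
  for (const DimInfo& d : dims) {
    if (d.tiles != 1 &&
        (d.slice_size != d.full_size || !d.start_is_const_zero)) {
      return false;  // a sliced dimension would have to be partitioned
    }
  }
  return true;
}

int main() {
  // Dim 0 tiled but untouched by the slice; dim 1 sliced but replicated: OK.
  std::cout << CanPartitionDynamicSlice({{4, 16, 16, true}, {1, 10, 3, false}})
            << "\n";  // 1
  // Dim 0 tiled and actually sliced: falls back to the default action.
  std::cout << CanPartitionDynamicSlice({{4, 16, 8, true}}) << "\n";  // 0
}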
+ new_indices[i] = GetPartitionedHlo(hlo->operand(i + 2)) + .Reshard(HloSharding::Replicate()) + .hlo(); + } + SetPartitionedHlo(hlo, [&]() { + auto partitioned_shape = + MakePartitionedShape(hlo->shape(), hlo->sharding()); + return b_.AddInstruction(HloInstruction::CreateDynamicUpdateSlice( + partitioned_shape, new_input, new_update, new_indices)); + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleGather(HloInstruction* hlo) { + auto gather = Cast(hlo); + const auto& dnums = gather->gather_dimension_numbers(); + auto operand = GetPartitionedHlo(gather->operand(0)); + auto indices = GetPartitionedHlo(gather->operand(1)); + std::vector collapsed_slice_dims(dnums.collapsed_slice_dims().begin(), + dnums.collapsed_slice_dims().end()); + std::vector start_index_map(dnums.start_index_map().begin(), + dnums.start_index_map().end()); + std::vector offset_dims(dnums.offset_dims().begin(), + dnums.offset_dims().end()); + if (!operand.sharding().IsTileMaximal()) { + auto maybe_passthrough = PassthroughOperandToGatherOutputOrScatterUpdate( + operand, gather->shape(), collapsed_slice_dims, start_index_map, + offset_dims, gather->gather_slice_sizes()); + if (maybe_passthrough.has_value()) { + indices = indices.Reshard(HloSharding::Replicate()); + auto pshape = MakePartitionedShape(gather->shape(), *maybe_passthrough); + std::vector pslice_sizes(gather->gather_slice_sizes().begin(), + gather->gather_slice_sizes().end()); + for (int64 i = 0; i < pslice_sizes.size(); ++i) { + if (operand.sharding().tile_assignment().dim(i) > 1) { + pslice_sizes[i] = operand.hlo()->shape().dimensions(i); + } + } + auto pgather = b_.AddInstruction(HloInstruction::CreateGather( + pshape, operand.hlo(), indices.hlo(), dnums, pslice_sizes, + gather->indices_are_sorted())); + pgather->set_sharding(*maybe_passthrough); + SetPartitionedHlo(hlo, [&]() { + return PartitionedHlo(pgather, hlo->shape(), MakePartitioningState()) + .Reshard(hlo->sharding()) + .hlo(); + }); + return Status::OK(); + } + if (GatherScatterOperandPartitionedOnlyOnTrivialSliceDims( + operand, start_index_map, gather->gather_slice_sizes(), + num_partitions_) && + ShapeUtil::ByteSizeOf(gather->shape()) < + ShapeUtil::ByteSizeOf(gather->operand(0)->shape())) { + indices = indices.Reshard(HloSharding::Replicate()); + // Now the operand is partitioned in trivial slice dimensions, and the + // indices are replicated. We execute a gather on partitioned operand, + // with full number of indices, where out-of-bounds indices are clamped, + // and masked out with 0 in the result; then we use all-reduce to combine + // results. Although gather will not get faster, we avoided the need to + // replicate the operand. + HloInstruction* indices_min; + HloInstruction* indices_max; + std::tie(indices_min, indices_max) = + IndexBoundsForGatherScatterOperandPartitionedOnTrivialSliceDims( + operand, indices, partition_id_, start_index_map, + dnums.index_vector_dim(), &b_); + // Clamp the indices. + auto adjusted_indices = b_.AddInstruction(HloInstruction::CreateTernary( + indices.base_shape(), HloOpcode::kClamp, indices_min, indices.hlo(), + indices_max)); + // Adjust the indices by subtracting the offset. + adjusted_indices = b_.AddInstruction(HloInstruction::CreateBinary( + indices.base_shape(), HloOpcode::kSubtract, adjusted_indices, + indices_min)); + // Gather on adjusted indices. 
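A standalone simulation of the clamp/adjust/mask/all-reduce scheme described above, with a hypothetical 1-D operand split across two partitions and plain loops standing in for the HLO gather and the cross-partition all-reduce:

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // Global operand and replicated gather indices.
  const std::vector<float> operand = {10, 11, 12, 13, 14, 15, 16, 17};
  const std::vector<int64_t> indices = {0, 5, 3, 7};
  const int64_t num_partitions = 2;
  const int64_t shard_size = operand.size() / num_partitions;

  std::vector<float> all_reduced(indices.size(), 0.0f);
  for (int64_t pid = 0; pid < num_partitions; ++pid) {
    // This partition's slice of the operand and its index bounds.
    const int64_t index_min = pid * shard_size;
    const int64_t index_max = index_min + shard_size - 1;
    std::vector<float> shard(operand.begin() + index_min,
                             operand.begin() + index_min + shard_size);
    for (size_t i = 0; i < indices.size(); ++i) {
      // Clamp the index, shift it into shard-local coordinates, gather.
      int64_t clamped = std::min(std::max(indices[i], index_min), index_max);
      float gathered = shard[clamped - index_min];
      // Mask out results whose original index fell outside this shard.
      bool out_of_range = indices[i] < index_min || indices[i] > index_max;
      float local = out_of_range ? 0.0f : gathered;
      // Cross-partition all-reduce (sum): exactly one partition contributes
      // a non-zero value per index.
      all_reduced[i] += local;
    }
  }
  for (float v : all_reduced) std::cout << v << " ";  // 10 15 13 17
  std::cout << "\n";
}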
+ auto pgather = b_.AddInstruction(HloInstruction::CreateGather( + gather->shape(), operand.hlo(), adjusted_indices, dnums, + gather->gather_slice_sizes(), gather->indices_are_sorted())); + // Mask out invalid results. + auto filter = b_.AddInstruction(HloInstruction::CreateCompare( + ShapeUtil::ChangeElementType(indices.base_shape(), PRED), + indices.hlo(), indices_min, ComparisonDirection::kLt)); + filter = b_.AddInstruction(HloInstruction::CreateBinary( + filter->shape(), HloOpcode::kOr, filter, + b_.AddInstruction(HloInstruction::CreateCompare( + ShapeUtil::ChangeElementType(indices.base_shape(), PRED), + indices.hlo(), indices_max, ComparisonDirection::kGt)))); + if (dnums.index_vector_dim() < indices.base_shape().rank()) { + std::vector reduced_filter_dims; + for (int64 i = 0; i < filter->shape().rank(); ++i) { + if (i != dnums.index_vector_dim()) { + reduced_filter_dims.push_back(filter->shape().dimensions(i)); + } + } + filter = b_.AddInstruction(HloInstruction::CreateReduce( + ShapeUtil::MakeShape(PRED, reduced_filter_dims), filter, + CreateR0WithType(PRED, false, &b_), {dnums.index_vector_dim()}, + MakeBinaryAdd(PRED, module_))); + } + std::vector batch_dims; + for (int64 i = 0; i < pgather->shape().rank(); ++i) { + if (!absl::c_linear_search(dnums.offset_dims(), i)) { + batch_dims.push_back(i); + } + } + auto broadcast_filter = b_.AddInstruction(HloInstruction::CreateBroadcast( + ShapeUtil::ChangeElementType(pgather->shape(), PRED), filter, + batch_dims)); + auto filtered = b_.AddInstruction(HloInstruction::CreateTernary( + pgather->shape(), HloOpcode::kSelect, broadcast_filter, + CreateZero(pgather->shape(), &b_), pgather)); + // Combine from different partitions. + auto ar = collective_ops_creator_.create_cross_partition_all_reduce( + &b_, filtered, + MakeBinaryAdd(filtered->shape().element_type(), module_), + NewChannel()); + ar->set_sharding(HloSharding::Replicate()); + SetPartitionedHlo(hlo, [&]() { + return PartitionedHlo(ar, hlo->shape(), MakePartitioningState()) + .Reshard(hlo->sharding()) + .hlo(); + }); + return Status::OK(); + } + } + return DefaultAction(hlo); +} + +Status SpmdPartitioningVisitor::HandleGetTupleElement(HloInstruction* hlo) { + const auto& tuple = GetPartitionedHlo(hlo->operand(0)); + auto gte = b_.AddInstruction(HloInstruction::CreateGetTupleElement( + ShapeUtil::GetTupleElementShape(tuple.hlo()->shape(), hlo->tuple_index()), + tuple.hlo(), hlo->tuple_index())); + SetPartitionedHlo(hlo, [&]() { + const auto source_sharding = tuple.sharding().GetSubSharding( + tuple.base_shape(), {hlo->tuple_index()}); + gte->set_sharding(source_sharding); + PartitionedHlo source_partitioned_gte(gte, hlo->shape(), + MakePartitioningState()); + return source_partitioned_gte.Reshard(hlo->sharding()).hlo(); + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleInfeed(HloInstruction* hlo) { + const Shape& shape = ShapeUtil::GetTupleElementShape(hlo->shape(), 0); + auto token = GetPartitionedHlo(hlo->operand(0)).hlo(); + if (ShapeUtil::GetLeafCount(shape) == 0) { + // TODO(b/155819021): HloSharding has issues with tuple-shaped sharding: it + // requires one element for an empty tuple, but leaf-count number of + // elements for non-empty tuple. So if it has a nested empty tuple, we + // cannot invoke GetSubSharding() since it expects a sharding for the empty + // tuple. This is a workaround for that case. 
+ SetPartitionedHlo(hlo, [&]() { + return b_.AddInstruction( + HloInstruction::CreateInfeed(shape, token, hlo->infeed_config())); + }); + return Status::OK(); + } + auto sharding = hlo->sharding().GetSubSharding(hlo->shape(), {0}); + auto shard_shape = MakePartitionedShape(shape, sharding); + if (EvenlyPartitions(shape, sharding)) { + SetPartitionedHlo(hlo, [&]() { + return b_.AddInstruction(HloInstruction::CreateInfeed( + shard_shape, token, hlo->infeed_config())); + }); + return Status::OK(); + } + + if (hlo->sharding().HasUniqueDevice()) { + return HandleSingleDevice(hlo); + } + + // Create a branch for each unique partitioned shape. + std::vector per_branch_partitioned_shapes; + std::vector conditional_branch_indices(num_partitions_); + for (int64 i = 0; i < num_partitions_; ++i) { + auto partitioned_shape = + MakeNonPaddedShapeForGivenPartition(shape, sharding, i); + int64 matching_existing_index = 0; + for (; matching_existing_index < per_branch_partitioned_shapes.size(); + ++matching_existing_index) { + if (ShapeUtil::Compatible( + partitioned_shape, + per_branch_partitioned_shapes[matching_existing_index])) { + break; + } + } + if (matching_existing_index < per_branch_partitioned_shapes.size()) { + conditional_branch_indices[i] = matching_existing_index; + } else { + conditional_branch_indices[i] = per_branch_partitioned_shapes.size(); + per_branch_partitioned_shapes.push_back(std::move(partitioned_shape)); + } + } + + HloInstruction* branch_index; + if (per_branch_partitioned_shapes.size() == num_partitions_) { + // Use partition ID as the branch index if each partition has its own + // branch. + branch_index = partition_id_; + // PartitionId's output is U32 but conditional requires S32. + if (branch_index->shape().element_type() != S32) { + branch_index = b_.AddInstruction(HloInstruction::CreateConvert( + ShapeUtil::ChangeElementType(branch_index->shape(), S32), + branch_index)); + } + } else { + // Otherwise, use a constant table to look up the branch index. + auto branch_index_table = b_.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR1(conditional_branch_indices))); + branch_index = b_.AddInstruction(HloInstruction::CreateDynamicSlice( + ShapeUtil::MakeShape(S32, {1}), branch_index_table, {partition_id_}, + {1})); + branch_index = b_.AddInstruction(HloInstruction::CreateReshape( + ShapeUtil::MakeShape(S32, {}), branch_index)); + } + + std::vector branches(per_branch_partitioned_shapes.size()); + for (int64 i = 0; i < branches.size(); ++i) { + SpmdBuilder branch_b(absl::StrCat("infeed_branch_", i), visiting_hlo_); + auto param = branch_b.AddInstruction(HloInstruction::CreateParameter( + /*parameter_number=*/0, token->shape(), "infeed_token_param")); + auto infeed = branch_b.AddInstruction(HloInstruction::CreateInfeed( + per_branch_partitioned_shapes[i], param, hlo->infeed_config())); + branches[i] = module_->AddEmbeddedComputation(branch_b.Build(infeed)); + if (!ShapeUtil::Compatible(per_branch_partitioned_shapes[i], shard_shape)) { + TF_ASSIGN_OR_RETURN( + auto padded, + branches[i]->DeepCopyInstructionWithCustomCopier( + infeed, [&](HloInstruction* leaf, const ShapeIndex& leaf_index, + HloComputation* comp) { + // Index {1} corresponds to the token. 
+ if (leaf_index.empty() || leaf_index[0] != 0) { + return leaf; + } + ShapeIndexView subindex(leaf_index, 1); + if (ShapeUtil::Compatible( + ShapeUtil::GetSubshape(per_branch_partitioned_shapes[i], + subindex), + ShapeUtil::GetSubshape(shard_shape, subindex))) { + return leaf; + } + return PadToShape(leaf, + ShapeUtil::GetSubshape(shard_shape, subindex), + nullptr, comp); + })); + branches[i]->set_root_instruction(padded, + /*accept_different_shape=*/true); + } + } + SetPartitionedHlo(hlo, [&]() { + return b_.AddInstruction(HloInstruction::CreateConditional( + ShapeUtil::MakeTupleShape({shard_shape, token->shape()}), branch_index, + branches, std::vector(branches.size(), token))); + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandlePad(HloInstruction* hlo) { + if (hlo->sharding().IsTileMaximal()) { + return DefaultAction(hlo); + } + for (int64 i = 0; i < hlo->shape().rank(); ++i) { + const auto& pd = hlo->padding_config().dimensions(i); + // Right now we only support non-padded dimensions to be partitioned. + if (hlo->sharding().tile_assignment().dim(i) > 1 && + (pd.edge_padding_high() != 0 || pd.edge_padding_low() != 0 || + pd.interior_padding() != 0)) { + return DefaultAction(hlo); + } + } + auto resharded_lhs = + GetPartitionedHlo(hlo->operand(0)).Reshard(hlo->sharding()).hlo(); + auto replicated_rhs = GetPartitionedHlo(hlo->operand(1)) + .Reshard(HloSharding::Replicate()) + .hlo(); + SetPartitionedHlo(hlo, [&]() { + auto shard_shape = MakePartitionedShape(hlo->shape(), hlo->sharding()); + return b_.AddInstruction(hlo->CloneWithNewOperands( + shard_shape, {resharded_lhs, replicated_rhs})); + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleParameter(HloInstruction* hlo) { + SetPartitionedHlo(hlo, [&]() { + auto shard_shape = MakePartitionedShape(hlo->shape(), hlo->sharding()); + auto new_param = b_.AddInstruction(HloInstruction::CreateParameter( + hlo->parameter_number(), shard_shape, "param")); + if (hlo->parameter_replicated_at_leaf_buffers()) { + new_param->set_parameter_replicated_at_leaf_buffers( + *hlo->parameter_replicated_at_leaf_buffers()); + } + return new_param; + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleReduce(HloInstruction* hlo) { + int64 input_count = 1; + auto per_input_sharding = hlo->sharding(); + if (hlo->shape().IsTuple()) { + input_count = hlo->shape().tuple_shapes_size(); + CHECK_GT(input_count, 0); + per_input_sharding = hlo->sharding().GetSubSharding(hlo->shape(), {0}); + } + + std::vector inputs; + std::vector inits; + for (int64 operand_id = 0; operand_id < input_count; ++operand_id) { + inits.push_back(GetPartitionedHlo(hlo->operand(operand_id + input_count)) + .Reshard(HloSharding::Replicate()) + .hlo()); + inputs.push_back(GetPartitionedHlo(hlo->operand(operand_id))); + if (operand_id > 0) { + // Make sure all operands are sharded in the same way. + inputs.back() = inputs.back().Reshard(inputs[0].sharding()); + } + if (!inputs[0].sharding().IsTileMaximal()) { + inputs.back() = inputs.back().PadWithValue(inits[operand_id]); + } + } + bool reduce_sharded_dimension = false; + if (!inputs[0].sharding().IsTileMaximal()) { + reduce_sharded_dimension = absl::c_any_of(hlo->dimensions(), [&](int64 i) { + return inputs[0].sharding().tile_assignment().dim(i) > 1; + }); + + // reduce_sharded_dimension is not supported for tuple-shaped reduces. 
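For the non-tuple case handled below, the idea is a shard-local reduce followed by a cross-partition all-reduce with the same reducer; a toy simulation with hypothetical values (addition over a [2, 6] input tiled 1x2, with shards already padded by the init value so padding is a no-op for the reducer):

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // Each partition holds a [2, 3] shard of the [2, 6] input. Reducing the
  // sharded dimension (dim 1) is a local reduce over the shard's columns,
  // then an all-reduce across partitions; the result is replicated.
  const std::vector<std::vector<float>> full = {{1, 2, 3, 4, 5, 6},
                                                {7, 8, 9, 10, 11, 12}};
  const int64_t num_partitions = 2, shard_cols = 3;

  std::vector<float> all_reduced(full.size(), 0.0f);
  for (int64_t pid = 0; pid < num_partitions; ++pid) {
    for (size_t row = 0; row < full.size(); ++row) {
      // Local reduce over this partition's columns.
      float local = 0.0f;
      for (int64_t c = 0; c < shard_cols; ++c) {
        local += full[row][pid * shard_cols + c];
      }
      // All-reduce (sum) across partitions.
      all_reduced[row] += local;
    }
  }
  for (float v : all_reduced) std::cout << v << " ";  // 21 57
  std::cout << "\n";
}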
+ if (reduce_sharded_dimension && input_count > 1) { + return DefaultAction(hlo); + } + + // Currently we only support reducing all or none of the sharded + // dimensions. + if (reduce_sharded_dimension) { + for (int64 i = 0; i < inputs[0].base_shape().rank(); ++i) { + if (inputs[0].sharding().tile_assignment().dim(i) > 1 && + absl::c_count(hlo->dimensions(), i) == 0) { + return DefaultAction(hlo); + } + } + } + } + + std::vector new_operand_shapes(input_count * 2); + for (int64 i = 0; i < input_count; ++i) { + new_operand_shapes[i] = inputs[i].hlo()->mutable_shape(); + new_operand_shapes[i + input_count] = inits[i]->mutable_shape(); + } + // Create the shard shape of the reduce result. + TF_ASSIGN_OR_RETURN( + auto reduce_shape, + ShapeInference::InferReduceShape(new_operand_shapes, hlo->dimensions(), + hlo->to_apply()->ComputeProgramShape())); + *reduce_shape.mutable_layout() = hlo->shape().layout(); + + std::vector input_hlos(input_count); + for (int64 i = 0; i < input_count; ++i) { + input_hlos[i] = inputs[i].hlo(); + } + auto local_reduce = b_.AddInstruction(HloInstruction::CreateReduce( + reduce_shape, input_hlos, inits, hlo->dimensions(), hlo->to_apply())); + local_reduce->set_metadata(hlo->metadata()); + + SetPartitionedHlo(hlo, [&]() { + HloInstruction* reduce; + if (reduce_sharded_dimension) { + CHECK(local_reduce->shape().IsArray()); + reduce = collective_ops_creator_.create_cross_partition_all_reduce( + &b_, local_reduce, hlo->to_apply(), NewChannel()); + reduce->set_sharding(HloSharding::Replicate()); + } else { + reduce = local_reduce; + if (inputs[0].sharding().IsTileMaximal()) { + reduce->set_sharding(inputs[0].sharding()); + } else { + // Remove tile assignment dimensions that are reduced. + std::vector tile_dimensions; + for (int64 i = 0; i < input_hlos[0]->shape().rank(); ++i) { + if (absl::c_count(hlo->dimensions(), i) == 0) { + tile_dimensions.push_back( + inputs[0].sharding().tile_assignment().dim(i)); + } + } + Array new_tile = inputs[0].sharding().tile_assignment(); + new_tile.Reshape(tile_dimensions); + auto sharding = HloSharding::Tile(new_tile); + if (input_count > 1) { + std::vector tuple(input_count, sharding); + sharding = HloSharding::Tuple(hlo->shape(), tuple); + } + reduce->set_sharding(sharding); + } + } + + return PartitionedHlo(reduce, hlo->shape(), MakePartitioningState()) + .Reshard(hlo->sharding()) + .hlo(); + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleReverse(HloInstruction* hlo) { + auto reverse = Cast(hlo); + if (reverse->sharding().IsTileMaximal()) { + return DefaultAction(hlo); + } + if (absl::c_all_of(reverse->dimensions(), [&](int64 d) { + return reverse->sharding().tile_assignment().dim(d) == 1; + })) { + auto operand = + GetPartitionedHlo(reverse->operand(0)).Reshard(reverse->sharding()); + SetPartitionedHlo(hlo, [&] { + return b_.AddInstruction( + hlo->CloneWithNewOperands(operand.hlo()->shape(), {operand.hlo()})); + }); + return Status::OK(); + } + return DefaultAction(hlo); +} + +Status SpmdPartitioningVisitor::HandleWhile(HloInstruction* hlo) { + const HloSharding& sharding = hlo->sharding(); + + // Shardings for the body parameter, body root, and cond parameter must be + // the same, and the condition root must be replicated so that all partitions + // follow the same control flow. 
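A toy lockstep simulation of why the condition must evaluate identically on every partition; the OR-style combination below only illustrates what "replicated" means for the condition root, it is not how the partitioner implements replication:

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // Two partitions each own one element of a [2] counter. The loop condition
  // ("any element < 3") is evaluated from data every partition agrees on, so
  // all partitions execute the same number of iterations.
  std::vector<int64_t> shard = {0, 2};  // partition p owns shard[p]
  const int64_t num_partitions = 2;

  int64_t iterations = 0;
  while (true) {
    // Each partition computes its local predicate...
    std::vector<bool> local_pred(num_partitions);
    for (int64_t p = 0; p < num_partitions; ++p) local_pred[p] = shard[p] < 3;
    // ...and the replicated condition is the combination across partitions,
    // so every partition sees the same value and stays in lockstep.
    bool replicated_cond = false;
    for (bool b : local_pred) replicated_cond = replicated_cond || b;
    if (!replicated_cond) break;
    for (int64_t p = 0; p < num_partitions; ++p) ++shard[p];
    ++iterations;
  }
  std::cout << "iterations=" << iterations << "\n";  // 3 on every partition
}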
+ hlo->while_condition()->parameter_instruction(0)->set_sharding(sharding); + hlo->while_body()->parameter_instruction(0)->set_sharding(sharding); + TF_RETURN_IF_ERROR(partitioner_ + ->PartitionComputation(hlo->while_condition(), + HloSharding::Replicate(), + next_channel_id_, logger_) + .status()); + TF_RETURN_IF_ERROR(partitioner_ + ->PartitionComputation(hlo->while_body(), sharding, + next_channel_id_, logger_) + .status()); + SetPartitionedHlo(hlo, [&] { + return b_.AddInstruction(HloInstruction::CreateWhile( + MakePartitionedShape(hlo->shape(), sharding), hlo->while_condition(), + hlo->while_body(), + GetPartitionedHlo(hlo->operand(0)).Reshard(sharding).hlo())); + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleConditional(HloInstruction* hlo) { + std::vector branch_args; + for (int64 i = 0; i < hlo->branch_count(); ++i) { + HloComputation* computation = hlo->branch_computation(i); + + // Shardings of the branch computation parameter and its argument must be + // the same. + computation->parameter_instruction(0)->set_sharding( + hlo->operand(i + 1)->sharding()); + branch_args.push_back(GetPartitionedHlo(hlo->operand(i + 1)).hlo()); + } + + // The root of the branch computations must follow the sharding of the + // conditional instruction. + for (int64 i = 0; i < hlo->branch_count(); ++i) { + HloComputation* computation = hlo->branch_computation(i); + TF_RETURN_IF_ERROR(partitioner_ + ->PartitionComputation(computation, hlo->sharding(), + next_channel_id_, logger_) + .status()); + } + + // We replicate the predicate of the conditional (the first operand) so that + // all partitions follow the same control flow. + SetPartitionedHlo(hlo, [&] { + return b_.AddInstruction(HloInstruction::CreateConditional( + MakePartitionedShape(hlo->shape(), hlo->sharding()), + GetPartitionedHlo(hlo->operand(0)) + .Reshard(HloSharding::Replicate()) + .hlo(), + hlo->called_computations(), branch_args)); + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleOutfeed(HloInstruction* hlo) { + TF_RET_CHECK(hlo->sharding().HasUniqueDevice()); + return HandleSingleDevice(hlo); +} + +Status SpmdPartitioningVisitor::HandleRng(HloInstruction* hlo) { + if (hlo->sharding().HasUniqueDevice()) { + return HandleSingleDevice(hlo); + } + + if (hlo->sharding().IsReplicated()) { + SetPartitionedHlo(hlo, [&] { + // Run on a single device (0) and distribute the data to all other cores. + std::vector new_operands; + for (int64 i = 0; i < hlo->operand_count(); ++i) { + new_operands.push_back(GetPartitionedHlo(hlo->operand(i)) + .Reshard(HloSharding::AssignDevice(0)) + .hlo()); + } + auto clone = b_.AddInstruction( + hlo->CloneWithNewOperands(hlo->shape(), new_operands)); + clone->set_sharding(HloSharding::AssignDevice(0)); + return PartitionedHlo(clone, hlo->shape(), MakePartitioningState()) + .Reshard(HloSharding::Replicate()) + .hlo(); + }); + return Status::OK(); + } + + TF_RET_CHECK(!hlo->sharding().IsTileMaximal()); + SetPartitionedHlo(hlo, [&] { + // Replicate the operands and run partitioned Rng on all devices. 
+ std::vector new_operands; + for (int64 i = 0; i < hlo->operand_count(); ++i) { + new_operands.push_back(GetPartitionedHlo(hlo->operand(i)) + .Reshard(HloSharding::Replicate()) + .hlo()); + } + return b_.AddInstruction(HloInstruction::CreateRng( + MakePartitionedShape(hlo->shape(), hlo->sharding()), + hlo->random_distribution(), new_operands)); + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleReduceWindow(HloInstruction* hlo) { + auto& operand = GetPartitionedHlo(hlo->operand(0)); + if (hlo->sharding().IsTileMaximal()) { + return DefaultAction(hlo); + } + + // Replicate init + auto replicated_init = GetPartitionedHlo(hlo->mutable_operand(1)) + .Reshard(HloSharding::Replicate()); + auto resharded_operand_and_window = operand.ReshardAsWindowedInput( + hlo->window(), hlo->sharding(), replicated_init.hlo()); + if (!resharded_operand_and_window.has_value()) { + return DefaultAction(hlo); + } + + TF_ASSIGN_OR_RETURN(Shape sharded_rw_shape, + ShapeInference::InferReduceWindowShape( + resharded_operand_and_window->sharded_input->shape(), + replicated_init.hlo()->shape(), + resharded_operand_and_window->shard_window, + hlo->to_apply()->ComputeProgramShape())); + auto shard_shape = MakePartitionedShape(hlo->shape(), hlo->sharding()); + *sharded_rw_shape.mutable_layout() = shard_shape.layout(); + SetPartitionedHlo(hlo, [&]() { + auto sharded_rw = b_.AddInstruction(HloInstruction::CreateReduceWindow( + sharded_rw_shape, resharded_operand_and_window->sharded_input, + replicated_init.hlo(), resharded_operand_and_window->shard_window, + hlo->to_apply())); + if (!resharded_operand_and_window->dynamic_slice_index_on_output + .has_value()) { + CHECK(ShapeUtil::Compatible(shard_shape, sharded_rw->shape())); + return sharded_rw; + } + return b_.AddInstruction(HloInstruction::CreateDynamicSlice( + shard_shape, sharded_rw, + *resharded_operand_and_window->dynamic_slice_index_on_output, + shard_shape.dimensions())); + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleSelectAndScatter(HloInstruction* hlo) { + if (hlo->sharding().IsTileMaximal()) { + return DefaultAction(hlo); + } + auto operand = GetPartitionedHlo(hlo->operand(0)); + auto source = GetPartitionedHlo(hlo->mutable_operand(1)); + if (hlo->sharding() != operand.sharding()) { + operand = operand.Reshard(hlo->sharding()); + } + if (hlo->sharding() != source.sharding()) { + source = source.Reshard(hlo->sharding()); + } + + // For F32 and BF16 types, we can use NaN padding to workaround the issue with + // low/high padding, since comparison will return false with NaN input. 
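To make the padding argument concrete, a tiny standalone check that -infinity (or NaN) can never be chosen by a greater-or-equal select, so padded elements cannot contaminate the scatter:

#include <iostream>
#include <limits>

int main() {
  // For a "greater-or-equal" select, -infinity loses against every real
  // element; NaN would also work because any comparison against NaN is false.
  const float pad = -std::numeric_limits<float>::infinity();
  const float nan = std::numeric_limits<float>::quiet_NaN();
  const float x = -1e30f;

  std::cout << (x >= pad) << "\n";  // 1: the real element wins
  std::cout << (pad >= x) << "\n";  // 0: padding never wins
  std::cout << (nan >= x) << "\n";  // 0: comparisons with NaN are false
  std::cout << (x >= nan) << "\n";  // 0
}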
+ if (hlo->shape().element_type() != F32 && + hlo->shape().element_type() != BF16) { + return DefaultAction(hlo); + } + + auto select = hlo->called_computations()[0]; + auto select_root = select->root_instruction(); + if (select_root->opcode() != HloOpcode::kCompare || + select_root->operand(0)->opcode() != HloOpcode::kParameter || + select_root->operand(1)->opcode() != HloOpcode::kParameter || + select_root->operand(0)->parameter_number() + + select_root->operand(1)->parameter_number() != + 1) { + return DefaultAction(hlo); + } + + float float_pad_value; + if (select_root->comparison_direction() == ComparisonDirection::kGe || + select_root->comparison_direction() == ComparisonDirection::kGt) { + if (select_root->operand(0)->parameter_number() == 0) { + float_pad_value = -std::numeric_limits::infinity(); + } else { + float_pad_value = std::numeric_limits::infinity(); + } + } else if (select_root->comparison_direction() == ComparisonDirection::kLe || + select_root->comparison_direction() == ComparisonDirection::kLt) { + if (select_root->operand(0)->parameter_number() == 0) { + float_pad_value = std::numeric_limits::infinity(); + } else { + float_pad_value = -std::numeric_limits::infinity(); + } + } else { + return DefaultAction(hlo); + } + + auto pad_value = b_.AddInstruction(HloInstruction::CreateConstant( + hlo->shape().element_type() == BF16 + ? LiteralUtil::CreateR0( + static_cast(float_pad_value)) + : LiteralUtil::CreateR0(float_pad_value))); + + // Replicate init + auto replicated_init = GetPartitionedHlo(hlo->mutable_operand(2)) + .Reshard(HloSharding::Replicate()); + + auto partition_ordinals = + MakeTiledPartitionOrdinals(hlo->sharding(), partition_id_, &b_); + + // The first window for each dimension that overlaps with the shard area. + std::vector first_window( + hlo->shape().rank()); + // The first window for each dimension that goes beyond with the shard area. + std::vector limit_window( + hlo->shape().rank()); + std::vector data_left_halo_sizes(hlo->shape().rank()); + std::vector data_right_halo_sizes(hlo->shape().rank()); + std::vector source_left_halo_sizes(hlo->shape().rank()); + std::vector source_right_halo_sizes(hlo->shape().rank()); + auto unpadded_data_shard_shape = + MakePartitionedShape(hlo->shape(), hlo->sharding()); + auto unpadded_source_shard_shape = + MakePartitionedShape(hlo->operand(1)->shape(), hlo->sharding()); + auto source_shard_hlo = source.hlo(); + auto data_shard_hlo = operand.hlo(); + for (int64 i = 0; i < hlo->shape().rank(); ++i) { + int64 shard_count = hlo->sharding().tile_assignment().dim(i); + if (shard_count == 1) { + continue; + } + // If stride > window_size, there will be gaps between windows. These gaps + // will also exist in the output, so we keep them during halo exchange. + // + // TODO(yuanzx): This could introduce overhead if partitions start at + // different offsets in a gap. 
+ auto wd = hlo->window().dimensions(i); + if (wd.stride() > wd.size()) { + wd.set_size(wd.stride()); + } + // shard_size * i < stride * k - pad_low + window_size => + // k > (shard_size * i + pad_low - window_size) / stride => + // first_k == (shard_size * i + pad_low - window_size + stride) / stride + first_window[i] = MultiplyAddDivideOffsetCalculation( + unpadded_data_shard_shape.dimensions(i), + wd.padding_low() - wd.size() + wd.stride(), wd.stride()); + // shard_size * (i + 1) <= stride * k - pad_low => + // k >= (shard_size * i + shard_size + pad_low) / stride => + // limit_k == (shard_size * i + shard_size + pad_low + stride - 1) / + // stride + limit_window[i] = MultiplyAddDivideOffsetCalculation( + unpadded_data_shard_shape.dimensions(i), + unpadded_data_shard_shape.dimensions(i) + wd.padding_low() + + wd.stride() - 1, + wd.stride()); + source_left_halo_sizes[i] = + MultiplyAddDivideOffsetCalculation( + unpadded_source_shard_shape.dimensions(i), 0, 1) - + first_window[i]; + source_right_halo_sizes[i] = + limit_window[i] - MultiplyAddDivideOffsetCalculation( + unpadded_source_shard_shape.dimensions(i), + unpadded_source_shard_shape.dimensions(i), 1); + data_left_halo_sizes[i] = + OffsetCalculation(MultiplyAddDivideOffsetCalculation( + unpadded_data_shard_shape.dimensions(i), wd.padding_low(), 1)) - + OffsetCalculation( + HloOpcode::kMultiply, first_window[i], + MultiplyAddDivideOffsetCalculation(0, wd.stride(), 1)); + data_right_halo_sizes[i] = + OffsetCalculation( + HloOpcode::kMultiply, limit_window[i], + MultiplyAddDivideOffsetCalculation(0, wd.stride(), 1)) - + OffsetCalculation(MultiplyAddDivideOffsetCalculation( + unpadded_data_shard_shape.dimensions(i), + unpadded_data_shard_shape.dimensions(i) + wd.stride() + + wd.padding_low() - wd.size(), + 1)); + + int64 max_windows = + (limit_window[i] - first_window[i]).MaxInRange(0, shard_count); + auto first_window_hlo = + first_window[i].Calculate(partition_ordinals[i], &b_); + // Padding on the source is filled with the init value so they do not change + // the data on overlapping windows. 
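A worked check of the first_window/limit_window formulas above, using a simplified stand-in for MultiplyAddDivideOffsetCalculation ((multiplier * i + offset) / divisor, as spelled out in the comments) and hypothetical window parameters:

#include <cassert>
#include <cstdint>

// Simplified stand-in: evaluates (multiplier * i + offset) / divisor for a
// shard ordinal i, matching the closed forms written out above.
int64_t MulAddDiv(int64_t multiplier, int64_t offset, int64_t divisor,
                  int64_t i) {
  return (multiplier * i + offset) / divisor;
}

int main() {
  // Hypothetical dimension: per-shard size 4, window size 3, stride 2,
  // low padding 1. Window k covers [k * stride - pad_low,
  // k * stride - pad_low + size - 1] = [2k - 1, 2k + 1].
  const int64_t shard = 4, size = 3, stride = 2, pad_low = 1;

  // first_window(i) = (shard * i + pad_low - size + stride) / stride
  assert(MulAddDiv(shard, pad_low - size + stride, stride, 0) == 0);
  assert(MulAddDiv(shard, pad_low - size + stride, stride, 1) == 2);

  // limit_window(i) = (shard * i + shard + pad_low + stride - 1) / stride
  assert(MulAddDiv(shard, shard + pad_low + stride - 1, stride, 0) == 3);
  assert(MulAddDiv(shard, shard + pad_low + stride - 1, stride, 1) == 5);

  // Shard 0 covers elements [0, 3]; windows 0..2 ([-1,1], [1,3], [3,5])
  // overlap it, and window 3 ([5,7]) is the first one past it.
  return 0;
}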
+ auto resharded_source = ExchangeHaloAndGetValidData( + source_shard_hlo, source.base_shape(), source_left_halo_sizes[i], + source_right_halo_sizes[i], 0, + limit_window[i].Calculate(shard_count - 1), max_windows, i, + hlo->sharding(), first_window_hlo, replicated_init.hlo(), + partition_ordinals[i], collective_ops_creator_, next_channel_id_, &b_); + if (!resharded_source) { + return DefaultAction(hlo); + } + source_shard_hlo = *resharded_source; + + auto offset_start_in_data = + MultiplyAddDivideOffsetCalculation(wd.stride(), 0, 1) + .Calculate(first_window_hlo, &b_); + int64 padded_data_size = + (limit_window[i].Calculate(shard_count - 1) - 1) * wd.stride() + + wd.size(); + int64 data_shard_size = (max_windows - 1) * wd.stride() + wd.size(); + auto resharded_data = ExchangeHaloAndGetValidData( + data_shard_hlo, operand.base_shape(), data_left_halo_sizes[i], + data_right_halo_sizes[i], wd.padding_low(), padded_data_size, + data_shard_size, i, hlo->sharding(), offset_start_in_data, pad_value, + partition_ordinals[i], collective_ops_creator_, next_channel_id_, &b_); + if (!resharded_data) { + return DefaultAction(hlo); + } + data_shard_hlo = *resharded_data; + } + + Window window_on_shard = hlo->window(); + for (int64 i = 0; i < window_on_shard.dimensions_size(); ++i) { + int64 shard_count = hlo->sharding().tile_assignment().dim(i); + if (shard_count == 1) { + continue; + } + auto reshard_wd = window_on_shard.mutable_dimensions(i); + // The shards are already explicitly padded. + reshard_wd->set_padding_low(0); + reshard_wd->set_padding_high(0); + } + + auto sharded_select_and_scatter = + b_.AddInstruction(HloInstruction::CreateSelectAndScatter( + data_shard_hlo->shape(), data_shard_hlo, select, window_on_shard, + source_shard_hlo, replicated_init.hlo(), + hlo->called_computations()[1])); + SetPartitionedHlo(hlo, [&]() { + auto shard_shape = MakePartitionedShape(hlo->shape(), hlo->sharding()); + if (ShapeUtil::Compatible(sharded_select_and_scatter->shape(), + shard_shape)) { + return sharded_select_and_scatter; + } + auto zero = b_.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::Zero(S32))); + std::vector slice_offsets(shard_shape.rank(), zero); + for (int64 i = 0; i < window_on_shard.dimensions_size(); ++i) { + if (hlo->sharding().tile_assignment().dim(i) == 1) { + continue; + } + int64 pad_low = hlo->window().dimensions(i).padding_low(); + auto left_halo_size = + data_left_halo_sizes[i].Calculate(partition_ordinals[i], &b_); + if (data_left_halo_sizes[i].Calculate(0) == pad_low) { + slice_offsets[i] = left_halo_size; + } else { + auto is_shard0 = b_.AddInstruction(HloInstruction::CreateCompare( + ShapeUtil::MakeShape(PRED, {}), zero, partition_ordinals[i], + ComparisonDirection::kEq)); + auto pad_low_hlo = b_.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR0(pad_low))); + slice_offsets[i] = b_.AddInstruction(HloInstruction::CreateTernary( + zero->shape(), HloOpcode::kSelect, is_shard0, pad_low_hlo, + left_halo_size)); + } + } + return b_.AddInstruction(HloInstruction::CreateDynamicSlice( + shard_shape, sharded_select_and_scatter, slice_offsets, + shard_shape.dimensions())); + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleTuple(HloInstruction* hlo) { + std::vector new_operands; + for (int64 i = 0; i < hlo->operand_count(); ++i) { + new_operands.push_back( + GetPartitionedHlo(hlo->operand(i)) + .Reshard(hlo->sharding().GetSubSharding(hlo->shape(), {i})) + .hlo()); + } + SetPartitionedHlo(hlo, [&]() { + return 
b_.AddInstruction(HloInstruction::CreateTuple(new_operands)); + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleConvolutionTiledLhsAndRhs( + HloInstruction* hlo) { + TF_RET_CHECK(hlo->opcode() == HloOpcode::kConvolution); + + auto lhs = GetPartitionedHlo(hlo->operand(0)); + auto rhs = GetPartitionedHlo(hlo->operand(1)); + TF_RET_CHECK(!lhs.sharding().IsTileMaximal() && + !rhs.sharding().IsTileMaximal()); + + const auto& dnums = hlo->convolution_dimension_numbers(); + + // Check if the operand shardings are aligned. Also we currently don't + // support partitioning non-spatial dimensions. + std::vector rhs_to_lhs_indices(hlo->shape().rank()); + rhs_to_lhs_indices[dnums.kernel_output_feature_dimension()] = + dnums.input_batch_dimension(); + rhs_to_lhs_indices[dnums.kernel_input_feature_dimension()] = + dnums.input_feature_dimension(); + for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { + rhs_to_lhs_indices[dnums.kernel_spatial_dimensions(i)] = + dnums.input_spatial_dimensions(i); + } + std::vector lhs_to_rhs_indices(hlo->shape().rank()); + for (int64 i = 0; i < rhs_to_lhs_indices.size(); ++i) { + lhs_to_rhs_indices[rhs_to_lhs_indices[i]] = i; + } + auto aligned_rhs_sharding = + hlo_sharding_util::TransposeSharding(lhs.sharding(), rhs_to_lhs_indices); + auto aligned_lhs_sharding = + hlo_sharding_util::TransposeSharding(rhs.sharding(), lhs_to_rhs_indices); + + auto unsupported_sharding = [&](const HloSharding& lhs_sharding, + const HloSharding& rhs_sharding) { + return lhs_sharding.tile_assignment().dim(dnums.input_batch_dimension()) != + 1 || + rhs_sharding.tile_assignment().dim( + dnums.kernel_output_feature_dimension()) != 1; + }; + + auto zero = b_.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::Zero(hlo->shape().element_type()))); + if (ShapeUtil::ByteSizeOf(lhs.base_shape()) < + ShapeUtil::ByteSizeOf(rhs.base_shape())) { + if (unsupported_sharding(aligned_lhs_sharding, rhs.sharding())) { + return DefaultAction(hlo); + } + lhs = lhs.Reshard(aligned_lhs_sharding).PadWithValue(zero); + rhs = rhs.PadWithValue(zero); + } else { + if (unsupported_sharding(lhs.sharding(), aligned_rhs_sharding)) { + return DefaultAction(hlo); + } + lhs = lhs.PadWithValue(zero); + rhs = rhs.Reshard(aligned_rhs_sharding).PadWithValue(zero); + } + + // Reshard LHS by exchanging halo such that each shard computes the partial + // sum of the full shape result, and add AllReduce. + // + // The size of halo on each dimension can be calculated from the projection + // onto the LHS that each RHS shard i needs to read. RHS and LHS below refers + // to the shard size of RHS and LHS, WC is the number of windows, and D is the + // window dilation. 
+ // + // * offset(i): RHS * D * i - low_padding + // * limit(i): {(RHS - 1) * D + 1} * (i + 1) + (WC - 1) * stride - low_padding + // + // Since shard i has LHS of range [i * LHS, (i + 1) * LHS) + // * left-halo: i * LHS - offset(i) + // = (LHS - RHS) * i + low_padding + // * right-halo: limit(i) - (i + 1) * LHS + // = [{(RHS - 1) * D + 1} - LHS] * (i + 1) + (WC - 1) * stride - low_padding + + Window window = hlo->window(); + std::vector shard_counts(dnums.input_spatial_dimensions_size()); + std::vector lhs_shard_sizes(dnums.input_spatial_dimensions_size()); + std::vector rhs_shard_sizes(dnums.input_spatial_dimensions_size()); + for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { + int64 lhs_dimension = dnums.input_spatial_dimensions(i); + int64 rhs_dimension = dnums.kernel_spatial_dimensions(i); + int64 shard_count = lhs.sharding().tile_assignment().dim(lhs_dimension); + auto wd = window.dimensions(i); + if (wd.base_dilation() != 1 || wd.window_reversal()) { + return DefaultAction(hlo); + } + + int64 lhs_shard_size = + CeilOfRatio(lhs.base_shape().dimensions(lhs_dimension), shard_count); + int64 rhs_shard_size = + CeilOfRatio(rhs.base_shape().dimensions(rhs_dimension), shard_count); + shard_counts[i] = shard_count; + lhs_shard_sizes[i] = lhs_shard_size; + rhs_shard_sizes[i] = rhs_shard_size; + } + + std::vector left_halo_size_functions(hlo->shape().rank()); + std::vector right_halo_size_functions(hlo->shape().rank()); + Window new_window = window; + + auto partition_ordinals = + MakeTiledPartitionOrdinals(lhs.sharding(), partition_id_, &b_); + HloInstruction* lhs_with_halo = lhs.hlo(); + for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { + int64 lhs_dimension = dnums.input_spatial_dimensions(i); + int64 lhs_shard_size = lhs_shard_sizes[i]; + int64 rhs_shard_size = rhs_shard_sizes[i]; + + if (shard_counts[i] == 1) { + continue; + } + + // Calculate the left and right halo sizes as described in the comments + // above. + auto wd = window.dimensions(i); + int64 padding_low = wd.padding_low(); + int64 padding_high = wd.padding_high(); + int64 base = lhs.base_shape().dimensions(lhs_dimension); + int64 window_count = 1 + (padding_low + padding_high + base - + (1 + (wd.size() - 1) * wd.window_dilation())) / + wd.stride(); + int64 rhs_shard_size_dilated = + (rhs_shard_size - 1) * wd.window_dilation() + 1; + + left_halo_size_functions[lhs_dimension] = + OffsetCalculation(MultiplyAddDivideOffsetCalculation( + lhs_shard_size - rhs_shard_size * wd.window_dilation(), padding_low, + 1)); + right_halo_size_functions[lhs_dimension] = + OffsetCalculation(MultiplyAddDivideOffsetCalculation( + rhs_shard_size_dilated - lhs_shard_size, + rhs_shard_size_dilated - lhs_shard_size + + wd.stride() * (window_count - 1) - padding_low, + 1)); + + // Exchange halo and concatenate. + int64 dim = dnums.input_spatial_dimensions(i); + int64 explicit_left_padding_on_full_shape = padding_low; + int64 shard_size_with_halo = + wd.stride() * (window_count - 1) + rhs_shard_size_dilated; + + new_window.mutable_dimensions(i)->set_padding_low(0); + new_window.mutable_dimensions(i)->set_padding_high(0); + new_window.mutable_dimensions(i)->set_size(rhs_shard_size); + + // offset_on_padded_shape and padded_full_shape_size are needed only if + // we want to mask out-of-range values in ExchangeHaloAndGetValidData(). + // Since the default value for both the collective-permute is zero and + // also we call PadWithValue() on both operands at the beginning, we + // don't need to mask here. 
+ // + // TODO(hyoulkee): Consider removing one of the two PadWithValue() calls + // if it's always safe. + auto offset_on_padded_shape = + OffsetCalculation(MultiplyAddDivideOffsetCalculation()); + int64 padded_full_shape_size = 0; + auto concat = ExchangeHaloAndGetValidData( + lhs_with_halo, lhs.base_shape(), left_halo_size_functions[dim], + right_halo_size_functions[dim], explicit_left_padding_on_full_shape, + padded_full_shape_size, shard_size_with_halo, dim, lhs.sharding(), + offset_on_padded_shape.Calculate(partition_ordinals[dim], &b_), zero, + partition_ordinals[dim], collective_ops_creator_, next_channel_id_, &b_, + /*mask_invalid_region=*/false); + if (!concat) { + return DefaultAction(hlo); + } + lhs_with_halo = *concat; + } + + SetPartitionedHlo(hlo, [&]() { + auto conv = b_.AddInstruction(HloInstruction::CreateConvolve( + hlo->shape(), lhs_with_halo, rhs.hlo(), hlo->feature_group_count(), + hlo->batch_group_count(), new_window, + hlo->convolution_dimension_numbers(), hlo->precision_config())); + auto ar = collective_ops_creator_.create_cross_partition_all_reduce( + &b_, conv, MakeBinaryAdd(hlo->shape().element_type(), module_), + NewChannel()); + ar->set_sharding(HloSharding::Replicate()); + return PartitionedHlo(ar, hlo->shape(), MakePartitioningState()) + .Reshard(hlo->sharding()) + .hlo(); + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleConvolution(HloInstruction* hlo) { + auto lhs = GetPartitionedHlo(hlo->operand(0)); + auto rhs = GetPartitionedHlo(hlo->operand(1)); + const HloSharding& sharding = hlo->sharding(); + const auto& dnums = hlo->convolution_dimension_numbers(); + std::vector rhs_to_lhs_indices(hlo->shape().rank()); + rhs_to_lhs_indices[dnums.kernel_output_feature_dimension()] = + dnums.input_batch_dimension(); + rhs_to_lhs_indices[dnums.kernel_input_feature_dimension()] = + dnums.input_feature_dimension(); + for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { + rhs_to_lhs_indices[dnums.kernel_spatial_dimensions(i)] = + dnums.input_spatial_dimensions(i); + } + std::vector lhs_to_rhs_indices(hlo->shape().rank()); + for (int64 i = 0; i < rhs_to_lhs_indices.size(); ++i) { + lhs_to_rhs_indices[rhs_to_lhs_indices[i]] = i; + } + auto aligned_rhs_sharding = + hlo_sharding_util::TransposeSharding(lhs.sharding(), rhs_to_lhs_indices); + auto aligned_lhs_sharding = + hlo_sharding_util::TransposeSharding(rhs.sharding(), lhs_to_rhs_indices); + + // Handling cases where both operands' shardings are aligned. We check that + // the LHS batch dimension is not partitioned because it is mapped to the + // output feature dimension in aligned_rhs_sharding, which are not the same + // dimension. + if (!lhs.sharding().IsTileMaximal() && !rhs.sharding().IsTileMaximal()) { + if (options_.conv_halo_exchange_always_on_lhs) { + return HandleConvolutionTiledLhsAndRhs(hlo); + } else { + // Reshard RHS so that each shard computes the partial sum of the full + // shape result, and add AllReduce. See HandleConvolutionTiledLhsAndRhs() + // that reshards LHS. + // + // The size of halo on each dimension can be calculated from the + // projection onto the RHS that shard i needs to read. RHS and LHS below + // refers to the shard size of RHS and LHS, WC is the number of windows, + // and D is the window dilation. 
+ // + // * offset(i): LHS * i + low_padding - (WC - 1) * stride + // * limit(i): LHS * (i + 1) + low_padding + // + // Since shard i has RHS of range [i * RHS * D, (i + 1) * RHS * D) + // * left-halo: i * RHS - offset(i) + // = i * (RHS * D - LHS) + (WC - 1) * stride - low_padding + // * right-halo: limit(i) - (i + 1) * RHS + // = (i + 1) * (LHS - RHS * D) + low_pading + + auto unsupported_sharding = [&](const HloSharding& lhs_sharding, + const HloSharding& rhs_sharding) { + // We currently don't support partitioning input batch or output feature + // dimensions. + return lhs_sharding.tile_assignment().dim( + dnums.input_batch_dimension()) != 1 || + rhs_sharding.tile_assignment().dim( + dnums.kernel_output_feature_dimension()) != 1; + }; + auto zero = b_.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::Zero(hlo->shape().element_type()))); + if (ShapeUtil::ByteSizeOf(lhs.base_shape()) < + ShapeUtil::ByteSizeOf(rhs.base_shape())) { + if (unsupported_sharding(aligned_lhs_sharding, rhs.sharding())) { + return DefaultAction(hlo); + } + lhs = lhs.Reshard(aligned_lhs_sharding).PadWithValue(zero); + rhs = rhs.PadWithValue(zero); + } else { + if (unsupported_sharding(lhs.sharding(), aligned_rhs_sharding)) { + return DefaultAction(hlo); + } + lhs = lhs.PadWithValue(zero); + rhs = rhs.Reshard(aligned_rhs_sharding).PadWithValue(zero); + } + + Window window = hlo->window(); + std::vector shard_counts(dnums.input_spatial_dimensions_size()); + std::vector lhs_shard_sizes(dnums.input_spatial_dimensions_size()); + std::vector rhs_shard_sizes(dnums.input_spatial_dimensions_size()); + for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { + int64 lhs_dimension = dnums.input_spatial_dimensions(i); + int64 rhs_dimension = dnums.kernel_spatial_dimensions(i); + int64 shard_count = rhs.sharding().tile_assignment().dim(rhs_dimension); + auto wd = window.dimensions(i); + if (wd.base_dilation() != 1 || wd.window_reversal()) { + return DefaultAction(hlo); + } + + int64 lhs_shard_size = CeilOfRatio( + lhs.base_shape().dimensions(lhs_dimension), shard_count); + int64 rhs_shard_size = CeilOfRatio( + rhs.base_shape().dimensions(rhs_dimension), shard_count); + shard_counts[i] = shard_count; + lhs_shard_sizes[i] = lhs_shard_size; + rhs_shard_sizes[i] = rhs_shard_size; + } + + std::vector left_halo_size_functions( + hlo->shape().rank()); + std::vector right_halo_size_functions( + hlo->shape().rank()); + Window new_window = window; + + // Data structures needed for Pad and DynamicSlice on LHS if needed. + bool need_dynamic_slice_lhs = false; + auto partition_ordinals = + MakeTiledPartitionOrdinals(lhs.sharding(), partition_id_, &b_); + std::vector zero_padding(hlo->shape().rank()); + PaddingConfig pad_config = + window_util::MakeSymmetricPadding(zero_padding); + auto zero_s32 = b_.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::Zero(S32))); + std::vector dynamic_slice_start_indices( + hlo->shape().rank(), zero_s32); + Shape dynamic_slice_shape = lhs.hlo()->shape(); + Shape pad_shape = lhs.hlo()->shape(); + + for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { + int64 lhs_dimension = dnums.input_spatial_dimensions(i); + int64 rhs_dimension = dnums.kernel_spatial_dimensions(i); + int64 lhs_shard_size = lhs_shard_sizes[i]; + int64 rhs_shard_size = rhs_shard_sizes[i]; + + if (shard_counts[i] == 1) { + continue; + } + + // Calculate the left and right halo sizes as described in the comments + // above. 
It calculates the halo sizes with dilation, so we apply + // CeilOfRatio({left,right}_halo_size, window_dilation). + auto wd = window.dimensions(i); + int64 padding_low = wd.padding_low(); + int64 padding_high = wd.padding_high(); + int64 base = lhs.base_shape().dimensions(lhs_dimension); + int64 window_count = + 1 + (padding_low + padding_high + base - + (1 + (wd.size() - 1) * wd.window_dilation())) / + wd.stride(); + left_halo_size_functions[rhs_dimension] = + OffsetCalculation(MultiplyAddDivideOffsetCalculation( + rhs_shard_size * wd.window_dilation() - lhs_shard_size, + (window_count - 1) * wd.stride() - padding_low + + wd.window_dilation() - 1, + wd.window_dilation())); + right_halo_size_functions[rhs_dimension] = + OffsetCalculation(MultiplyAddDivideOffsetCalculation( + lhs_shard_size - rhs_shard_size * wd.window_dilation(), + lhs_shard_size - rhs_shard_size * wd.window_dilation() + + padding_low + wd.window_dilation() - 1, + wd.window_dilation())); + + // New RHS window size includes the maximum of both left and right + // halos. + int64 halo_size = left_halo_size_functions[rhs_dimension].MaxInRange( + 1, shard_counts[i]) + + right_halo_size_functions[rhs_dimension].MaxInRange( + 0, shard_counts[i] - 1); + int64 new_window_size = + rhs.hlo()->shape().dimensions(rhs_dimension) + halo_size; + + // The amount of new low padding could be dynamic (e.g., window_dilation + // != 1), which requires pad (to the maximum) and dynamic slice on LHS. + // + // If we consider the first window, the offset of the dilated RHS that + // aligns with the first valid LHS element for shard i is 'padding_low + + // LHS * i'. When the left halo is added to RHS, the offset of the first + // RHS element is (RHS * i - left_halo) * window_dilation. The + // difference between the two values is the amount of padding_low we + // need on LHS. + auto new_padding_low_function = + OffsetCalculation( + HloOpcode::kMultiply, left_halo_size_functions[rhs_dimension], + OffsetCalculation(MultiplyAddDivideOffsetCalculation( + 0, wd.window_dilation(), 1))) - + OffsetCalculation(MultiplyAddDivideOffsetCalculation( + rhs_shard_size * wd.window_dilation() - lhs_shard_size, + -padding_low, 1)); + + int64 new_padding_low_max = + new_padding_low_function.MaxInRange(0, shard_counts[i]); + int64 new_padding_low = new_padding_low_max; + int64 new_padding_high = window_count * wd.stride() + + (new_window_size - 1) * wd.window_dilation() - + new_padding_low - lhs_shard_size; + + // We do pad/dynamic-slice only when the padding is dynamic. + if (!new_padding_low_function.IsConstant()) { + need_dynamic_slice_lhs = true; + new_padding_low = 0; + pad_config.mutable_dimensions(lhs_dimension) + ->set_edge_padding_low(new_padding_low_max); + pad_config.mutable_dimensions(lhs_dimension) + ->set_edge_padding_high(new_padding_low_max); + pad_shape.set_dimensions(lhs_dimension, + lhs_shard_size + 2 * new_padding_low_max); + dynamic_slice_start_indices[lhs_dimension] = + (OffsetCalculation(MultiplyAddDivideOffsetCalculation( + 0, new_padding_low_max, 1)) - + new_padding_low_function) + .Calculate(partition_ordinals[lhs_dimension], &b_); + dynamic_slice_shape.set_dimensions( + lhs_dimension, lhs_shard_size + new_padding_low_max); + } + + // Since the convolution RHS operand size increased with halos, adjust + // the window config accordingly.
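+      // As a concrete illustration (hypothetical sizes, not taken from any
+      // real model): with window_dilation D = 1, stride 1, no padding, a base
+      // LHS spatial size of 12, a base RHS (kernel) size of 6 and 2
+      // partitions, we get lhs_shard_size = 6, rhs_shard_size = 3 and
+      // window_count = 7. Then
+      //   left_halo(i)  = (i * (3 * 1 - 6) + (7 - 1) * 1 - 0 + 1 - 1) / 1 = 6 - 3i
+      //   right_halo(i) = (i * (6 - 3 * 1) + (6 - 3 * 1) + 0 + 1 - 1) / 1 = 3i + 3
+      // so halo_size = left_halo(1) + right_halo(0) = 3 + 3 = 6 and the new
+      // RHS window size is 3 + 6 = 9. new_padding_low(i) is the constant 6
+      // here, so no pad/dynamic-slice on LHS is needed, and new_padding_high
+      // = 7 + 8 - 6 - 6 = 3; each shard then produces the full 7-element
+      // output, and the partial results are summed by the all-reduce below.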
+ new_window.mutable_dimensions(i)->set_padding_low(new_padding_low); + new_window.mutable_dimensions(i)->set_padding_high(new_padding_high); + new_window.mutable_dimensions(i)->set_size( + rhs.hlo()->shape().dimensions(rhs_dimension) + halo_size); + } + + HloInstruction* conv_lhs = lhs.hlo(); + if (need_dynamic_slice_lhs) { + auto pad = b_.AddInstruction( + HloInstruction::CreatePad(pad_shape, lhs.hlo(), zero, pad_config)); + conv_lhs = b_.AddInstruction(HloInstruction::CreateDynamicSlice( + dynamic_slice_shape, pad, dynamic_slice_start_indices, + dynamic_slice_shape.dimensions())); + } + + // Exchange halo and concatenate. + HloInstruction* rhs_with_halo = rhs.hlo(); + for (int i = 0; i < dnums.kernel_spatial_dimensions_size(); ++i) { + int64 dim = dnums.kernel_spatial_dimensions(i); + int64 explicit_left_padding_on_full_shape = + left_halo_size_functions[dim].Calculate(0); + int64 shard_size_with_halo = new_window.dimensions(i).size(); + + // offset_on_padded_shape and padded_full_shape_size are needed only if + // we want to mask out-of-range values in ExchangeHaloAndGetValidData(). + // Since the default value for both the collective-permute is zero and + // also we call PadWithValue() on both operands at the beginning, we + // don't need to mask here. + // + // TODO(hyoulkee): Consider removing one of the two PadWithValue() calls + // if it's always safe. + auto offset_on_padded_shape = + OffsetCalculation(MultiplyAddDivideOffsetCalculation( + rhs_shard_sizes[i], explicit_left_padding_on_full_shape, 1)) - + left_halo_size_functions[dim]; + int64 padded_full_shape_size = + offset_on_padded_shape.Calculate(shard_counts[i] - 1) + + new_window.dimensions(i).size(); + auto concat = ExchangeHaloAndGetValidData( + rhs_with_halo, rhs.base_shape(), left_halo_size_functions[dim], + right_halo_size_functions[dim], explicit_left_padding_on_full_shape, + padded_full_shape_size, shard_size_with_halo, dim, rhs.sharding(), + offset_on_padded_shape.Calculate(partition_ordinals[dim], &b_), + zero, partition_ordinals[dim], collective_ops_creator_, + next_channel_id_, &b_, /*mask_invalid_region=*/false); + if (!concat) { + return DefaultAction(hlo); + } + rhs_with_halo = *concat; + } + + SetPartitionedHlo(hlo, [&]() { + auto conv = b_.AddInstruction(HloInstruction::CreateConvolve( + hlo->shape(), conv_lhs, rhs_with_halo, hlo->feature_group_count(), + hlo->batch_group_count(), new_window, dnums, + hlo->precision_config())); + auto ar = collective_ops_creator_.create_cross_partition_all_reduce( + &b_, conv, MakeBinaryAdd(hlo->shape().element_type(), module_), + NewChannel()); + ar->set_sharding(HloSharding::Replicate()); + return PartitionedHlo(ar, hlo->shape(), MakePartitioningState()) + .Reshard(hlo->sharding()) + .hlo(); + }); + return Status::OK(); + } + } + + if (!sharding.IsTileMaximal()) { + // We don't currently support sharding on output feature dimension. + if (sharding.tile_assignment().dim(dnums.output_feature_dimension()) > 1) { + return DefaultAction(hlo); + } + + // Check if the operand and the output sharding are aligned. 
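+    // For example (hypothetical dimension numbers): for an NHWC convolution
+    // with input_batch=0, input_spatial={1,2}, input_feature=3 and matching
+    // output dimension numbers, input_to_output_indices below is simply
+    // {0, 1, 2, 3}; if the input and output layouts differ, the vector
+    // permutes accordingly, and TransposeSharding() maps the output sharding
+    // onto the corresponding LHS dimensions.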
+ std::vector input_to_output_indices(hlo->shape().rank()); + input_to_output_indices[dnums.input_batch_dimension()] = + dnums.output_batch_dimension(); + input_to_output_indices[dnums.input_feature_dimension()] = + dnums.output_feature_dimension(); + for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { + input_to_output_indices[dnums.input_spatial_dimensions(i)] = + dnums.output_spatial_dimensions(i); + } + auto target_operand_sharding = + hlo_sharding_util::TransposeSharding(sharding, input_to_output_indices); + lhs = lhs.Reshard(target_operand_sharding); + + // Replicate the RHS. + rhs = rhs.Reshard(HloSharding::Replicate()); + + // Convolution window config does not include batch and feature dimensions, + // whereas ReshardAsWindowedInput() expects the same number of window + // dimensions as the rank of the operand. So add two more trivial + // dimensions. + std::vector ones(hlo->shape().rank(), 1); + auto operand_window = window_util::MakeWindow(ones); + for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { + *operand_window.mutable_dimensions(dnums.input_spatial_dimensions(i)) = + hlo->window().dimensions(i); + } + + auto zero = b_.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::Zero(hlo->shape().element_type()))); + auto resharded_operand_and_window = lhs.ReshardAsWindowedInput( + operand_window, target_operand_sharding, zero); + if (!resharded_operand_and_window.has_value()) { + return DefaultAction(hlo); + } + Window new_window; + for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { + *new_window.add_dimensions() = + resharded_operand_and_window->shard_window.dimensions( + dnums.input_spatial_dimensions(i)); + } + TF_ASSIGN_OR_RETURN( + Shape sharded_conv_shape, + ShapeInference::InferConvolveShape( + resharded_operand_and_window->sharded_input->shape(), + rhs.hlo()->shape(), hlo->feature_group_count(), + hlo->batch_group_count(), new_window, dnums)); + auto shard_shape = MakePartitionedShape(hlo->shape(), hlo->sharding()); + *sharded_conv_shape.mutable_layout() = shard_shape.layout(); + SetPartitionedHlo(hlo, [&]() { + auto sharded_conv = b_.AddInstruction(HloInstruction::CreateConvolve( + sharded_conv_shape, resharded_operand_and_window->sharded_input, + rhs.hlo(), hlo->feature_group_count(), hlo->batch_group_count(), + new_window, dnums, hlo->precision_config())); + if (!resharded_operand_and_window->dynamic_slice_index_on_output + .has_value()) { + CHECK(ShapeUtil::Compatible(shard_shape, sharded_conv->shape())); + return sharded_conv; + } + return b_.AddInstruction(HloInstruction::CreateDynamicSlice( + shard_shape, sharded_conv, + *resharded_operand_and_window->dynamic_slice_index_on_output, + shard_shape.dimensions())); + }); + return Status::OK(); + } + return DefaultAction(hlo); +} + +Status SpmdPartitioningVisitor::HandleDot(HloInstruction* hlo) { + DotGeneralDimsMapping mapping; + const auto& dnums = hlo->dot_dimension_numbers(); + int64 next_output_dim = 0; + for (int64 i = 0; i < dnums.lhs_batch_dimensions_size(); ++i) { + mapping.batch_dims.emplace_back(); + mapping.batch_dims.back().lhs = dnums.lhs_batch_dimensions(i); + mapping.batch_dims.back().rhs = dnums.rhs_batch_dimensions(i); + mapping.batch_dims.back().output = next_output_dim++; + } + for (int64 i = 0; i < dnums.lhs_contracting_dimensions_size(); ++i) { + mapping.contracting_dims.emplace_back(); + mapping.contracting_dims.back().lhs = dnums.lhs_contracting_dimensions(i); + mapping.contracting_dims.back().rhs = dnums.rhs_contracting_dimensions(i); + 
mapping.contracting_dims.back().output = -1; + } + for (int64 i = 0; i < hlo->operand(0)->shape().rank(); ++i) { + if (absl::c_linear_search(dnums.lhs_batch_dimensions(), i) || + absl::c_linear_search(dnums.lhs_contracting_dimensions(), i)) { + continue; + } + mapping.lhs_non_contracting_dims.emplace_back(); + mapping.lhs_non_contracting_dims.back().lhs = i; + mapping.lhs_non_contracting_dims.back().rhs = -1; + mapping.lhs_non_contracting_dims.back().output = next_output_dim++; + } + for (int64 i = 0; i < hlo->operand(1)->shape().rank(); ++i) { + if (absl::c_linear_search(dnums.rhs_batch_dimensions(), i) || + absl::c_linear_search(dnums.rhs_contracting_dimensions(), i)) { + continue; + } + mapping.rhs_non_contracting_dims.emplace_back(); + mapping.rhs_non_contracting_dims.back().lhs = -1; + mapping.rhs_non_contracting_dims.back().rhs = i; + mapping.rhs_non_contracting_dims.back().output = next_output_dim++; + } + auto create_sharded_dot = [&](HloInstruction* l, HloInstruction* r, + SpmdBuilder* b) -> StatusOr<HloInstruction*> { + TF_ASSIGN_OR_RETURN( + auto sharded_dot_shape, + ShapeInference::InferDotOpShape(l->shape(), r->shape(), + hlo->dot_dimension_numbers())); + return b->AddInstruction(HloInstruction::CreateDot( + sharded_dot_shape, l, r, hlo->dot_dimension_numbers(), + hlo->precision_config())); + }; + return HandleDotHelper(hlo, mapping, create_sharded_dot); +} + +Status SpmdPartitioningVisitor::HandleDotHelper( + HloInstruction* hlo, const DotGeneralDimsMapping& dims_mapping, + const std::function<StatusOr<HloInstruction*>( + HloInstruction*, HloInstruction*, SpmdBuilder*)>& create_sharded_dot) { + const HloSharding& lhs_sharding = hlo->operand(0)->sharding(); + const HloSharding& rhs_sharding = hlo->operand(1)->sharding(); + + // Similar to hlo_sharding_util::TransposeSharding(), but allows + // removing/adding non-partitioned dimensions.
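+  // For a hypothetical batched matmul lhs [B, M, K], rhs [B, K, N],
+  // output [B, M, N], the mapping built in HandleDot() yields
+  //   lhs_to_rhs_indices    = {0, -1, 1},  rhs_to_lhs_indices    = {0, 2, -1},
+  //   lhs_to_output_indices = {0, 1, -1},  output_to_lhs_indices = {0, 1, -1}.
+  // With an LHS tile assignment of shape [2, 1, 4] (2-way on batch, 4-way on
+  // K), transpose_sharding() below produces a [2, 4, 1] tiling when matching
+  // the RHS, and returns nullopt when matching the output because the
+  // partitioned K dimension has no output counterpart.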
+ auto transpose_sharding = + [&](const HloSharding& source, absl::Span src_to_tgt, + absl::Span tgt_to_src) -> absl::optional { + if (source.IsTileMaximal()) { + return source; + } + std::vector tgt_dims_skipping_new(tgt_to_src.size(), -1); + int64 skipped_tgt_dims = 0; + for (int64 i = 0; i < tgt_to_src.size(); ++i) { + if (tgt_to_src[i] < 0) { + skipped_tgt_dims++; + } else { + tgt_dims_skipping_new[i] = i - skipped_tgt_dims; + } + } + int64 skipped_src_dims = absl::c_count(src_to_tgt, -1); + std::vector perm(src_to_tgt.size()); + for (int64 i = 0; i < src_to_tgt.size(); ++i) { + if (src_to_tgt[i] < 0) { + if (source.tile_assignment().dim(i) > 1) { + return absl::nullopt; + } + perm[src_to_tgt.size() - skipped_src_dims] = i; + skipped_src_dims--; + } else { + perm[tgt_dims_skipping_new[src_to_tgt[i]]] = i; + } + } + auto tgt_sharding = hlo_sharding_util::TransposeSharding(source, perm); + if (skipped_tgt_dims == 0) { + return tgt_sharding; + } + auto reshape_tiles = tgt_sharding.tile_assignment(); + std::vector tgt_tiles(tgt_to_src.size(), 1); + for (int64 i = 0; i < tgt_tiles.size(); ++i) { + if (tgt_to_src[i] >= 0) { + tgt_tiles[i] = reshape_tiles.dim(tgt_dims_skipping_new[i]); + } + } + reshape_tiles.Reshape(tgt_tiles); + return HloSharding::Tile(reshape_tiles); + }; + + std::vector lhs_to_rhs_indices(hlo->operand(0)->shape().rank(), -1); + std::vector lhs_to_output_indices(hlo->operand(0)->shape().rank(), -1); + std::vector rhs_to_lhs_indices(hlo->operand(1)->shape().rank(), -1); + std::vector rhs_to_output_indices(hlo->operand(1)->shape().rank(), -1); + std::vector output_to_lhs_indices(hlo->shape().rank(), -1); + std::vector output_to_rhs_indices(hlo->shape().rank(), -1); + auto populate_indices_mapping = + [&](const DotGeneralDimsMapping::DimsMapping& mapping) { + if (mapping.lhs >= 0) { + lhs_to_rhs_indices[mapping.lhs] = mapping.rhs; + lhs_to_output_indices[mapping.lhs] = mapping.output; + } + if (mapping.rhs >= 0) { + rhs_to_lhs_indices[mapping.rhs] = mapping.lhs; + rhs_to_output_indices[mapping.rhs] = mapping.output; + } + if (mapping.output >= 0) { + output_to_lhs_indices[mapping.output] = mapping.lhs; + output_to_rhs_indices[mapping.output] = mapping.rhs; + } + }; + for (const auto& mapping : dims_mapping.batch_dims) { + populate_indices_mapping(mapping); + } + for (const auto& mapping : dims_mapping.contracting_dims) { + populate_indices_mapping(mapping); + } + for (const auto& mapping : dims_mapping.lhs_non_contracting_dims) { + populate_indices_mapping(mapping); + } + for (const auto& mapping : dims_mapping.rhs_non_contracting_dims) { + populate_indices_mapping(mapping); + } + auto lhs_sharding_transposed_to_match_rhs = + transpose_sharding(lhs_sharding, lhs_to_rhs_indices, rhs_to_lhs_indices); + auto rhs_sharding_transposed_to_match_lhs = + transpose_sharding(rhs_sharding, rhs_to_lhs_indices, lhs_to_rhs_indices); + auto lhs_sharding_transposed_to_match_output = transpose_sharding( + lhs_sharding, lhs_to_output_indices, output_to_lhs_indices); + auto rhs_sharding_transposed_to_match_output = transpose_sharding( + rhs_sharding, rhs_to_output_indices, output_to_rhs_indices); + auto output_sharding_transposed_to_match_lhs = transpose_sharding( + hlo->sharding(), output_to_lhs_indices, lhs_to_output_indices); + auto output_sharding_transposed_to_match_rhs = transpose_sharding( + hlo->sharding(), output_to_rhs_indices, rhs_to_output_indices); + + // lhs_rhs_or_output: 0 lhs, 1 rhs, 2 output. 
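+  // Continuing the hypothetical [B, M, K] x [B, K, N] example above with an
+  // LHS tiling of [2, 1, 4]: get_partitions_for_dims() returns 2 for the LHS
+  // batch dims, 4 for the LHS contracting dims, and 1 for the LHS
+  // non-contracting dims.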
+ auto get_partitions_for_dims = + [&](const HloSharding& sharding, + absl::Span dims, + int lhs_rhs_or_output) { + int64 partitions = 1; + if (sharding.IsTileMaximal()) { + return partitions; + } + for (const auto& dim : dims) { + if (lhs_rhs_or_output == 0) { + partitions *= sharding.tile_assignment().dim(dim.lhs); + } else if (lhs_rhs_or_output == 1) { + partitions *= sharding.tile_assignment().dim(dim.rhs); + } else { + CHECK_EQ(lhs_rhs_or_output, 2); + partitions *= sharding.tile_assignment().dim(dim.output); + } + } + return partitions; + }; + const int64 lhs_batch_partitions = + get_partitions_for_dims(lhs_sharding, dims_mapping.batch_dims, 0); + const int64 rhs_batch_partitions = + get_partitions_for_dims(rhs_sharding, dims_mapping.batch_dims, 1); + const int64 output_batch_partitions = + get_partitions_for_dims(hlo->sharding(), dims_mapping.batch_dims, 2); + const int64 lhs_contracting_partitions = + get_partitions_for_dims(lhs_sharding, dims_mapping.contracting_dims, 0); + const int64 rhs_contracting_partitions = + get_partitions_for_dims(rhs_sharding, dims_mapping.contracting_dims, 1); + const int64 lhs_non_contracting_partitions = get_partitions_for_dims( + lhs_sharding, dims_mapping.lhs_non_contracting_dims, 0); + const int64 rhs_non_contracting_partitions = get_partitions_for_dims( + rhs_sharding, dims_mapping.rhs_non_contracting_dims, 1); + const int64 output_lhs_non_contracting_partitions = get_partitions_for_dims( + hlo->sharding(), dims_mapping.lhs_non_contracting_dims, 2); + const int64 output_rhs_non_contracting_partitions = get_partitions_for_dims( + hlo->sharding(), dims_mapping.rhs_non_contracting_dims, 2); + + auto& lhs = GetPartitionedHlo(hlo->operand(0)); + auto& rhs = GetPartitionedHlo(hlo->operand(1)); + // LHS and RHS are partitioned the same way and only partitioned in batch + // dimensions. + if (lhs_batch_partitions == rhs_batch_partitions && + rhs_batch_partitions == num_partitions_ && + lhs_sharding_transposed_to_match_rhs == rhs_sharding) { + TF_ASSIGN_OR_RETURN(auto dot, + create_sharded_dot(lhs.hlo(), rhs.hlo(), &b_)); + SetPartitionedHlo(hlo, [&] { + dot->set_sharding(*lhs_sharding_transposed_to_match_output); + return PartitionedHlo(dot, hlo->shape(), MakePartitioningState()) + .Reshard(hlo->sharding()) + .hlo(); + }); + return Status::OK(); + } + + // Try emit batch-partitioned einsum with one operand resharded. Returns + // whether the attempt succeeds. If may_reshard_with_allreduce is false, + // reshard must be done using all-to-all; otherwise this attempt fails. + auto try_emit_output_batch_partitioned_einsum_with_reshard = + [&](bool may_reshard_with_allreduce) -> StatusOr { + // LHS and output are batch partitioned in the same way. + if (lhs_batch_partitions == num_partitions_ && + output_batch_partitions == num_partitions_ && + lhs_sharding_transposed_to_match_output == hlo->sharding()) { + if (!may_reshard_with_allreduce && + !CanReshardWithAllToAll(rhs.sharding(), + *lhs_sharding_transposed_to_match_rhs)) { + return false; + } + auto resharded_rhs = rhs.Reshard(*lhs_sharding_transposed_to_match_rhs); + TF_ASSIGN_OR_RETURN( + auto dot, create_sharded_dot(lhs.hlo(), resharded_rhs.hlo(), &b_)); + SetPartitionedHlo(hlo, [&] { return dot; }); + return true; + } + // RHS and output are batch partitioned in the same way. 
+ if (rhs_batch_partitions == num_partitions_ && + output_batch_partitions == num_partitions_ && + rhs_sharding_transposed_to_match_output == hlo->sharding()) { + if (!may_reshard_with_allreduce && + !CanReshardWithAllToAll(lhs.sharding(), + *rhs_sharding_transposed_to_match_lhs)) { + return false; + } + auto resharded_lhs = lhs.Reshard(*rhs_sharding_transposed_to_match_lhs); + TF_ASSIGN_OR_RETURN( + auto dot, create_sharded_dot(resharded_lhs.hlo(), rhs.hlo(), &b_)); + SetPartitionedHlo(hlo, [&] { return dot; }); + return true; + } + return false; + }; + + { + // Try batch-parallel by resharding one operand, and not using all-reduce. + TF_ASSIGN_OR_RETURN( + bool emitted, + try_emit_output_batch_partitioned_einsum_with_reshard(false)); + if (emitted) { + return Status::OK(); + } + } + + // Try to emit windowed DotGeneral when one operand is partitioned in the same + // way as the output along non-contracting dimensions, but the other operand + // is tiled in other dimensions. + auto emit_windowed_dot_general = [&](int64 matching_operand, + int64 windowing_operand, + bool windowed_at_contracting_dims, + bool windowed_at_batch_dims) { + CHECK_EQ(matching_operand + windowing_operand, 1); + CHECK(!windowed_at_batch_dims || !windowed_at_contracting_dims); + auto unpadded_result_buffer_shape = + MakePartitionedShape(hlo->shape(), hlo->sharding()); + auto padded_result_buffer_shape = unpadded_result_buffer_shape; + // For windowing at batch/non-contracting dims, we produce the result one + // partition at a time, so we need to pad the shape in case of uneven + // partitioning in order to make dynamic-update-slice in-bound. + if (!windowed_at_contracting_dims) { + padded_result_buffer_shape = GetPaddedShapeForUnevenPartitioning( + padded_result_buffer_shape, + windowing_operand == 0 ? *lhs_sharding_transposed_to_match_output + : *rhs_sharding_transposed_to_match_output); + } + // Mask the padding area of the windowed operand with zero if there is + // uneven partitioning. + if (windowed_at_contracting_dims) { + auto& to_mask = windowing_operand == 0 ? lhs : rhs; + to_mask = + to_mask.PadWithValue(b_.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::Zero(hlo->shape().element_type())))); + } + auto result_buffer = CreateZero(padded_result_buffer_shape, &b_); + auto iteration = b_.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0))); + + // Create a while loop that computes one window per iteration. During each + // iteration, each partition sends its input window to its neighbor using + // collective-permute for the next iteration. 
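+    // For instance (hypothetical setup with 4 partitions): partition p starts
+    // with its own shard of the windowed operand; at iteration i it holds the
+    // shard that originally lived on partition (p + i) mod 4, since
+    // data_partition_id below is (iteration + partition_id) mod
+    // num_partitions and the collective-permute shifts each shard from
+    // source s to s - 1. When windowed at contracting dims the partial dots
+    // are accumulated into the result buffer; otherwise each iteration writes
+    // its slice via dynamic-update-slice at offsets derived from
+    // data_partition_id.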
+ SpmdBuilder body_b("windowed_dot_general_body", visiting_hlo_); + auto param = body_b.AddInstruction(HloInstruction::CreateParameter( + /*parameter_number=*/0, + ShapeUtil::MakeTupleShape({lhs.hlo()->shape(), rhs.hlo()->shape(), + result_buffer->shape(), iteration->shape()}), + "param")); + auto l = body_b.AddInstruction( + HloInstruction::CreateGetTupleElement(lhs.hlo()->shape(), param, 0)); + auto r = body_b.AddInstruction( + HloInstruction::CreateGetTupleElement(rhs.hlo()->shape(), param, 1)); + auto o = body_b.AddInstruction(HloInstruction::CreateGetTupleElement( + result_buffer->shape(), param, 2)); + auto i = body_b.AddInstruction( + HloInstruction::CreateGetTupleElement(iteration->shape(), param, 3)); + + auto partition_id = collective_ops_creator_.create_partition_id(&body_b); + auto data_partition_id = body_b.AddInstruction(HloInstruction::CreateBinary( + i->shape(), HloOpcode::kAdd, i, partition_id)); + auto partition_count = body_b.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR0(num_partitions_))); + data_partition_id = body_b.AddInstruction(HloInstruction::CreateBinary( + i->shape(), HloOpcode::kRemainder, data_partition_id, partition_count)); + auto dot_lhs = l; + auto dot_rhs = r; + if (windowed_at_contracting_dims || windowed_at_batch_dims) { + // Slice the matching operand according to the partitioned contracting + // dimensions on the windowed operand. We do this by treating the matching + // operand as replicated, and resharding it to match the windowed operand. + auto slice_operand = matching_operand == 0 ? l : r; + slice_operand->set_sharding(HloSharding::Replicate()); + auto state = MakePartitioningState(); + state.b = &body_b; + state.partition_id = data_partition_id; + auto slice = PartitionedHlo(slice_operand, slice_operand->shape(), state) + .Reshard(windowing_operand == 0 + ? *lhs_sharding_transposed_to_match_rhs + : *rhs_sharding_transposed_to_match_lhs) + .hlo(); + slice_operand->clear_sharding(); + if (matching_operand == 0) { + dot_lhs = slice; + } else { + dot_rhs = slice; + } + } + TF_ASSIGN_OR_RETURN(auto dot, + create_sharded_dot(dot_lhs, dot_rhs, &body_b)); + if (windowed_at_contracting_dims) { + // Accumulate the partial output to the result buffer. + o = body_b.AddInstruction( + HloInstruction::CreateBinary(o->shape(), HloOpcode::kAdd, o, dot)); + } else { + // The windowing operand is partitioned along batch/non-contracting + // dimensions, so we need a dynamic-update-slice to save the partial + // output in the result buffer. + auto offsets = MakePartitionOffsets( + o->shape(), + windowing_operand == 0 ? *lhs_sharding_transposed_to_match_output + : *rhs_sharding_transposed_to_match_output, + data_partition_id, &body_b); + o = body_b.AddInstruction(HloInstruction::CreateDynamicUpdateSlice( + o->shape(), o, dot, offsets)); + } + + // ++i + i = body_b.AddInstruction(HloInstruction::CreateBinary( + i->shape(), HloOpcode::kAdd, i, + body_b.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1))))); + auto has_more = body_b.AddInstruction(HloInstruction::CreateCompare( + ShapeUtil::MakeShape(PRED, {}), i, + body_b.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR0(num_partitions_))), + ComparisonDirection::kLt)); + // Collective-permute for the next window. We don't need it for the last + // iteration, so we use a conditional around the collective-permute. 
+ HloInstruction* conditional; + { + SpmdBuilder cp_b("window_collective_permute", visiting_hlo_); + { + auto p = cp_b.AddInstruction(HloInstruction::CreateParameter( + 0, windowing_operand == 0 ? l->shape() : r->shape(), "window")); + std::vector> sd_pairs(num_partitions_); + for (int64 source = 0; source < num_partitions_; ++source) { + // 0 -> n-1, 1 -> 0, 2 -> 1, ... + sd_pairs[source] = {source, + (source - 1 + num_partitions_) % num_partitions_}; + } + collective_ops_creator_.create_cross_partition_collective_permute( + &cp_b, p, sd_pairs, (*next_channel_id_)++); + } + SpmdBuilder ncp_b("last_iteration_noop", visiting_hlo_); + { + ncp_b.AddInstruction(HloInstruction::CreateParameter( + 0, windowing_operand == 0 ? l->shape() : r->shape(), "window")); + } + conditional = body_b.AddInstruction(HloInstruction::CreateConditional( + windowing_operand == 0 ? l->shape() : r->shape(), has_more, + windowing_operand == 0 ? l : r, + module_->AddEmbeddedComputation(cp_b.Build()), + windowing_operand == 0 ? l : r, + module_->AddEmbeddedComputation(ncp_b.Build()))); + } + if (windowing_operand == 0) { + l = conditional; + } else { + r = conditional; + } + body_b.AddInstruction(HloInstruction::CreateTuple({l, r, o, i})); + + SpmdBuilder cond_b("windowed_dot_general_cond", visiting_hlo_); + auto cond_param = cond_b.AddInstruction(HloInstruction::CreateParameter( + /*parameter_number=*/0, + ShapeUtil::MakeTupleShape({lhs.hlo()->shape(), rhs.hlo()->shape(), + result_buffer->shape(), iteration->shape()}), + "param")); + auto cond_i = cond_b.AddInstruction(HloInstruction::CreateGetTupleElement( + iteration->shape(), cond_param, 3)); + cond_b.AddInstruction(HloInstruction::CreateCompare( + ShapeUtil::MakeShape(PRED, {}), cond_i, + cond_b.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR0(num_partitions_))), + ComparisonDirection::kLt)); + auto while_loop = b_.AddInstruction(HloInstruction::CreateWhile( + cond_param->shape(), module_->AddEmbeddedComputation(cond_b.Build()), + module_->AddEmbeddedComputation(body_b.Build()), + b_.AddInstruction(HloInstruction::CreateTuple( + {lhs.hlo(), rhs.hlo(), result_buffer, iteration})))); + windowed_dot_general_loops_.push_back({while_loop, windowing_operand, + windowed_at_contracting_dims, + windowed_at_batch_dims}); + SetPartitionedHlo(hlo, [&] { + auto result = b_.AddInstruction(HloInstruction::CreateGetTupleElement( + result_buffer->shape(), while_loop, 2)); + if (!ShapeUtil::Compatible(padded_result_buffer_shape, + unpadded_result_buffer_shape)) { + result = b_.AddInstruction(HloInstruction::CreateSlice( + unpadded_result_buffer_shape, result, + std::vector(padded_result_buffer_shape.rank(), 0), + unpadded_result_buffer_shape.dimensions(), + std::vector(padded_result_buffer_shape.rank(), 1))); + } + return result; + }); + return Status::OK(); + }; + if (output_lhs_non_contracting_partitions == num_partitions_ && + output_sharding_transposed_to_match_lhs == lhs_sharding && + ShapeUtil::ByteSizeOf(hlo->operand(1)->shape()) >= + options_.threshold_for_windowed_einsum_mib * 1024 * 1024) { + if (rhs_contracting_partitions == num_partitions_) { + return emit_windowed_dot_general(0, 1, true, false); + } + if (rhs_non_contracting_partitions == num_partitions_) { + return emit_windowed_dot_general(0, 1, false, false); + } + if (rhs_batch_partitions == num_partitions_) { + return emit_windowed_dot_general(0, 1, false, true); + } + } + if (output_rhs_non_contracting_partitions == num_partitions_ && + output_sharding_transposed_to_match_rhs == 
rhs_sharding && + ShapeUtil::ByteSizeOf(hlo->operand(0)->shape()) >= + options_.threshold_for_windowed_einsum_mib * 1024 * 1024) { + if (lhs_contracting_partitions == num_partitions_) { + return emit_windowed_dot_general(1, 0, true, false); + } + if (lhs_non_contracting_partitions == num_partitions_) { + return emit_windowed_dot_general(1, 0, false, false); + } + if (lhs_batch_partitions == num_partitions_) { + return emit_windowed_dot_general(1, 0, false, true); + } + } + + { + // Try batch-parallel by resharding one operand, and allowing all-reduce. + TF_ASSIGN_OR_RETURN( + bool emitted, + try_emit_output_batch_partitioned_einsum_with_reshard(true)); + if (emitted) { + return Status::OK(); + } + } + + // LHS and RHS have the same partitioned contracting dimensions. + if (lhs_contracting_partitions == rhs_contracting_partitions && + lhs_contracting_partitions == num_partitions_) { + auto zero = b_.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::Zero(hlo->shape().element_type()))); + // Pad both sides with zero, since NaN at one side cannot be masked by zero + // on the other side. + if (ShapeUtil::ByteSizeOf(lhs.base_shape()) < + ShapeUtil::ByteSizeOf(rhs.base_shape())) { + lhs = + lhs.Reshard(*rhs_sharding_transposed_to_match_lhs).PadWithValue(zero); + rhs = rhs.PadWithValue(zero); + } else { + lhs = lhs.PadWithValue(zero); + rhs = + rhs.Reshard(*lhs_sharding_transposed_to_match_rhs).PadWithValue(zero); + } + TF_ASSIGN_OR_RETURN(auto dot, + create_sharded_dot(lhs.hlo(), rhs.hlo(), &b_)); + SetPartitionedHlo(hlo, [&] { + auto ar = collective_ops_creator_.create_cross_partition_all_reduce( + &b_, dot, MakeBinaryAdd(hlo->shape().element_type(), module_), + NewChannel()); + ar->set_sharding(HloSharding::Replicate()); + return PartitionedHlo(ar, hlo->shape(), MakePartitioningState()) + .Reshard(hlo->sharding()) + .hlo(); + }); + return Status::OK(); + } + + // LHS and output have the same partitioned non-contracting dimensions. + if (lhs_non_contracting_partitions == num_partitions_ && + output_lhs_non_contracting_partitions == num_partitions_ && + lhs_sharding == hlo->sharding()) { + auto rhs_replicated = rhs.Reshard(HloSharding::Replicate()).hlo(); + TF_ASSIGN_OR_RETURN(auto dot, + create_sharded_dot(lhs.hlo(), rhs_replicated, &b_)); + SetPartitionedHlo(hlo, [&] { return dot; }); + return Status::OK(); + } + + // RHS and output have the same partitioned non-contracting dimensions. + if (rhs_non_contracting_partitions == num_partitions_ && + output_rhs_non_contracting_partitions == num_partitions_ && + rhs_sharding_transposed_to_match_output == hlo->sharding()) { + auto lhs_replicated = lhs.Reshard(HloSharding::Replicate()).hlo(); + TF_ASSIGN_OR_RETURN(auto dot, + create_sharded_dot(lhs_replicated, rhs.hlo(), &b_)); + SetPartitionedHlo(hlo, [&] { return dot; }); + return Status::OK(); + } + + // Output is batch partitioned. + if (output_batch_partitions == num_partitions_) { + auto resharded_lhs = lhs.Reshard(*output_sharding_transposed_to_match_lhs); + auto resharded_rhs = rhs.Reshard(*output_sharding_transposed_to_match_rhs); + TF_ASSIGN_OR_RETURN(auto dot, create_sharded_dot(resharded_lhs.hlo(), + resharded_rhs.hlo(), &b_)); + SetPartitionedHlo(hlo, [&] { return dot; }); + return Status::OK(); + } + // Output is partitioned along LHS non-contracting dimensions. 
+ if (output_lhs_non_contracting_partitions == num_partitions_) { + auto resharded_lhs = lhs.Reshard(*output_sharding_transposed_to_match_lhs); + auto replicated_rhs = rhs.Reshard(HloSharding::Replicate()); + TF_ASSIGN_OR_RETURN( + auto dot, + create_sharded_dot(resharded_lhs.hlo(), replicated_rhs.hlo(), &b_)); + SetPartitionedHlo(hlo, [&] { return dot; }); + return Status::OK(); + } + // Output is partitioned along RHS non-contracting dimensions. + if (output_rhs_non_contracting_partitions == num_partitions_) { + auto replicated_lhs = lhs.Reshard(HloSharding::Replicate()); + auto resharded_rhs = rhs.Reshard(*output_sharding_transposed_to_match_rhs); + TF_ASSIGN_OR_RETURN(auto dot, create_sharded_dot(replicated_lhs.hlo(), + resharded_rhs.hlo(), &b_)); + SetPartitionedHlo(hlo, [&] { return dot; }); + return Status::OK(); + } + + // Returns true if it is beneficial to reshard the operand at `operand_idx` + // across the contracting dimension. + const auto should_partition_contracting_dim = [&](int64 operand_idx) { + if (!hlo->sharding().IsReplicated()) { + return false; + } + + if (operand_idx == 0) { + // If LHS and output are replicated, we compare the cost of all-gather + // on RHS vs all-reduce on the output. + return (rhs_contracting_partitions == num_partitions_) && + lhs.sharding().IsReplicated() && + ShapeUtil::ElementsIn(hlo->operand(1)->shape()) > + ShapeUtil::ElementsIn(hlo->shape()); + } else { + return (lhs_contracting_partitions == num_partitions_) && + rhs.sharding().IsReplicated() && + ShapeUtil::ElementsIn(hlo->operand(0)->shape()) > + ShapeUtil::ElementsIn(hlo->shape()); + } + }; + + // When the output is replicated and one of the operands is partitioned along + // contracting dimension, align the other operand to be partitioned along + // the contracting dimensions. + if (hlo->sharding().IsReplicated() && (should_partition_contracting_dim(0) || + should_partition_contracting_dim(1))) { + auto zero = b_.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::Zero(hlo->shape().element_type()))); + if (should_partition_contracting_dim(0)) { + lhs = + lhs.Reshard(*rhs_sharding_transposed_to_match_lhs).PadWithValue(zero); + rhs = rhs.PadWithValue(zero); + } else { + lhs = lhs.PadWithValue(zero); + rhs = + rhs.Reshard(*lhs_sharding_transposed_to_match_rhs).PadWithValue(zero); + } + TF_ASSIGN_OR_RETURN(auto dot, + create_sharded_dot(lhs.hlo(), rhs.hlo(), &b_)); + SetPartitionedHlo(hlo, [&] { + auto ar = collective_ops_creator_.create_cross_partition_all_reduce( + &b_, dot, MakeBinaryAdd(hlo->shape().element_type(), module_), + NewChannel()); + ar->set_sharding(HloSharding::Replicate()); + return PartitionedHlo(ar, hlo->shape(), MakePartitioningState()).hlo(); + }); + return Status::OK(); + } + + return DefaultAction(hlo); +} + +namespace { + +// Finds a cluster of nodes that produce the inputs for `hlo` which only depend +// on small operands, which means the cluster should start with broadcasts, +// constants and iotas. All other internal nodes must be non-side-effecting +// elemntwise ops. Returns the set of nodes, and the small operands. E.g., for +// the following graph, +// +// a -> broadcast -> multiply +// iota ---> add--/ +// constant/ +// +// FindInputNodesIfOnlyDependOnSmallOperands(multiply) will return +// <{broadcast, iota, constant, add, multiply}, [a]>. 
+std::pair, std::vector> +FindInputNodesIfOnlyDependOnSmallOperands(HloInstruction* hlo) { + std::unordered_set nodes_found; + std::vector new_operands; + std::unordered_set new_operands_set; + std::vector worklist; + worklist.push_back(hlo); + while (!worklist.empty()) { + auto inst = worklist.back(); + worklist.pop_back(); + if (nodes_found.count(inst) > 0) { + continue; + } + if (inst->opcode() == HloOpcode::kBroadcast || + inst->opcode() == HloOpcode::kConstant || + inst->opcode() == HloOpcode::kIota) { + nodes_found.insert(inst); + for (auto o : inst->operands()) { + auto res = new_operands_set.emplace(o); + if (res.second) { + new_operands.push_back(o); + } + } + } else if (inst->IsElementwise() && !inst->HasSideEffectNoRecurse() && + inst->opcode() != HloOpcode::kAllReduce && + absl::c_all_of(inst->operands(), + [inst](const HloInstruction* o) { + return ShapeUtil::CompatibleIgnoringElementType( + o->shape(), inst->shape()); + })) { + nodes_found.insert(inst); + for (auto o : inst->operands()) { + worklist.push_back(o); + } + } else { + nodes_found.clear(); + new_operands.clear(); + break; + } + } + return {std::move(nodes_found), std::move(new_operands)}; +} + +// Moves a cluster of memory-reducing nodes into the windowed dot-general loop +// on contracting dimensions. Such a loop has a dynamic slice on the +// non-windowed operand. If we move the input nodes into the loop, the +// dynamic-slice could be merged with them by later optimization passes, which +// reduces memory. +// +// small_operands small_operands +// | | +// input_nodes loop { | +// | => input_nodes +// loop { | | +// dynamic-slice dynamic-slice +// ... ... +// } } +// +// Later optimization passes (TpuPadSliceMover) will merge the dynamic slice +// with the input nodes. +Status SinkInputNodesIntoWindowedDotGeneralLoopOnContractingDimensions( + HloInstruction* loop, int64 non_windowed_operand_index) { + auto input_tuple = loop->mutable_operand(0); + auto old_operand = input_tuple->mutable_operand(non_windowed_operand_index); + auto input_nodes = FindInputNodesIfOnlyDependOnSmallOperands(old_operand); + auto to_sink = std::move(input_nodes.first); + auto new_operands = std::move(input_nodes.second); + if (to_sink.empty()) { + return Status::OK(); + } + auto computation = loop->parent(); + // Replace the old operand with a tuple of the found small operands. + auto new_input_subtuple = + computation->AddInstruction(HloInstruction::CreateTuple(new_operands)); + TF_RETURN_IF_ERROR(input_tuple->ReplaceOperandWithDifferentShape( + non_windowed_operand_index, new_input_subtuple)); + + auto body = loop->while_body(); + auto body_param = body->parameter_instruction(0); + auto old_body_param_users = body_param->users(); + // Update all tuple shapes. + for (auto tuple : std::vector{ + input_tuple, loop, loop->while_condition()->parameter_instruction(0), + body_param, body->root_instruction()}) { + *ShapeUtil::GetMutableSubshape(tuple->mutable_shape(), + {non_windowed_operand_index}) = + new_input_subtuple->shape(); + } + // Now update the loop body. + auto new_operand_tuple_inside = + body->AddInstruction(HloInstruction::CreateGetTupleElement( + new_input_subtuple->shape(), body_param, non_windowed_operand_index)); + TF_RETURN_IF_ERROR(body->root_instruction()->ReplaceOperandWithDifferentShape( + non_windowed_operand_index, new_operand_tuple_inside)); + + // Create nodes inside the loop body. 
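+  // For example (hypothetical graph): if the non-windowed operand was
+  // multiply(broadcast(a), iota), the loop now receives the small tuple (a),
+  // and the broadcast, iota and multiply are recreated inside the body from a
+  // get-tuple-element of that tuple, so the dynamic-slice in the body can
+  // later be fused with them.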
+ std::vector worklist; + std::unordered_map outside_to_inside; + auto add_users_if_available = [&](HloInstruction* inst) { + for (auto u : inst->users()) { + if (outside_to_inside.count(u) == 0 && to_sink.count(u) > 0 && + absl::c_all_of(u->operands(), [&](const HloInstruction* o) { + return outside_to_inside.count(o) > 0; + })) { + worklist.push_back(u); + } + } + }; + for (int64 i = 0; i < new_operands.size(); ++i) { + outside_to_inside[new_operands[i]] = + body->AddInstruction(HloInstruction::CreateGetTupleElement( + new_operands[i]->shape(), new_operand_tuple_inside, i)); + add_users_if_available(new_operands[i]); + } + // HLOs to sink without operands. + std::vector nullaries_to_sink; + for (auto inst : to_sink) { + if (inst->operand_count() == 0) { + nullaries_to_sink.push_back(inst); + } + } + // Sort nullaries_to_sink to make it deterministic. + absl::c_sort(nullaries_to_sink, + [](const HloInstruction* a, const HloInstruction* b) { + return a->unique_id() < b->unique_id(); + }); + for (auto inst : nullaries_to_sink) { + worklist.push_back(inst); + } + while (!worklist.empty()) { + auto inst = worklist.back(); + worklist.pop_back(); + std::vector inst_new_operands(inst->operand_count()); + for (int64 i = 0; i < inst->operand_count(); ++i) { + inst_new_operands[i] = outside_to_inside[inst->operand(i)]; + } + outside_to_inside[inst] = body->AddInstruction( + inst->CloneWithNewOperands(inst->shape(), inst_new_operands)); + add_users_if_available(inst); + } + TF_RET_CHECK(outside_to_inside.count(old_operand) > 0); + for (auto ou : old_body_param_users) { + if (ou->opcode() == HloOpcode::kGetTupleElement && + ou->tuple_index() == non_windowed_operand_index) { + TF_RETURN_IF_ERROR( + ou->ReplaceAllUsesWith(outside_to_inside[old_operand])); + TF_RETURN_IF_ERROR(body->RemoveInstruction(ou)); + } + } + return Status::OK(); +} + +// Moves a cluster of memory-reducing nodes (with reduce nodes at the end) into +// the windowed dot-general loop on non-contracting dimensions. Such a loop has +// a dynamic-update-slice at the output. If we move the user nodes into the loop +// and before the dynamic-update-slice, the user nodes can operate on smaller +// shapes, which reduces memory. +// +// small_operands small_operands +// | | => | | +// | | loop { loop { | | +// | | conv | broadcast conv +// | | | | | / +// | | dynamic-update-slice | dynamic-slice / +// | | | | | / +// | | } | | multiply----- +// |broadcast / | / +// | | / reduce +// |multiply-- | +// \ | dynamic-update-slice +// reduce } +// +// Later optimization passes (TpuPadSliceMover) will merge the dynamic slice +// with the input nodes (broadcast). +Status MoveUsersIntoWindowedDotGeneralLoopOnNonContractingDimensions( + HloInstruction* loop) { + CHECK_EQ(loop->user_count(), 1); + // There should be a single direct user of the while loop, which is the + // gte for element 2, i.e., the dot output. + auto user_gte = loop->users().front(); + CHECK_EQ(user_gte->opcode(), HloOpcode::kGetTupleElement); + CHECK_EQ(user_gte->tuple_index(), 2); + auto computation = loop->parent(); + + // Find the reduce outputs and the input nodes they depend on, if input nodes + // only have small operands. 
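+  // In the sketch above, for instance, to_move would end up containing the
+  // loop output (or the slice of it), the broadcast, the multiply and the
+  // reduce; new_operands would collect the broadcast's small operand and the
+  // reduce's init value; and reduce_outputs would contain the reduce itself.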
+ std::unordered_set<HloInstruction*> to_move; + std::vector<HloInstruction*> new_operands; + std::unordered_set<const HloInstruction*> new_operands_set; + std::vector<HloInstruction*> reduce_outputs; + std::vector<HloInstruction*> worklist; + Shape padded_shape = user_gte->shape(); + Shape unpadded_shape = user_gte->shape(); + auto original_output = user_gte; + + if (user_gte->user_count() == 1 && + user_gte->users().back()->opcode() == HloOpcode::kSlice) { + original_output = user_gte->users().back(); + unpadded_shape = original_output->shape(); + } + for (auto u : original_output->users()) { + worklist.push_back(u); + } + to_move.insert(original_output); + while (!worklist.empty()) { + auto inst = worklist.back(); + worklist.pop_back(); + if (to_move.count(inst) > 0) { + continue; + } + // We only support reduces with a simple reduction function, since we may need + // to accumulate across iterations manually. + if (inst->opcode() == HloOpcode::kReduce && + inst->to_apply()->instruction_count() == 3 && + inst->to_apply()->num_parameters() == 2 && + inst->to_apply()->root_instruction()->IsElementwise()) { + to_move.insert(inst); + auto other_operand = inst->mutable_operand(1); + auto res = new_operands_set.emplace(other_operand); + if (res.second) { + new_operands.push_back(other_operand); + } + reduce_outputs.push_back(inst); + } else if (inst != computation->root_instruction() && + inst->user_count() > 0 && inst->IsElementwise() && + !inst->HasSideEffectNoRecurse() && + inst->opcode() != HloOpcode::kAllReduce && + absl::c_all_of(inst->operands(), + [inst](const HloInstruction* o) { + return ShapeUtil::CompatibleIgnoringElementType( + o->shape(), inst->shape()); + })) { + // For an elementwise op, we need to make sure that it depends only on + // nodes already in to_move and nodes with small operands. + bool can_include = true; + for (auto operand : inst->operands()) { + if (to_move.count(operand) > 0) { + continue; + } + auto find_result = FindInputNodesIfOnlyDependOnSmallOperands(operand); + if (find_result.first.empty()) { + can_include = false; + break; + } + for (auto n : find_result.first) { + to_move.insert(n); + } + for (auto new_operand : find_result.second) { + auto res = new_operands_set.insert(new_operand); + if (res.second) { + new_operands.push_back(new_operand); + } + } + } + if (!can_include) { + to_move.clear(); + break; + } + to_move.insert(inst); + for (auto u : inst->users()) { + worklist.push_back(u); + } + } else { + to_move.clear(); + break; + } + } + // If nothing is found, to_move could contain only original_output, or may have been cleared + // by the above code. + if (to_move.size() <= 1) { + return Status::OK(); + } + + // We will replace the original loop output with reduce-shape outputs. Create + // the initial buffers before the loop. + for (auto out : reduce_outputs) { + auto padded_out_shape = out->shape(); + int64 operand_dim = 0; + int64 output_dim = 0; + while (output_dim < padded_out_shape.rank()) { + if (absl::c_linear_search(out->dimensions(), operand_dim)) { + // Dimension collapsed. + ++operand_dim; + continue; + } + // Kept dimensions have the same size as the padded shape.
+ padded_out_shape.set_dimensions(output_dim, + padded_shape.dimensions(operand_dim)); + ++operand_dim; + ++output_dim; + } + auto broadcast = + computation->AddInstruction(HloInstruction::CreateBroadcast( + padded_out_shape, + computation->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::Zero(out->shape().element_type()))), + {})); + new_operands.push_back(broadcast); + } + + auto input_tuple = loop->mutable_operand(0); + // Create the new input subtuple that contains the small operands and the + // reduce-shape result buffers. + auto new_input_subtuple = + computation->AddInstruction(HloInstruction::CreateTuple(new_operands)); + TF_RETURN_IF_ERROR( + input_tuple->ReplaceOperandWithDifferentShape(2, new_input_subtuple)); + auto body = loop->while_body(); + auto body_param = body->parameter_instruction(0); + auto body_root = body->root_instruction(); + CHECK_EQ(body_root->opcode(), HloOpcode::kTuple); + // Update tuple shapes. + for (auto tuple : std::vector{ + input_tuple, loop, loop->while_condition()->parameter_instruction(0), + body_param, body_root}) { + *ShapeUtil::GetMutableSubshape(tuple->mutable_shape(), {2}) = + new_input_subtuple->shape(); + } + auto new_loop_input = + body->AddInstruction(HloInstruction::CreateGetTupleElement( + new_input_subtuple->shape(), body_param, 2)); + + // Now create the moved nodes inside the loop body. + std::unordered_map outside_to_inside; + worklist.clear(); + auto add_users_if_available = [&](HloInstruction* inst) { + for (auto u : inst->users()) { + if (outside_to_inside.count(u) == 0 && to_move.count(u) > 0 && + absl::c_all_of(u->operands(), [&](const HloInstruction* o) { + return outside_to_inside.count(o) > 0; + })) { + worklist.push_back(u); + } + } + }; + for (int64 i = 0; i < new_operands.size(); ++i) { + outside_to_inside[new_operands[i]] = + body->AddInstruction(HloInstruction::CreateGetTupleElement( + new_operands[i]->shape(), new_loop_input, i)); + add_users_if_available(new_operands[i]); + } + // The elementwise nodes will be created with sliced shape. The original loop + // output corresponds to the dynamic-update-slice's update slice. + auto dus = body_root->mutable_operand(2); + CHECK_EQ(dus->opcode(), HloOpcode::kDynamicUpdateSlice); + outside_to_inside[original_output] = dus->mutable_operand(1); + add_users_if_available(original_output); + std::vector slice_offsets(padded_shape.rank()); + for (int64 i = 0; i < slice_offsets.size(); ++i) { + slice_offsets[i] = dus->mutable_operand(i + 2); + } + auto get_slice = [&](HloInstruction* padded) { + return body->AddInstruction(HloInstruction::CreateDynamicSlice( + ShapeUtil::ChangeElementType(dus->operand(1)->shape(), + padded->shape().element_type()), + padded, slice_offsets, dus->operand(1)->shape().dimensions())); + }; + // Helper functions to create nodes with small operands. 
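+  // These recreate a broadcast/iota/constant at the padded loop shape and
+  // then slice out the per-iteration window, so the moved elementwise ops
+  // only ever see slice-sized data. E.g. (hypothetical shapes), a scalar
+  // weight broadcast to a padded [8, 1024] output is rebuilt inside the body
+  // and immediately dynamic-sliced down to the [8, 128] update slice of the
+  // dynamic-update-slice.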
+ auto add_broadcast = [&](const HloInstruction* broadcast) { + auto padded_operand_shape = broadcast->operand(0)->shape(); + for (int64 i = 0; i < broadcast->dimensions().size(); ++i) { + padded_operand_shape.set_dimensions( + i, padded_shape.dimensions(broadcast->dimensions(i))); + } + auto padded_operand = PadToShape(outside_to_inside[broadcast->operand(0)], + padded_operand_shape, nullptr, body); + outside_to_inside[broadcast] = + get_slice(body->AddInstruction(broadcast->CloneWithNewOperands( + ShapeUtil::ChangeElementType(padded_shape, + padded_operand_shape.element_type()), + {padded_operand}))); + }; + auto add_iota = [&](const HloInstruction* iota) { + outside_to_inside[iota] = + get_slice(body->AddInstruction(iota->CloneWithNewOperands( + ShapeUtil::ChangeElementType(padded_shape, + iota->shape().element_type()), + {}))); + }; + auto add_constant = [&](const HloInstruction* constant) { + outside_to_inside[constant] = body->AddInstruction(constant->Clone()); + outside_to_inside[constant] = get_slice( + PadToShape(outside_to_inside[constant], + ShapeUtil::ChangeElementType( + padded_shape, constant->shape().element_type()), + nullptr, body)); + }; + while (!worklist.empty()) { + auto inst = worklist.back(); + worklist.pop_back(); + if (outside_to_inside.count(inst) > 0) { + continue; + } + if (inst->opcode() == HloOpcode::kBroadcast) { + add_broadcast(inst); + } else if (inst->opcode() == HloOpcode::kIota) { + add_iota(inst); + } else if (inst->opcode() == HloOpcode::kConstant) { + add_constant(inst); + } else if (inst->opcode() == HloOpcode::kReduce) { + // This is an output, for which we have special handling later. + } else { + std::vector<HloInstruction*> operands_inside(inst->operand_count()); + for (int64 i = 0; i < operands_inside.size(); ++i) { + operands_inside[i] = outside_to_inside[inst->operand(i)]; + } + outside_to_inside[inst] = body->AddInstruction(inst->CloneWithNewOperands( + ShapeUtil::ChangeElementType(dus->operand(1)->shape(), + inst->shape().element_type()), + operands_inside)); + } + add_users_if_available(inst); + } + std::vector<HloInstruction*> new_outputs_inside(new_operands.size()); + for (int64 i = 0; i < new_outputs_inside.size(); ++i) { + new_outputs_inside[i] = outside_to_inside[new_operands[i]]; + } + // Now create the reduce outputs inside the loop. + for (int64 i = 0; i < reduce_outputs.size(); ++i) { + auto reduce_outside = reduce_outputs[i]; + CHECK_EQ(reduce_outside->opcode(), HloOpcode::kReduce); + int64 index_in_operand = new_operands.size() - reduce_outputs.size() + i; + auto last_iter_result = outside_to_inside[new_operands[index_in_operand]]; + auto operand0 = outside_to_inside[reduce_outside->operand(0)]; + auto operand1 = outside_to_inside[reduce_outside->operand(1)]; + TF_ASSIGN_OR_RETURN(auto reduce_shape, + ShapeInference::InferReduceShape( + {&operand0->shape(), &operand1->shape()}, + reduce_outside->dimensions(), + reduce_outside->to_apply()->ComputeProgramShape())); + *reduce_shape.mutable_layout() = reduce_outside->shape().layout(); + std::vector<HloInstruction*> reduce_dus_offsets; + // If any collapsed dimension is windowed, we need to accumulate with last + // iteration's result. If such a dimension has padding, we also need to mask + // off invalid data.
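+    // Worked example (hypothetical sizes): if a collapsed dimension has full
+    // size 1000 but is padded to 1024 and windowed in 128-element slices, the
+    // last slice covers [896, 1024); positions where iota + slice_offset >=
+    // 1000 are replaced below with the reduction's init value, so the 24
+    // padding elements contribute only the init value to the accumulation.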
+ bool needs_accumulate = false; + std::vector dims_to_mask; + for (int64 i = 0; i < slice_offsets.size(); ++i) { + if (absl::c_linear_search(reduce_outside->dimensions(), i)) { + if (reduce_outside->operand(0)->shape().dimensions(i) != + operand0->shape().dimensions(i)) { + needs_accumulate = true; + if (unpadded_shape.dimensions(i) != padded_shape.dimensions(i)) { + dims_to_mask.push_back(i); + } + } + continue; + } + reduce_dus_offsets.push_back(slice_offsets[i]); + } + // Mask off invalid data in collapsed dimensions. + for (int64 dim : dims_to_mask) { + auto iota = body->AddInstruction(HloInstruction::CreateIota( + ShapeUtil::ChangeElementType(operand0->shape(), S32), dim)); + auto add = body->AddInstruction(HloInstruction::CreateBinary( + iota->shape(), HloOpcode::kAdd, iota, + body->AddInstruction(HloInstruction::CreateBroadcast( + iota->shape(), slice_offsets[dim], {})))); + auto limit = body->AddInstruction(HloInstruction::CreateBroadcast( + iota->shape(), + body->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0( + reduce_outside->operand(0)->shape().dimensions(dim)))), + {})); + auto compare = body->AddInstruction(HloInstruction::CreateCompare( + ShapeUtil::ChangeElementType(iota->shape(), PRED), add, limit, + ComparisonDirection::kLt)); + operand0 = body->AddInstruction(HloInstruction::CreateTernary( + operand0->shape(), HloOpcode::kSelect, compare, operand0, + body->AddInstruction(HloInstruction::CreateBroadcast( + operand0->shape(), operand1, {})))); + } + auto output_inside = + body->AddInstruction(reduce_outside->CloneWithNewOperands( + reduce_shape, {operand0, operand1})); + // Accumulate with previous results if needed. + if (needs_accumulate) { + auto input_slice = + body->AddInstruction(HloInstruction::CreateDynamicSlice( + output_inside->shape(), last_iter_result, reduce_dus_offsets, + output_inside->shape().dimensions())); + output_inside = body->AddInstruction(HloInstruction::CreateBinary( + output_inside->shape(), + reduce_outside->to_apply()->root_instruction()->opcode(), + output_inside, input_slice)); + } + // Dynamic-update-slice if needed. + if (!ShapeUtil::Compatible(output_inside->shape(), + last_iter_result->shape())) { + output_inside = + body->AddInstruction(HloInstruction::CreateDynamicUpdateSlice( + last_iter_result->shape(), last_iter_result, output_inside, + reduce_dus_offsets)); + } + new_outputs_inside[index_in_operand] = output_inside; + } + // Body output. + auto new_output_inside = + body->AddInstruction(HloInstruction::CreateTuple(new_outputs_inside)); + TF_RETURN_IF_ERROR( + body_root->ReplaceOperandWithDifferentShape(2, new_output_inside)); + TF_RETURN_IF_ERROR(body->RemoveInstructionAndUnusedOperands(dus)); + // Replace uses of the reduces outside the loop. 
+ auto new_output_gte = + computation->AddInstruction(HloInstruction::CreateGetTupleElement( + new_output_inside->shape(), loop, 2)); + for (int64 i = 0; i < reduce_outputs.size(); ++i) { + int64 index_in_operand = new_operands.size() - reduce_outputs.size() + i; + auto new_output = + computation->AddInstruction(HloInstruction::CreateGetTupleElement( + new_outputs_inside[index_in_operand]->shape(), new_output_gte, + index_in_operand)); + if (!ShapeUtil::Compatible(new_output->shape(), + reduce_outputs[i]->shape())) { + new_output = computation->AddInstruction(HloInstruction::CreateSlice( + reduce_outputs[i]->shape(), new_output, + std::vector(new_output->shape().rank(), 0), + reduce_outputs[i]->shape().dimensions(), + std::vector(new_output->shape().rank(), 1))); + } + TF_RETURN_IF_ERROR(reduce_outputs[i]->ReplaceAllUsesWith(new_output)); + TF_RETURN_IF_ERROR( + computation->RemoveInstructionAndUnusedOperands(reduce_outputs[i])); + } + return Status::OK(); +} + +} // namespace + +Status SpmdPartitioningVisitor::DoCodeMotionForWindowedDotGeneralLoops( + HloComputation* computation) { + for (auto& loop : windowed_dot_general_loops_) { + if (loop.windowed_in_contracting_dims || loop.windowed_in_batch_dims) { + // We have a dynamic-slice for the non-windowed operand in + // batch/contracting-dim windowed dot-general. So moving the + // broadcast/iota/elementwise ops into the loop could help reduce memory + // via fusion. + TF_RETURN_IF_ERROR( + SinkInputNodesIntoWindowedDotGeneralLoopOnContractingDimensions( + loop.while_loop, 1 - loop.windowed_operand)); + } + if (!loop.windowed_in_contracting_dims) { + // We have a dynamic-update-slice for the output in + // batch/non-contracting-dim windowed dot-general. So moving reduce ops + // into the loop could help reduce memory. + TF_RETURN_IF_ERROR( + MoveUsersIntoWindowedDotGeneralLoopOnNonContractingDimensions( + loop.while_loop)); + } + } + return Status::OK(); +} + +StatusOr SpmdPartitioningVisitor::DoPartition( + HloComputation* computation, const HloSharding& root_sharding) { + VLOG(2) << "Partitioning computation " << computation->name() << " for " + << num_replicas_ << " replicas and " << num_partitions_ + << " partitions"; + TF_RETURN_IF_ERROR(computation->Accept(this)); + + HloModule* module = computation->parent(); + auto new_root = + GetPartitionedHlo(computation->root_instruction()).Reshard(root_sharding); + auto new_computation = + module->AddEmbeddedComputation(b_.Build(new_root.hlo())); + TF_RETURN_IF_ERROR(DoCodeMotionForWindowedDotGeneralLoops(new_computation)); + + // Replace the original computation with the new SPMD computation. 
+ std::unordered_map replacement; + replacement[computation] = new_computation; + module->ReplaceComputations(replacement); + return changed_; +} + +Status SpmdPartitioningVisitor::HandlePartitionId(HloInstruction* hlo) { + return Unimplemented( + "PartitionId instruction is not supported for SPMD partitioning since " + "the meaning is ambiguous -- whether the instruction is replicated or " + "the data is replicated, and if the latter which data is replicated."); +} + +SpmdPartitioner::SpmdPartitioner(int64 num_partitions, int64 num_replicas, + SpmdPartitionerOptions options) + : SpmdPartitioner( + num_partitions, num_replicas, std::move(options), + SPMDCollectiveOpsCreator{ + [](SpmdBuilder* b) { + return b->AddInstruction(HloInstruction::CreatePartitionId()); + }, + [num_replicas](SpmdBuilder* b, HloInstruction* operand, + HloComputation* reduction, int64 channel_id) { + return b->AddInstruction(HloInstruction::CreateAllReduce( + operand->shape(), {operand}, reduction, + CreateReplicaGroups(num_replicas), + /*constrain_layout=*/false, channel_id, + /*use_global_device_ids=*/false)); + }, + [](SpmdBuilder* b, HloInstruction* operand, + std::vector>& src_dst_pairs, + int64 channel_id) { + return b->AddInstruction( + HloInstruction::CreateCollectivePermute( + operand->shape(), operand, src_dst_pairs, channel_id)); + }, + [](SpmdBuilder* b, absl::Span operands, + const std::vector& replica_groups, + int64 channel_id, absl::optional split_dimension) { + std::vector shapes(operands.size(), + operands[0]->shape()); + const Shape output_shape = + (shapes.size() == 1) ? shapes[0] + : ShapeUtil::MakeTupleShape(shapes); + return b->AddInstruction(HloInstruction::CreateAllToAll( + output_shape, operands, replica_groups, + /*constrain_layout=*/false, channel_id, split_dimension)); + }, + }) {} + +StatusOr SpmdPartitioner::PartitionComputation( + HloComputation* computation, const HloSharding& root_sharding, + int64* next_channel_id, SpmdLogger* logger) { + auto visitor = + CreateVisitor(computation, num_partitions_, num_replicas_, + collective_ops_creator_, next_channel_id, logger, options_); + return visitor->DoPartition(computation, root_sharding); +} + +std::unique_ptr SpmdPartitioner::CreateVisitor( + HloComputation* computation, int64 num_partitions, int64 num_replicas, + const SPMDCollectiveOpsCreator& collective_ops_creator, + int64* next_channel_id, SpmdLogger* logger, + SpmdPartitionerOptions options) { + return absl::make_unique( + computation, num_partitions, num_replicas, collective_ops_creator, + next_channel_id, logger, std::move(options), this); +} + +StatusOr SpmdPartitioner::Run(HloModule* module) { + TF_RETURN_IF_ERROR(PreprocessSharding(module)); + + XLA_VLOG_LINES(1, SpmdLogger::ReportBeforePartition( + *module, options_.report_instruction_count)); + + // Add the parameters' and output's shardings to the module. 
+ std::vector entry_params_shardings; + for (int64 i = 0; i < module->entry_computation()->num_parameters(); ++i) { + auto param = module->entry_computation()->parameter_instruction(i); + CHECK(param->has_sharding()) << "Missing sharding in entry parameter " << i; + entry_params_shardings.push_back(param->sharding()); + } + module->set_spmd_parameters_shardings(entry_params_shardings); + auto entry_root = module->entry_computation()->root_instruction(); + CHECK(entry_root->has_sharding()) << "Missing sharding in entry root."; + module->set_spmd_output_sharding(entry_root->sharding()); + + FlattenCallGraph flatten; + TF_ASSIGN_OR_RETURN(auto changed, flatten.Run(module)); + + SpmdLogger logger(options_.report_instruction_count); + auto program_shape = module->entry_computation()->ComputeProgramShape(); + int64 next_channel_id = hlo_query::NextChannelId(*module); + TF_ASSIGN_OR_RETURN( + bool partition_changed, + PartitionComputation( + module->entry_computation(), + module->entry_computation()->root_instruction()->sharding(), + &next_channel_id, &logger)); + changed |= partition_changed; + + // For the entry computation, make sure that the root instruction and the + // parameters preserve their signatures. + auto new_program_shape = module->entry_computation()->ComputeProgramShape(); + if (!options_.allow_module_signature_change) { + TF_RET_CHECK(Shape::Equal().MinorToMajorOnlyInLayout()( + program_shape.result(), new_program_shape.result())) + << "Result shape changed for the entry computation"; + TF_RET_CHECK(program_shape.parameters_size() == + new_program_shape.parameters_size()) + << "Parameter count changed for the entry computation"; + for (int64 i = 0; i < program_shape.parameters_size(); ++i) { + TF_RET_CHECK(Shape::Equal().MinorToMajorOnlyInLayout()( + program_shape.parameters(i), new_program_shape.parameters(i))) + << "Parameter shape changed for the entry computation"; + } + } else { + const auto& old_entry_layout = module->entry_computation_layout(); + // Shapes can change but the layout should still remain the same. 
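+    // That is, copy the original entry layout onto the (possibly resharded)
+    // parameter and result shapes, then install the updated entry computation
+    // layout in the module config.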
+ for (int64 i = 0; i < new_program_shape.parameters_size(); ++i) { + TF_RETURN_IF_ERROR(LayoutUtil::CopyLayoutBetweenShapes( + old_entry_layout.parameter_shape(i), + new_program_shape.mutable_parameters(i))); + } + TF_RETURN_IF_ERROR(LayoutUtil::CopyLayoutBetweenShapes( + old_entry_layout.result_shape(), new_program_shape.mutable_result())); + + HloModuleConfig config = module->config(); + *config.mutable_entry_computation_layout() = + ComputationLayout(new_program_shape, /*ignore_layouts=*/false); + module->set_config(config); + } + + XLA_VLOG_LINES(1, SpmdLogger::ReportAfterPartition( + *module, options_.report_instruction_count)); + XLA_VLOG_LINES(1, logger.MakeReport()); + + if (changed) { + HloPassPipeline pass("spmd-cleanup"); + pass.AddPass(); + pass.AddPass(); + pass.AddPass(/*is_layout_sensitive=*/true); + pass.AddPass(); + TF_RETURN_IF_ERROR(pass.Run(module).status()); + } + + TF_RETURN_IF_ERROR(ClearShardingAttributes(module)); + return changed; +} + +Status SpmdPartitioner::PreprocessSharding(HloModule* module) { + for (HloComputation* computation : module->computations()) { + for (HloInstruction* hlo : computation->instructions()) { + if (hlo->HasSideEffectNoRecurse() && hlo->opcode() != HloOpcode::kRng) { + TF_RET_CHECK(hlo->has_sharding()) + << "Side-effect HLO must have sharding: " << hlo->ToString(); + TF_RET_CHECK(!HasReplicatedSharding(hlo->sharding()) || + hlo->opcode() == HloOpcode::kInfeed) + << "Non-infeed side-effect HLO cannot have a replicated sharding:" + << hlo->ToString(); + } + + // For unassigned HLOs, annotate with replicated sharding. + // + // Among side-effecting ops, only Rng is allowed to omit the annotation. + // In that case, we currently force it to run on core 0, since we don't + // support partitioning or replicating the Rng op (the values depend on + // the seed provided to each device). + // + // TODO(hyouklee): Should we also convert single-device shardings (without + // side-effects) into replicated? + if (!hlo->has_sharding()) { + if (hlo->opcode() == HloOpcode::kRng) { + hlo->set_sharding(HloSharding::AssignDevice(0)); + } else { + hlo->set_sharding( + HloSharding::Single(hlo->shape(), HloSharding::Replicate())); + } + } else if (!hlo->sharding().IsTileMaximal()) { + std::vector available(num_partitions_); + std::iota(available.begin(), available.end(), 0); + TF_RET_CHECK(num_partitions_ == hlo_sharding_util::DevicesForSharding( + hlo->sharding(), available) + .size()) + << "num_partitions:" << num_partitions_ << "\n" + << "SPMD partitioner only supports tile sharding that includes all " + "partitions. If you didn't add this sharding annotation in the " + "model, please file a bug to XLA team.\n" + << hlo->ToString(); + } + } + } + + // Entry computation's parameter and root sharding must be either all + // replicated or all on a single device. 
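+  // Anything else would require the entry signature to change to the
+  // per-partition shapes, which is only permitted when
+  // allow_module_signature_change is set (the check below is skipped in that
+  // case).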
+ if (!options_.allow_module_signature_change) { + const HloComputation* entry = module->entry_computation(); + TF_RET_CHECK(entry->root_instruction()->has_sharding()); + const HloSharding& root_sharding = entry->root_instruction()->sharding(); + TF_RET_CHECK(root_sharding.IsReplicated() || + root_sharding.UniqueDevice().has_value()) + << "Unsupported entry root sharding: " << root_sharding.ToString(); + + for (const HloInstruction* param : entry->parameter_instructions()) { + TF_RET_CHECK(param->has_sharding()); + TF_RET_CHECK(param->sharding().IsReplicated() || + param->sharding().UniqueDevice().has_value()) + << "Unsupported entry parameter sharding:" + << param->sharding().ToString(); + } + } + + return Status::OK(); +} + +} // namespace spmd +} // namespace xla diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner.h b/tensorflow/compiler/xla/service/spmd/spmd_partitioner.h new file mode 100644 index 00000000000..f22f564be73 --- /dev/null +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner.h @@ -0,0 +1,436 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_SPMD_SPMD_PARTITIONER_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_SPMD_SPMD_PARTITIONER_H_ + +#include +#include +#include + +#include "absl/types/optional.h" +#include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_pass_interface.h" +#include "tensorflow/compiler/xla/service/hlo_sharding.h" + +namespace xla { +namespace spmd { + +struct SpmdPartitionerOptions { + // Always exchange halo on LHS for all convolutions. If false, backprop filter + // convolution exchanges halo on RHS. + bool conv_halo_exchange_always_on_lhs = true; + + // The number of instructions to be reported for the highest memory profile + // instructions. + int64 report_instruction_count = 5; + + // The minimum size in MiB of an einsum operand to be considered using + // windowed implementation in an HLO loop. + int64 threshold_for_windowed_einsum_mib = 256; + + // Whether the entry computations' signature could change after partitioning. + bool allow_module_signature_change = false; +}; + +// Class to wrap the computation builder to capture information during SPMD +// transformation. 
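+//
+// Illustrative use (hypothetical snippet, not part of this change): a visitor
+// points the builder at the op currently being rewritten so that every new
+// instruction is recorded as derived from it, e.g.
+//   b.set_visiting_hlo(old_hlo);
+//   auto* zero = b.AddInstruction(
+//       HloInstruction::CreateConstant(LiteralUtil::Zero(F32)));
+//   // b.derived_instructions(old_hlo) now includes `zero`.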
+class SpmdBuilder : public HloComputation::Builder { + public: + SpmdBuilder(const std::string& name, HloInstruction* hlo) + : HloComputation::Builder(name) { + visiting_hlo_ = hlo; + } + HloInstruction* AddInstruction(std::unique_ptr instruction); + + const std::vector& derived_instructions( + HloInstruction* hlo) { + return instructions_.at(hlo); + } + + void set_visiting_hlo(HloInstruction* hlo) { visiting_hlo_ = hlo; } + + HloInstruction* visiting_hlo() const { return visiting_hlo_; } + + private: + // Currently visiting instruction. + HloInstruction* visiting_hlo_; + + // Map from the currently visiting (old) instruction to new instructions + // created during SPMD partitioning. + HloInstructionMap> instructions_; +}; + +// A set of functions that create the cross-partition collective ops. +struct SPMDCollectiveOpsCreator { + // Function used to create a partition ID HLO. + std::function create_partition_id; + + // Function used to create a cross-partition all-reduce HLO. + std::function + create_cross_partition_all_reduce; + + // Function used to create a cross-partition collective-permute HLO. + std::function>& src_dst_pairs, + int64 next_channel_id)> + create_cross_partition_collective_permute; + + // Function used to create a cross-partition all-to-all HLO. + std::function operands, + const std::vector& replica_groups, int64 channel_id, + absl::optional split_dimension)> + create_cross_partition_all_to_all; +}; + +// Logger to report memory usage during SPMD partitioning. +class SpmdLogger { + public: + explicit SpmdLogger(int64 report_instruction_count) + : report_instruction_count_(report_instruction_count) {} + static std::string ReportBeforePartition(const HloModule& module, + int64 report_instruction_count); + static std::string ReportAfterPartition(const HloModule& module, + int64 report_instruction_count); + + // Registers the logging for the groups of instructions created to transform + // the given hlo. + void RegisterLogEntry(HloInstruction* hlo, + const std::vector& group); + + std::string MakeReport(); + + private: + template + static std::string ReportMemoryUsage(const HloModule& module, const F& filter, + int64 report_instruction_count); + + // A vector of logging messages (one for each original HLO instruction), where + // the first integer of the pair represents the size of the HBM used. + std::vector> entries_; + + int64 report_instruction_count_; +}; + +class SpmdPartitioningVisitor; + +class SpmdPartitioner : public HloModulePass { + public: + SpmdPartitioner(int64 num_partitions, int64 num_replicas, + SpmdPartitionerOptions options); + SpmdPartitioner(int64 num_partitions, int64 num_replicas, + SpmdPartitionerOptions options, + SPMDCollectiveOpsCreator collective_ops_creator) + : num_partitions_(num_partitions), + num_replicas_(num_replicas), + options_(std::move(options)), + collective_ops_creator_(std::move(collective_ops_creator)) {} + absl::string_view name() const override { return "spmd-partitioning"; } + StatusOr Run(HloModule* module) override; + + // Transforms the given computation with SPMD instructions, replacing it with + // a new computation. 
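+  //
+  // Illustrative call sequence (hypothetical driver mirroring Run() in the
+  // .cc file; `root_sharding` is whatever sharding the caller wants for the
+  // root):
+  //   SpmdLogger logger(options.report_instruction_count);
+  //   int64 next_channel_id = hlo_query::NextChannelId(*module);
+  //   TF_ASSIGN_OR_RETURN(
+  //       bool changed,
+  //       partitioner.PartitionComputation(module->entry_computation(),
+  //                                        root_sharding, &next_channel_id,
+  //                                        &logger));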
+ StatusOr PartitionComputation(HloComputation* computation, + const HloSharding& root_sharding, + int64* next_channel_id, + SpmdLogger* logger); + + protected: + virtual std::unique_ptr CreateVisitor( + HloComputation* computation, int64 num_partitions, int64 num_replicas, + const SPMDCollectiveOpsCreator& collective_ops_creator, + int64* next_channel_id, SpmdLogger* logger, + SpmdPartitionerOptions options); + + private: + // Verify that the sharding of instructions in the module are valid, and also + // fill in missing sharding information. + Status PreprocessSharding(HloModule* module); + + const int64 num_partitions_; + const int64 num_replicas_; + + SpmdPartitionerOptions options_; + SPMDCollectiveOpsCreator collective_ops_creator_; +}; + +// Class describes partition state of the data represented by an HLO created +// during SPMD partitioning pass. +// +// Data on some devices may include padding region, if the base (full) shape +// could not be evenly partitioned. +class PartitionedHlo { + public: + // Return value for ReshardAsWindowedInput which describes the resharded HLO, + // the window for the user on the shard, and if necessary, the dynamic slice + // offsets to be applied to the output of the op being sharded. + struct WindowedInputShardReturnValue { + HloInstruction* sharded_input; + Window shard_window; + absl::optional> dynamic_slice_index_on_output; + }; + // A cache for resharding each partitioned HLO. + struct ReshardCache { + struct PerHloCache { + std::vector> reshard_cache; + std::vector< + std::tuple> + window_reshard_cache; + }; + std::unordered_map per_hlo_cache; + }; + struct PartitioningState { + SpmdBuilder* b; + HloModule* module; + int64 num_replicas; + HloInstruction* partition_id; + SPMDCollectiveOpsCreator collective_ops_creator; + int64* next_channel_id; + ReshardCache* reshard_cache; + }; + PartitionedHlo(HloInstruction* hlo, Shape base_shape, PartitioningState state) + : hlo_(hlo), base_shape_(base_shape), state_(std::move(state)) { + CHECK(hlo->has_sharding()) + << "PartitionedHlo is missing sharding:" << hlo->ToString(); + // If the tuple shape instruction does not have a tuple sharding, reassign + // to use the tuple sharding. Reshard() implementation assumes this. + if (hlo_->shape().IsTuple() && !hlo_->sharding().IsTuple()) { + hlo_->set_sharding( + hlo_->sharding().GetTupleSharding(hlo_->shape()).ValueOrDie()); + } + } + + // Reshards the current SPMD instruction to a new sharding. Could only modify + // the reshard cache. + PartitionedHlo Reshard(const HloSharding& target); + + // Pads the garbage area of the output with the provided value. + PartitionedHlo PadWithValue(HloInstruction* pad_value) const; + + // Returns the SPMD instruction. + HloInstruction* hlo() const { return hlo_; } + + // Returns the sharding of the SPMD instruction. + const HloSharding& sharding() const { return hlo_->sharding(); } + + // Original full shape of the data. + const Shape& base_shape() const { return base_shape_; } + + int64 NewChannel() const { return (*state_.next_channel_id)++; } + + // Reshards the HLO to a usable partitioned input for a windowed user. Could + // only modify the reshard cache. + absl::optional ReshardAsWindowedInput( + const Window& window, const HloSharding& target, + HloInstruction* pad_value, bool mask_invalid_region = true); + + private: + // Same as Reshard except that it does not explicitly modify the reshard + // cache, although it would indirectly modify by calling Replicate(). 
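+  // The generic fallback is Replicate() followed by slicing out the local
+  // shard; the AllToAll and CollectivePermute variants below handle the cases
+  // where the data movement can be expressed without full replication.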
+ PartitionedHlo ReshardNoCache(const HloSharding& target); + + // Helper function to replicate the data on all devices. Could only modify + // the reshard cache. + PartitionedHlo Replicate(); + + // Helper function to broadcast data from a single device to all devices. + PartitionedHlo Broadcast() const; + + // Helper function to reshard the tensor using AllToAll (instead of the + // default of Replicate followed by Slice). + PartitionedHlo ReshardWithAllToAll(const HloSharding& target) const; + + // Helper function to reshard the tensor using CollectivePermute. + PartitionedHlo ReshardWithCollectivePermute(const HloSharding& target) const; + + // SPMD instruction. + HloInstruction* hlo_; + + // The original shape of the data before SPMD transformation is applied. + Shape base_shape_; + + PartitioningState state_; +}; + +struct DotGeneralDimsMapping { + // The dimension numbers for the operands and output corresponding to a + // logical dimension (e.g., batch, contracting, non-contracting). If an + // operand or the output doesn't have the logical dimension, it is set to + // -1. + struct DimsMapping { + int64 lhs; + int64 rhs; + int64 output; + }; + std::vector batch_dims; + std::vector contracting_dims; + std::vector lhs_non_contracting_dims; + std::vector rhs_non_contracting_dims; +}; + +class SpmdPartitioningVisitor : public DfsHloVisitorWithDefault { + public: + SpmdPartitioningVisitor( + HloComputation* computation, int64 num_partitions, int64 num_replicas, + const SPMDCollectiveOpsCreator& collective_ops_creator, + int64* next_channel_id, SpmdLogger* logger, + SpmdPartitionerOptions options, SpmdPartitioner* partitioner); + + Status DefaultAction(HloInstruction* hlo) override; + Status HandleAllReduce(HloInstruction* hlo) override; + Status HandleBroadcast(HloInstruction* hlo) override; + Status HandleConstant(HloInstruction* hlo) override; + Status HandleCustomCall(HloInstruction* hlo) override; + Status HandleDot(HloInstruction* hlo) override; + Status HandleDynamicSlice(HloInstruction* hlo) override; + Status HandleDynamicUpdateSlice(HloInstruction* hlo) override; + Status HandleGather(HloInstruction* hlo) override; + Status HandleGetTupleElement(HloInstruction* hlo) override; + Status HandleInfeed(HloInstruction* hlo) override; + Status HandleOutfeed(HloInstruction* hlo) override; + Status HandlePad(HloInstruction* hlo) override; + Status HandleParameter(HloInstruction* hlo) override; + Status HandleReduce(HloInstruction* hlo) override; + Status HandleReverse(HloInstruction* hlo) override; + Status HandleWhile(HloInstruction* hlo) override; + Status HandleConditional(HloInstruction* hlo) override; + Status HandleReduceWindow(HloInstruction* hlo) override; + Status HandleSelectAndScatter(HloInstruction* hlo) override; + Status HandleTuple(HloInstruction* hlo) override; + Status HandleRng(HloInstruction* hlo) override; + Status HandleConvolution(HloInstruction* hlo) override; + Status HandleConcatenate(HloInstruction* hlo) override; + Status HandleScatter(HloInstruction* hlo) override; + Status HandleSlice(HloInstruction* hlo) override; + Status HandleSort(HloInstruction* hlo) override; + Status HandleTranspose(HloInstruction* hlo) override; + Status HandleReshape(HloInstruction* hlo) override; + Status HandleIota(HloInstruction* hlo) override; + Status HandlePartitionId(HloInstruction* hlo) override; + + // Handles convolution where both LHS and RHS operands are tiled. 
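+  // Halo exchange is performed on the LHS by default; when
+  // SpmdPartitionerOptions::conv_halo_exchange_always_on_lhs is false,
+  // backprop-filter convolutions exchange halos on the RHS instead.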
+ Status HandleConvolutionTiledLhsAndRhs(HloInstruction* hlo); + + // Implementation of dot partitioning given DotGeneralDimsMapping. + Status HandleDotHelper( + HloInstruction* hlo, const DotGeneralDimsMapping& dims_mapping, + const std::function( + HloInstruction*, HloInstruction*, SpmdBuilder*)>& create_sharded_dot); + + // Common handle for elementwise HLOs. + Status HandleElementwise(HloInstruction* hlo); + + // Common handle for HLOs that runs on a single device. + Status HandleSingleDevice(const HloInstruction* hlo); + + // Returns the PartitionedHlo that corresponds to the original hlo. + PartitionedHlo& GetPartitionedHlo(const HloInstruction* hlo) { + CHECK_EQ(partitioned_instructions_.count(hlo), 1); + return partitioned_instructions_.find(hlo)->second; + } + + // Sets the PartitionedHlo for the original hlo. + void SetPartitionedHlo(const HloInstruction* hlo, + const PartitionedHlo& partitioned_hlo) { + CHECK_EQ(partitioned_instructions_.count(hlo), 0); + partitioned_instructions_.emplace(hlo, partitioned_hlo); + changed_ = true; + } + + // Convenient wrapper that creates PartitionedHlo from the result of the func + // and maps it to the given original hlo. + void SetPartitionedHlo(const HloInstruction* hlo, + const std::function& func) { + HloInstruction* new_hlo = func(); + new_hlo->set_sharding(hlo->sharding()); + new_hlo->set_metadata(hlo->metadata()); + SetPartitionedHlo( + hlo, PartitionedHlo(new_hlo, hlo->shape(), MakePartitioningState())); + changed_ = true; + } + + int64 NewChannel() { return (*next_channel_id_)++; } + + PartitionedHlo::PartitioningState MakePartitioningState() { + PartitionedHlo::PartitioningState state; + state.b = &b_; + state.module = module_; + state.num_replicas = num_replicas_; + state.partition_id = partition_id_; + state.collective_ops_creator = collective_ops_creator_; + state.next_channel_id = next_channel_id_; + state.reshard_cache = &reshard_cache_; + return state; + } + + SpmdBuilder* builder() { return &b_; } + + StatusOr DoPartition(HloComputation* computation, + const HloSharding& root_sharding); + + private: + Status Preprocess(HloInstruction* hlo) override; + Status Postprocess(HloInstruction* hlo) override; + + // Performs code motion for windowed dot-general loops in + // windowed_dot_general_loops_. Invoked after the visitor finishes traversing + // the graph. + Status DoCodeMotionForWindowedDotGeneralLoops(HloComputation* computation); + + bool changed_; + HloModule* module_; + int64 num_partitions_; + int64 num_replicas_; + + SPMDCollectiveOpsCreator collective_ops_creator_; + + // Tracks the next channel id to use for cross-partition all-reduce. + int64* next_channel_id_; + SpmdBuilder b_; + + HloInstruction* partition_id_; + + PartitionedHlo::ReshardCache reshard_cache_; + + // Mapping from the instruction in the original computation to the new SPMD + // partitioned instruction. + ConstHloInstructionMap partitioned_instructions_; + + // Information about a loop created for windowed dot-general. Used when + // DoCodeMotionForWindowedDotGeneralLoops() executes after the visitor + // finishes traversing the graph. 
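+  // (Such loops are only generated for einsums whose operand size reaches
+  // SpmdPartitionerOptions::threshold_for_windowed_einsum_mib.)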
+ struct WindowedDotGeneralLoop { + HloInstruction* while_loop; + int64 windowed_operand; + bool windowed_in_contracting_dims; + bool windowed_in_batch_dims; + }; + std::vector windowed_dot_general_loops_; + + HloInstruction* visiting_hlo_; + SpmdLogger* logger_; + const SpmdPartitionerOptions options_; + SpmdPartitioner* partitioner_; +}; + +} // namespace spmd +} // namespace xla +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_SPMD_SPMD_PARTITIONER_H_ diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_test.cc b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_test.cc new file mode 100644 index 00000000000..ca1afc816b0 --- /dev/null +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_test.cc @@ -0,0 +1,3215 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/spmd/spmd_partitioner.h" + +#include "tensorflow/compiler/xla/service/hlo_matchers.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" +#include "tensorflow/compiler/xla/service/hlo_pass_pipeline.h" +#include "tensorflow/compiler/xla/service/hlo_verifier.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/lib/core/status_test_util.h" + +namespace xla { +namespace spmd { +namespace { + +using ::testing::_; +using ::testing::AllOf; +namespace op = xla::testing::opcode_matchers; + +class SpmdPartitioningTest : public HloTestBase { + public: + StatusOr> PartitionComputation( + const char* hlo_module, int64 num_devices, + bool conv_halo_exchange_always_on_lhs = true) { + // Some tests (BackpropFilter convs) set this flag false to test two + // different paths of the implementation. 
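+    // The pipeline below sandwiches the partitioner between two HLO verifier
+    // runs: verify the input HLO, run the SPMD partitioner with `num_devices`
+    // partitions and a single replica, then verify the partitioned output.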
+ SpmdPartitionerOptions options; + options.conv_halo_exchange_always_on_lhs = conv_halo_exchange_always_on_lhs; + options.allow_module_signature_change = true; + + TF_ASSIGN_OR_RETURN(auto module, ParseAndReturnVerifiedModule( + hlo_module, GetModuleConfigForTest())); + HloPassPipeline pass("spmd-partitioning"); + pass.AddPass(/*layout_sensitive=*/false, + /*allow_mixed_precision=*/false); + pass.AddPass(num_devices, /*num_replicas=*/1, options); + pass.AddPass(/*layout_sensitive=*/false, + /*allow_mixed_precision=*/false); + TF_RETURN_IF_ERROR(pass.Run(module.get()).status()); + return StatusOr>(std::move(module)); + } +}; + +TEST_F(SpmdPartitioningTest, InvalidSharding) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + token0 = token[] after-all(), sharding={maximal device=0} + infeed = (f32[8,2]{1,0}, token[]) infeed(token0), + sharding={{devices=[2,1]0,1}, {maximal device=0}} + ROOT infeed.data = f32[8,2]{1,0} get-tuple-element(infeed), index=0, + sharding={maximal device=0} +})"; + auto module_status = PartitionComputation(hlo_string, /*num_devices=*/4); + EXPECT_FALSE(module_status.status().ok()); + EXPECT_THAT(module_status.status().ToString(), + ::testing::HasSubstr( + "only supports tile sharding that includes all partitions")); +} + +TEST_F(SpmdPartitioningTest, SingleDeviceToReplicated) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %constant = s32[2,3]{1,0} constant({{1,1,1},{1,1,1}}), + sharding={maximal device=0} + ROOT %copy = s32[2,3]{1,0} copy(%constant), sharding={replicated} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Copy(op::AllReduce( + op::Select(op::Broadcast(op::Compare()), + op::Constant(), op::Broadcast()))), + op::Shape("s32[2,3]"))); +} + +TEST_F(SpmdPartitioningTest, SingleDeviceToSingleDevice) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %constant = s32[2,3]{1,0} constant({{1,1,1},{1,1,1}}), + sharding={maximal device=0} + ROOT %copy = s32[2,3]{1,0} copy(%constant), sharding={maximal device=1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + HloInstruction* root = module->entry_computation()->root_instruction(); + VLOG(1) << module->ToString(); + EXPECT_THAT(root, op::Copy(AllOf(op::Copy(op::AllReduce(op::Select( + op::Broadcast(op::Compare()), + op::Constant(), op::Broadcast()))), + op::Shape("s32[2,3]")))); +} + +TEST_F(SpmdPartitioningTest, SingleDeviceToTiled) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %constant = s32[2,3]{1,0} constant({{1,1,1},{1,1,1}}), + sharding={maximal device=0} + ROOT %copy = s32[2,3]{1,0} copy(%constant), + sharding={devices=[2,1]1,0} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT( + root, + AllOf( + op::Copy(op::DynamicSlice( + op::AllReduce(op::Select( + op::Broadcast(op::Compare(op::PartitionId(), op::Constant())), + op::Constant(), op::Broadcast())), + op::Reshape(op::DynamicSlice(op::Constant(), op::PartitionId(), + op::Constant())), + op::Constant())), + op::Shape("s32[1,3]"))); +} + +TEST_F(SpmdPartitioningTest, TiledToReplicated) { + const char* const hlo_string = R"( +HloModule module + 
+ENTRY entry { + %constant = s32[2,3]{1,0} constant({{1,1,1},{1,1,1}}), + sharding={devices=[2,1]0,1} + ROOT %copy = s32[2,3]{1,0} copy(%constant), sharding={replicated} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT( + root, + op::Copy(op::AllReduce(AllOf( + op::DynamicUpdateSlice( + op::Broadcast(), AllOf(op::Constant(), op::Shape("s32[1,3]")), + op::Reshape(op::DynamicSlice(op::Constant(), op::PartitionId(), + op::Constant())), + op::Constant()), + op::Shape("s32[2,3]"))))); +} + +TEST_F(SpmdPartitioningTest, TiledToSingleDevice) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %constant = s32[2,3]{1,0} constant({{1,1,1},{1,1,1}}), + sharding={devices=[2,1]0,1} + ROOT %copy = s32[2,3]{1,0} copy(%constant), sharding={maximal device=0} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT( + root, + op::Copy(op::Copy(op::AllReduce(AllOf( + op::DynamicUpdateSlice( + op::Broadcast(), AllOf(op::Constant(), op::Shape("s32[1,3]")), + op::Reshape(op::DynamicSlice(op::Constant(), op::PartitionId(), + op::Constant())), + op::Constant()), + op::Shape("s32[2,3]")))))); +} + +TEST_F(SpmdPartitioningTest, TiledToTiledEven) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param= s32[8,2]{1,0} parameter(0), sharding={devices=[2,1]0,1} + ROOT %copy = s32[8,2]{1,0} copy(%param), sharding={devices=[1,2]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT( + root, + AllOf(op::Copy(op::Reshape(op::Transpose(op::AllToAll(AllOf( + op::Reshape(op::Parameter()), op::Shape("s32[4,2,1]")))))), + op::Shape("s32[8,1]"))); +} + +TEST_F(SpmdPartitioningTest, TiledToTiledUneven) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param= f32[7,31,128]{2,1,0} parameter(0), sharding={devices=[1,2,1]0,1} + ROOT %copy = f32[7,31,128]{2,1,0} copy(%param), sharding={devices=[2,1,1]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT( + root, + AllOf(op::Copy(op::Slice(op::Reshape(AllOf(op::Transpose(op::AllToAll( + op::Reshape(AllOf(op::Pad(), op::Shape("f32[8,16,128]"))))))))))); +} + +TEST_F(SpmdPartitioningTest, GetTupleElementSwapDevice) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param.0 = (f32[2,3]{1,0}, u32[]) parameter(0), + sharding={{maximal device=1}, {maximal device=1}} + %gte.0 = f32[2,3]{1,0} get-tuple-element(%param.0), index=0, + sharding={maximal device=0} + %gte.1 = u32[] get-tuple-element(%param.0), index=1, + sharding={maximal device=0} + ROOT %tuple = (f32[2,3]{1,0}, u32[]) tuple(%gte.0, %gte.1), + sharding={{maximal device=0},{maximal device=0}} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + ASSERT_THAT(root, op::Tuple()); + + EXPECT_THAT(root->operand(0), + op::Copy(op::AllReduce(op::Select( + 
op::Broadcast(op::Compare(op::PartitionId(), op::Constant())), + op::GetTupleElement(op::Parameter()), op::Broadcast())))); + EXPECT_THAT(root->operand(1), + op::Copy(op::AllReduce(op::Select( + op::Broadcast(op::Compare(op::PartitionId(), op::Constant())), + op::GetTupleElement(op::Parameter()), op::Broadcast())))); +} + +TEST_F(SpmdPartitioningTest, GetTupleElementTiled) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + param.0 = (f32[2,3]{1,0}, u32[2,3]{1,0}) parameter(0), + sharding={{replicated}, {replicated}} + gte.0 = f32[2,3]{1,0} get-tuple-element(param.0), index=0, + sharding={devices=[2,1]0,1} + gte.1 = u32[2,3]{1,0} get-tuple-element(param.0), index=1, + sharding={devices=[2,1]0,1} + ROOT %tuple = (f32[2,3]{1,0}, u32[2,3]{1,0}) tuple(gte.0, gte.1), + sharding={{devices=[2,1]0,1},{devices=[2,1]0,1}} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + ASSERT_THAT(root, op::Tuple()); + + auto offset = op::Reshape( + op::DynamicSlice(op::Constant(), op::PartitionId(), op::Constant())); + + EXPECT_THAT(root->operand(0), + op::DynamicSlice(op::GetTupleElement(op::Parameter()), offset, + op::Constant())); + EXPECT_THAT(root->operand(1), + op::DynamicSlice(op::GetTupleElement(op::Parameter()), offset, + op::Constant())); +} + +TEST_F(SpmdPartitioningTest, TiledInfeed) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + token0 = token[] after-all(), sharding={maximal device=0} + infeed = (f32[8,2]{1,0}, token[]) infeed(token0), + sharding={{devices=[2,1]0,1}, {maximal device=0}} + ROOT infeed.data = f32[8,2]{1,0} get-tuple-element(infeed), index=0, + sharding={maximal device=0} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT( + root, op::Copy(op::AllReduce(op::DynamicUpdateSlice( + op::Broadcast(), + op::GetTupleElement( + AllOf(op::Infeed(), op::Shape("(f32[4,2]{1,0}, token[])"))), + op::Reshape(op::DynamicSlice(op::Constant(), op::PartitionId(), + op::Constant())), + op::Constant())))); +} + +TEST_F(SpmdPartitioningTest, UnevenTiledInfeed) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + token0 = token[] after-all(), sharding={maximal device=0} + infeed = (f32[9,2]{1,0}, token[]) infeed(token0), + sharding={{devices=[2,1]0,1}, {maximal device=0}} + ROOT infeed.data = f32[9,2]{1,0} get-tuple-element(infeed), index=0, + sharding={devices=[2,1]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT( + root, AllOf(op::Shape("f32[5,2]"), op::GetTupleElement(op::Conditional( + op::Convert(op::PartitionId()), + op::AfterAll(), op::AfterAll())))); + EXPECT_THAT( + root->operand(0)->called_computations()[0]->root_instruction(), + AllOf(op::Shape("(f32[5,2], token[])"), op::Infeed(op::Parameter()))); + auto second_infeed = + AllOf(op::Shape("(f32[4,2], token[])"), op::Infeed(op::Parameter())); + EXPECT_THAT(root->operand(0)->called_computations()[1]->root_instruction(), + AllOf(op::Shape("(f32[5,2], token[])"), + op::Tuple(op::Pad(op::GetTupleElement(second_infeed), + op::Constant()), + op::GetTupleElement(second_infeed)))); +} + 
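+// Like UnevenTiledInfeed above, the next test checks that an unevenly tiled
+// infeed lowers to a conditional on the partition id, where the partition that
+// receives the smaller shard pads its result up to the common per-partition
+// shape.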
+TEST_F(SpmdPartitioningTest, UnevenTiledTupleInfeed) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + token0 = token[] after-all(), sharding={maximal device=0} + infeed = ((f32[9,2]{1,0}, f32[2]{0}), token[]) infeed(token0), + sharding={{devices=[2,1]0,1}, {replicated}, {maximal device=0}} + ROOT infeed.data = (f32[9,2]{1,0}, f32[2]{0}) get-tuple-element(infeed), + index=0, sharding={{devices=[2,1]0,1}, {replicated}} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Shape("(f32[5,2], f32[2])"), + op::GetTupleElement(op::Conditional( + op::Convert(op::PartitionId()), op::AfterAll(), + op::AfterAll())))); + EXPECT_THAT(root->operand(0)->called_computations()[0]->root_instruction(), + AllOf(op::Shape("((f32[5,2], f32[2]), token[])"), + op::Infeed(op::Parameter()))); + auto second_infeed = AllOf(op::Shape("((f32[4,2], f32[2]), token[])"), + op::Infeed(op::Parameter())); + EXPECT_THAT( + root->operand(0)->called_computations()[1]->root_instruction(), + AllOf(op::Shape("((f32[5,2], f32[2]), token[])"), + op::Tuple(op::Tuple(op::Pad(op::GetTupleElement( + op::GetTupleElement(second_infeed)), + op::Constant()), + op::GetTupleElement( + op::GetTupleElement(second_infeed))), + op::GetTupleElement(second_infeed)))); +} + +TEST_F(SpmdPartitioningTest, TiledToReplicatedReduce) { + const char* const hlo_string = R"( +HloModule module + +sum { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT add = f32[] add(a, b) +} + +ENTRY entry { + constant = f32[3,3]{1,0} constant({{1,1,1},{1,1,1},{1,1,1}}), + sharding={devices=[2,1]0,1} + constant.1 = f32[] constant(0), sharding={replicated} + ROOT reduce = f32[] reduce(constant, constant.1), dimensions={0,1}, + to_apply=sum, sharding={replicated} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT( + root, + op::AllReduce(op::Reduce( + op::Select( + op::Compare(op::Add(op::Iota(), op::Broadcast(op::Reshape())), + op::Broadcast(op::Constant())), + AllOf(op::Shape("f32[2,3]{1,0}"), + op::DynamicSlice(op::Pad(op::Constant(), op::Constant()), + op::Reshape(), op::Constant())), + op::Broadcast(op::Constant())), + op::Constant()))); +} + +TEST_F(SpmdPartitioningTest, TiledElementwise) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + constant = f32[3,3]{1,0} constant({{1,1,1},{1,1,1},{1,1,1}}), + sharding={devices=[2,1]0,1} + constant.1 = f32[3,3]{1,0} constant({{2,2,2},{2,2,2},{2,2,2}}), + sharding={replicated} + multiply = f32[3,3]{1,0} multiply(constant, constant.1), + sharding={devices=[2,1]0,1} + ROOT add = f32[3,3]{1,0} add(multiply, constant.1), + sharding={devices=[2,1]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT( + root, + AllOf( + op::Shape("f32[2,3]{1,0}"), + op::Add(op::Multiply( + op::DynamicSlice(op::Pad(op::Constant(), op::Constant()), + op::Reshape(), op::Constant()), + op::DynamicSlice(op::Pad(op::Constant(), op::Constant()), + op::Reshape(), op::Constant())), + op::DynamicSlice(op::Pad(op::Constant(), op::Constant()), + op::Reshape(), op::Constant())))); +} + 
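+// Note that in TiledElementwise above each operand is either already tiled or
+// locally sliced out of a replicated constant, so the expected pattern
+// contains no cross-partition collectives at all.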
+TEST_F(SpmdPartitioningTest, TiledAllReduce) { + const char* const hlo_string = R"( +HloModule module + +sum { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT add = f32[] add(a, b) +} + +ENTRY entry { + parameter = f32[3,3]{1,0} parameter(0), sharding={devices=[2,1]0,1} + ROOT all-reduce = f32[3,3]{1,0} all-reduce(parameter), to_apply=sum, + replica_groups={}, sharding={devices=[2,1]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT( + root, AllOf(op::Shape("f32[2,3]{1,0}"), op::AllReduce(op::Parameter(0)))); +} + +TEST_F(SpmdPartitioningTest, BroadcastOnlyNewDimsSharded) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + constant = f32[4,3]{1,0} constant({{1,1,1},{1,1,1},{1,1,1},{1,1,1}}), + sharding={replicated} + ROOT broadcast = f32[3,4,3]{2,1,0} broadcast(constant), dimensions={1,2}, + sharding={devices=[2,1,1]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Shape("f32[2,4,3]{2,1,0}"), + op::Broadcast(op::Constant()))); +} + +TEST_F(SpmdPartitioningTest, BroadcastOnlyOldDimsSharded) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + constant = f32[4,3]{1,0} constant({{1,1,1},{1,1,1},{1,1,1},{1,1,1}}), + sharding={replicated} + ROOT broadcast = f32[4,4,3]{2,1,0} broadcast(constant), dimensions={1,2}, + sharding={devices=[1,2,1]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Shape("f32[4,2,3]{2,1,0}"), + op::Broadcast(op::DynamicSlice( + op::Constant(), op::Reshape(), op::Constant())))); +} + +TEST_F(SpmdPartitioningTest, BroadcastBothOldAndNewDimsSharded) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + constant = f32[4,3]{1,0} constant({{1,1,1},{1,1,1},{1,1,1},{1,1,1}}), + sharding={replicated} + ROOT broadcast = f32[4,4,3]{2,1,0} broadcast(constant), dimensions={1,2}, + sharding={devices=[2,2,1]0,1,2,3} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT( + root, + AllOf(op::Shape("f32[2,2,3]{2,1,0}"), + op::Broadcast(AllOf(op::Shape("f32[2,3]{1,0}"), + op::DynamicSlice(op::Constant(), op::Reshape(), + op::Constant()))))); +} + +TEST_F(SpmdPartitioningTest, BroadcastPropagateTiledSharding) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + constant = f32[4,3]{1,0} constant({{1,1,1},{1,4,1},{1,3,1},{1,2,1}}), + sharding={devices=[2,1]0,1} + ROOT broadcast = f32[4,4,3]{2,1,0} broadcast(constant), dimensions={1,2}, + sharding={devices=[1,2,1]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Shape("f32[4,2,3]{2,1,0}"), + op::Broadcast(op::DynamicSlice( + op::Constant(), op::Reshape(), op::Constant())))); +} + +TEST_F(SpmdPartitioningTest, OutfeedSingleDevice) { + 
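+  // A maximal-device outfeed should lower to a conditional on the partition
+  // id: only the partition that owns the data (device 0 here) performs the
+  // outfeed, while the other branch just produces an after-all token.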
const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + token.0 = token[] after-all() + data = f32[1024]{0} parameter(0), sharding={maximal device=0} + outfeed = token[] outfeed(data, token.0), sharding={maximal device=0} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Shape("token[]"), + op::Conditional( + op::Compare(op::PartitionId(), op::Constant()), + op::Tuple(op::Parameter(0), op::AfterAll()), + op::Tuple(op::Parameter(0), op::AfterAll())))); + + HloInstruction* root_b0 = root->branch_computation(0)->root_instruction(); + EXPECT_THAT(root_b0, + AllOf(op::Shape("token[]"), + op::Outfeed(op::GetTupleElement(op::Parameter(), 0), + op::GetTupleElement(op::Parameter(), 1)))); + + HloInstruction* root_b1 = root->branch_computation(1)->root_instruction(); + EXPECT_THAT(root_b1, AllOf(op::Shape("token[]"), op::AfterAll())); +} + +TEST_F(SpmdPartitioningTest, ReduceWindowReplicatedInput) { + const char* const hlo_string = R"( +HloModule module + +sum { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT add = f32[] add(a, b) +} + +ENTRY entry { + constant = f32[6,2]{1,0} constant({{1,1},{1,4},{2,1},{3,1},{1,2},{2,2}}), + sharding={replicated} + constant.1 = f32[] constant(0), sharding={replicated} + ROOT reduce-window = f32[3,2]{1,0} reduce-window(constant, constant.1), + window={size=3x1 stride=2x1 pad=1_0x0_0}, to_apply=sum, + sharding={devices=[2,1]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT( + root, + AllOf(op::Shape("f32[2,2]{1,0}"), + op::ReduceWindow( + op::DynamicSlice(AllOf(op::Shape("f32[9,2]{1,0}"), + op::Pad(op::Constant(), op::Constant())), + op::Multiply(op::Reshape(), op::Constant()), + op::Constant()), + op::Constant()))); +} + +TEST_F(SpmdPartitioningTest, ReduceWindowTiledNegativeLeftHalo) { + const char* const hlo_string = R"( +HloModule module + +sum { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT add = f32[] add(a, b) +} + +ENTRY entry { + constant = f32[6,2]{1,0} constant({{1,1},{1,4},{2,1},{3,1},{1,2},{2,2}}), + sharding={devices=[2,1]0,1} + constant.1 = f32[] constant(0), sharding={replicated} + ROOT %reduce-window = f32[3,2]{1,0} reduce-window(%constant, %constant.1), + window={size=3x1 stride=2x1 pad=0_1x0_0}, to_apply=sum, + sharding={devices=[2,1]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + + auto sharded_input = + op::DynamicSlice(op::Constant(), op::Reshape(), op::Constant()); + auto right_halo = AllOf(op::Shape("f32[2,2]{1,0}"), + op::CollectivePermute(op::Slice(sharded_input))); + auto pre_masking = op::DynamicSlice( + AllOf( + op::Shape("f32[6,2]{1,0}"), + op::Pad(op::Concatenate(sharded_input, right_halo), op::Constant())), + op::Reshape(), op::Constant()); + auto index_in_padded = op::Add( + op::Iota(), op::Broadcast(op::Multiply(op::Reshape(), op::Constant()))); + auto masked = + op::Select(op::Compare(index_in_padded, op::Broadcast(op::Constant())), + pre_masking, op::Broadcast(op::Constant())); + EXPECT_THAT(root, AllOf(op::Shape("f32[2,2]{1,0}"), + op::ReduceWindow(masked, 
op::Constant()))); +} + +TEST_F(SpmdPartitioningTest, ReduceWindowTiledOneSideUnequalHalo) { + const char* const hlo_string = R"( +HloModule module + +sum { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT add = f32[] add(a, b) +} + +ENTRY entry { + constant = f32[9,2]{1,0} constant( + {{1,1},{1,4},{2,1},{3,1},{1,2},{2,2},{4,1},{1,2},{2,1}}), + sharding={devices=[3,1]0,1,2} + constant.1 = f32[] constant(0), sharding={replicated} + ROOT reduce-window = f32[5,2]{1,0} reduce-window(constant, constant.1), + window={size=3x1 stride=2x1 pad=1_1x0_0}, to_apply=sum, + sharding={devices=[3,1]0,1,2} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/3)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + + auto sharded_input = + op::DynamicSlice(op::Constant(), op::Reshape(), op::Constant()); + auto right_halo = AllOf(op::Shape("f32[2,2]{1,0}"), + op::CollectivePermute(op::Slice(sharded_input))); + auto pre_masking = op::DynamicSlice( + AllOf( + op::Shape("f32[7,2]{1,0}"), + op::Pad(op::Concatenate(sharded_input, right_halo), op::Constant())), + op::Reshape(), op::Constant()); + auto index_in_padded = op::Add( + op::Iota(), op::Broadcast(op::Multiply(op::Reshape(), op::Constant()))); + auto masked = op::Select( + op::And(op::Compare(index_in_padded, op::Broadcast(op::Constant())), + op::Compare(index_in_padded, op::Broadcast(op::Constant()))), + pre_masking, op::Broadcast(op::Constant())); + EXPECT_THAT(root, AllOf(op::Shape("f32[2,2]{1,0}"), + op::ReduceWindow(masked, op::Constant()))); +} + +TEST_F(SpmdPartitioningTest, ReduceWindowTiledTwoSideHalo) { + const char* const hlo_string = R"( +HloModule module + +sum { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT add = f32[] add(a, b) +} + +ENTRY entry { + constant = f32[4,2]{1,0} constant({{1,1},{1,4},{2,1},{3,1}}), + sharding={devices=[2,1]0,1} + constant.1 = f32[] constant(0), sharding={replicated} + ROOT reduce-window = f32[2,2]{1,0} reduce-window(constant, constant.1), + window={size=5x1 stride=3x1 pad=2_2x0_0}, to_apply=sum, + sharding={devices=[2,1]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + + auto sharded_input = + op::DynamicSlice(op::Constant(), op::Reshape(), op::Constant()); + auto left_halo = AllOf(op::Shape("f32[1,2]{1,0}"), + op::CollectivePermute(op::Slice(sharded_input))); + auto right_halo = AllOf(op::Shape("f32[1,2]{1,0}"), + op::CollectivePermute(op::Slice(sharded_input))); + auto pre_masking = AllOf( + op::Shape("f32[5,2]{1,0}"), + op::DynamicSlice( + AllOf(op::Shape("f32[6,2]{1,0}"), + op::Pad(op::Concatenate(left_halo, sharded_input, right_halo), + op::Constant())), + op::Reshape(), op::Constant())); + auto index_in_padded = op::Add( + op::Iota(), op::Broadcast(op::Multiply(op::Reshape(), op::Constant()))); + auto masked = op::Select( + op::And(op::Compare(index_in_padded, op::Broadcast(op::Constant())), + op::Compare(index_in_padded, op::Broadcast(op::Constant()))), + pre_masking, op::Broadcast(op::Constant())); + EXPECT_THAT(root, AllOf(op::Shape("f32[1,2]{1,0}"), + op::ReduceWindow(masked, op::Constant()))); +} + +TEST_F(SpmdPartitioningTest, ReduceWindowTiled2D) { + const char* const hlo_string = R"( +HloModule module + +sum { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT add = f32[] add(a, b) +} + +ENTRY entry { + token0 = 
token[] after-all(), sharding={maximal device=0} + infeed = (f32[4,4,2,2]{3,2,1,0}, token[]) infeed(token0), + sharding={{devices=[2,2,1,1]0,1,2,3}, {maximal device=0}} + infeed.data = f32[4,4,2,2]{3,2,1,0} get-tuple-element(infeed), index=0, + sharding={devices=[2,2,1,1]0,1,2,3} + constant = f32[] constant(0), sharding={replicated} + ROOT reduce-window = f32[2,2,2,2]{3,2,1,0} reduce-window(infeed.data, constant), + window={size=5x5x1x1 stride=3x3x1x1 pad=2_2x2_2x0_0x0_0}, to_apply=sum, + sharding={devices=[2,2,1,1]0,1,2,3} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + + auto sharded_input = AllOf(op::Shape("f32[2,2,2,2]{3,2,1,0}"), + op::GetTupleElement(op::Infeed())); + auto dim0_left_halo = AllOf(op::Shape("f32[1,2,2,2]{3,2,1,0}"), + op::CollectivePermute(op::Slice(sharded_input))); + auto dim0_right_halo = AllOf(op::Shape("f32[1,2,2,2]{3,2,1,0}"), + op::CollectivePermute(op::Slice(sharded_input))); + auto dim0_pre_masking = op::DynamicSlice( + AllOf(op::Shape("f32[6,2,2,2]{3,2,1,0}"), + op::Pad( + op::Concatenate(dim0_left_halo, sharded_input, dim0_right_halo), + op::Constant())), + op::Reshape(), op::Constant(), op::Constant(), op::Constant()); + auto dim0_index_in_padded = op::Add( + op::Iota(), op::Broadcast(op::Multiply(op::Reshape(), op::Constant()))); + auto dim0_masked = op::Select( + op::And(op::Compare(dim0_index_in_padded, op::Broadcast(op::Constant())), + op::Compare(dim0_index_in_padded, op::Broadcast(op::Constant()))), + dim0_pre_masking, op::Broadcast(op::Constant())); + auto dim0_resharded = AllOf(op::Shape("f32[5,2,2,2]{3,2,1,0}"), dim0_masked); + auto dim1_left_halo = AllOf(op::Shape("f32[5,1,2,2]{3,2,1,0}"), + op::CollectivePermute(op::Slice(dim0_resharded))); + auto dim1_right_halo = + AllOf(op::Shape("f32[5,1,2,2]{3,2,1,0}"), + op::CollectivePermute(op::Slice(dim0_resharded))); + auto dim1_pre_masking = op::DynamicSlice( + AllOf(op::Shape("f32[5,6,2,2]{3,2,1,0}"), + op::Pad(op::Concatenate(dim1_left_halo, dim0_resharded, + dim1_right_halo), + op::Constant())), + op::Constant(), op::Reshape(), op::Constant(), op::Constant()); + auto dim1_index_in_padded = op::Add( + op::Iota(), op::Broadcast(op::Multiply(op::Reshape(), op::Constant()))); + auto dim1_masked = op::Select( + op::And(op::Compare(dim1_index_in_padded, op::Broadcast(op::Constant())), + op::Compare(dim1_index_in_padded, op::Broadcast(op::Constant()))), + dim1_pre_masking, op::Broadcast(op::Constant())); + auto dim1_resharded = AllOf(op::Shape("f32[5,5,2,2]{3,2,1,0}"), dim1_masked); + EXPECT_THAT(root, AllOf(op::Shape("f32[1,1,2,2]{3,2,1,0}"), + op::ReduceWindow(dim1_resharded, op::Constant()))); +} + +TEST_F(SpmdPartitioningTest, ConvolutionLhsTiledRhsReplicated) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[128,224,224,3] parameter(0) + %lhs.copy = f32[128,224,224,3] copy(f32[128,224,224,3] %lhs), + sharding={devices=[1,2,1,1]0,1} + %rhs = f32[7,7,3,64] parameter(1) + %rhs.copy = f32[7,7,3,64] copy(f32[7,7,3,64] %rhs), + sharding={replicated} + ROOT %conv = f32[128,112,112,64] convolution( + f32[128,224,224,3] %lhs.copy, + f32[7,7,3,64] %rhs.copy), + window={size=7x7 stride=2x2 pad=3_3x3_3}, + dim_labels=b01f_01io->b01f, + sharding={devices=[1,2,1,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = 
module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[128,112,224,3]")); + auto rhs = AllOf(op::Copy(op::Parameter()), op::Shape("f32[7,7,3,64]")); + + auto left_halo = AllOf(op::CollectivePermute(op::Slice(lhs)), + op::Shape("f32[128,3,224,3]")); + auto right_halo = AllOf(op::CollectivePermute(op::Slice(lhs)), + op::Shape("f32[128,2,224,3]")); + EXPECT_THAT(root, + AllOf(op::Convolution( + op::Select(op::And(), + op::Concatenate(left_halo, lhs, right_halo), + op::Broadcast()), + rhs), + op::Shape("f32[128,56,112,64]"))); +} + +TEST_F(SpmdPartitioningTest, ConvolutionLhsTiledRhsReplicatedNeedReshard) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[128,224,224,3] parameter(0) + %lhs.copy = f32[128,224,224,3] copy(f32[128,224,224,3] %lhs), + sharding={devices=[2,1,1,1]0,1} + %rhs = f32[7,7,3,64] parameter(1) + %rhs.copy = f32[7,7,3,64] copy(f32[7,7,3,64] %rhs), + sharding={replicated} + ROOT %conv = f32[128,112,112,64] convolution( + f32[128,224,224,3] %lhs.copy, + f32[7,7,3,64] %rhs.copy), + window={size=7x7 stride=2x2 pad=3_3x3_3}, + dim_labels=b01f_01io->b01f, + sharding={devices=[1,2,1,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Reshape(), op::Constant(), + op::Constant(), op::Constant())), + op::Shape("f32[64,224,224,3]")); + auto all_to_all = + AllOf(op::AllToAll(op::Reshape(lhs)), op::Shape("f32[64,2,112,224,3]")); + auto reshard_lhs = AllOf(op::Reshape(op::Transpose(all_to_all)), + op::Shape("f32[128,112,224,3]")); + + auto rhs = AllOf(op::Copy(op::Parameter()), op::Shape("f32[7,7,3,64]")); + + auto left_halo = AllOf(op::CollectivePermute(op::Slice(reshard_lhs)), + op::Shape("f32[128,3,224,3]")); + auto right_halo = AllOf(op::CollectivePermute(op::Slice(reshard_lhs)), + op::Shape("f32[128,2,224,3]")); + EXPECT_THAT( + root, + AllOf(op::Convolution( + op::Select(op::And(), + op::Concatenate(left_halo, reshard_lhs, right_halo), + op::Broadcast()), + rhs), + op::Shape("f32[128,56,112,64]"))); +} + +TEST_F(SpmdPartitioningTest, ConvolutionLhsTiledRhsReplicatedReordered) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[224,224,3,128] parameter(0) + %lhs.copy = f32[224,224,3,128] copy(%lhs), sharding={devices=[2,1,1,1]0,1} + %rhs = f32[7,7,3,64] parameter(1) + %rhs.copy = f32[7,7,3,64] copy(%rhs), sharding={replicated} + ROOT %conv = f32[128,112,112,64] convolution(%lhs.copy, %rhs.copy), + window={size=7x7 stride=2x2 pad=3_3x3_3}, + dim_labels=01fb_01io->b01f, + sharding={devices=[1,2,1,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Reshape(), op::Constant(), + op::Constant(), op::Constant())), + op::Shape("f32[112,224,3,128]")); + auto rhs = AllOf(op::Copy(op::Parameter()), op::Shape("f32[7,7,3,64]")); + + auto left_halo = AllOf(op::CollectivePermute(op::Slice(lhs)), + op::Shape("f32[3,224,3,128]")); + auto right_halo = AllOf(op::CollectivePermute(op::Slice(lhs)), + op::Shape("f32[2,224,3,128]")); + EXPECT_THAT(root, 
+ AllOf(op::Convolution( + op::Select(op::And(), + op::Concatenate(left_halo, lhs, right_halo), + op::Broadcast()), + rhs), + op::Shape("f32[128,56,112,64]"))); +} + +// (stride * per_shard_window_count) % dilation == 0 +TEST_F(SpmdPartitioningTest, + ConvolutionBaseDilationSameStartPatternLhsTiledRhsReplicated) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[128,7,7,512] parameter(0) + %lhs.copy = f32[128,7,7,512] copy(%lhs), + sharding={devices=[1,2,1,1]0,1} + %rhs = f32[3,3,512,512] parameter(1) + %rhs.copy = f32[3,3,512,512] copy(%rhs), + sharding={replicated} + ROOT %conv = f32[128,4,4,512] convolution(%lhs.copy, %rhs.copy), + window={size=3x3 stride=4x4 pad=1_1x1_1 lhs_dilate=2x2 rhs_reversal=1x1}, + dim_labels=b01f_01io->b01f, + sharding={devices=[1,2,1,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + // There is no halo exchange, and because the last element in the shard is not + // needed (stride == 4), the LHS will be just a slice. + auto sliced_lhs = + AllOf(op::Slice(op::Copy(op::DynamicSlice( + op::Pad(op::Parameter(), op::Constant()), op::Constant(), + op::Reshape(), op::Constant(), op::Constant()))), + op::Shape("f32[128,3,7,512]")); + auto rhs = AllOf(op::Copy(op::Parameter()), op::Shape("f32[3,3,512,512]")); + EXPECT_THAT(root, AllOf(op::Convolution(sliced_lhs, rhs), + op::Shape("f32[128,2,4,512]"))); + EXPECT_EQ(root->window().dimensions(0).padding_low(), 1); + EXPECT_EQ(root->window().dimensions(0).padding_high(), 1); +} + +// (stride * per_shard_window_count) % dilation != 0 but stride == 1 +TEST_F(SpmdPartitioningTest, + ConvolutionBaseDilationStride1LhsTiledRhsReplicated) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[128,7,7,512] parameter(0) + %lhs.copy = f32[128,7,7,512] copy(%lhs), + sharding={devices=[1,2,1,1]0,1} + %rhs = f32[3,3,512,512] parameter(1) + %rhs.copy = f32[3,3,512,512] copy(%rhs), + sharding={replicated} + ROOT %conv = f32[128,14,14,512] convolution(%lhs.copy, %rhs.copy), + window={size=3x3 pad=1_2x1_2 lhs_dilate=2x2 rhs_reversal=1x1}, + dim_labels=b01f_01io->b01f, + sharding={devices=[1,2,1,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf(op::Copy(op::DynamicSlice( + op::Pad(op::Parameter(), op::Constant()), op::Constant(), + op::Reshape(), op::Constant(), op::Constant())), + op::Shape("f32[128,4,7,512]")); + auto rhs = AllOf(op::Copy(op::Parameter()), op::Shape("f32[3,3,512,512]")); + + auto left_halo = AllOf(op::CollectivePermute(op::Slice(lhs)), + op::Shape("f32[128,1,7,512]")); + auto start_window = op::Multiply(op::Reshape(), op::Constant()); + auto start_input_element = op::Divide(start_window, op::Constant()); + auto dynamic_offset_for_padded_concat = op::Subtract( + op::Constant(), op::Subtract(op::Multiply(op::Reshape(), op::Constant()), + start_input_element)); + auto pre_masking = + AllOf(op::Shape("f32[128,5,7,512]"), + op::DynamicSlice( + AllOf(op::Shape("f32[128,6,7,512]"), + op::Pad(op::Concatenate(left_halo, lhs), op::Constant())), + op::Constant(), dynamic_offset_for_padded_concat, + op::Constant(), op::Constant())); + auto masked = op::Select( + op::Compare(op::Add(op::Iota(), op::Broadcast(start_input_element)), + 
op::Broadcast(op::Constant())), + pre_masking, op::Broadcast(op::Constant())); + auto dynamic_offset_on_output = op::Subtract( + start_window, op::Multiply(start_input_element, op::Constant())); + EXPECT_THAT(root, + AllOf(op::DynamicSlice(AllOf(op::Convolution(masked, rhs), + op::Shape("f32[128,8,14,512]")), + op::Constant(), dynamic_offset_on_output, + op::Constant(), op::Constant()), + op::Shape("f32[128,7,14,512]"))); + EXPECT_EQ(root->operand(0)->window().dimensions(0).padding_low(), 1); + EXPECT_EQ(root->operand(0)->window().dimensions(0).padding_high(), 0); +} + +TEST_F(SpmdPartitioningTest, SelectAndScatterNoOverlap) { + const char* const hlo_string = R"( +HloModule module + +ge { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT compare = pred[] compare(a, b), direction=GE +} + +sum { + c = f32[] parameter(0) + d = f32[] parameter(1) + ROOT add = f32[] add(c, d) +} + +ENTRY entry { + %param = f32[11,4]{1,0} parameter(0) + %param.copy = f32[11,4] copy(%param), + sharding={devices=[4,1]0,1,2,3} + constant = f32[4,2]{1,0} constant({{1,2},{3,4},{1,0},{2,8}}), + sharding={devices=[4,1]0,1,2,3} + constant.1 = f32[] constant(0), sharding={replicated} + ROOT select-and-scatter = f32[11,4]{1,0} select-and-scatter(param.copy, + constant, constant.1), window={size=3x2 stride=3x2 pad=0_1x0_0}, + select=ge, scatter=sum, sharding={devices=[4,1]0,1,2,3} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + auto root = module->entry_computation()->root_instruction(); + auto source = + AllOf(op::Shape("f32[1,2]{1,0}"), + op::DynamicSlice(op::Constant(), op::Reshape(), op::Constant())); + auto masked_data = AllOf( + op::Shape("f32[3,4]{1,0}"), + op::Select( + op::Compare(op::Add(op::Iota(), op::Broadcast(op::Multiply( + op::Reshape(), op::Constant()))), + op::Broadcast(op::Constant())), + op::Copy(op::DynamicSlice(op::Pad(op::Parameter(), op::Constant()), + op::Reshape(), op::Constant())), + op::Broadcast(op::Constant()))); + + EXPECT_THAT(root, + AllOf(op::SelectAndScatter(masked_data, source, op::Constant()), + op::Shape("f32[3,4]{1,0}"))); + EXPECT_EQ(root->window().dimensions(0).padding_low(), 0); + EXPECT_EQ(root->window().dimensions(0).padding_high(), 0); +} + +TEST_F(SpmdPartitioningTest, SelectAndScatterNoOverlapReshard) { + const char* const hlo_string = R"( +HloModule module + +ge { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT compare = pred[] compare(a, b), direction=GE +} + +sum { + c = f32[] parameter(0) + d = f32[] parameter(1) + ROOT add = f32[] add(c, d) +} + +ENTRY entry { + %param = f32[11,4]{1,0} parameter(0) + %param.copy = f32[11,4] copy(%param), + sharding={devices=[1,4]0,1,2,3} + constant = f32[4,2]{1,0} constant({{1,2},{3,4},{1,0},{2,8}}), + sharding={devices=[4,1]0,1,2,3} + constant.1 = f32[] constant(0), sharding={replicated} + ROOT select-and-scatter = f32[11,4]{1,0} select-and-scatter(param.copy, + constant, constant.1), window={size=3x2 stride=3x2 pad=0_1x0_0}, + select=ge, scatter=sum, sharding={devices=[4,1]0,1,2,3} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + auto root = module->entry_computation()->root_instruction(); + auto source = + AllOf(op::Shape("f32[1,2]{1,0}"), + op::DynamicSlice(op::Constant(), op::Reshape(), op::Constant())); + auto operand = AllOf(op::Copy(op::DynamicSlice( + op::Parameter(0), op::Constant(), op::Reshape())), + op::Shape("f32[11,1]")); + auto 
reshard_operand = op::Reshape(op::Transpose( + op::AllToAll(op::Reshape(op::Pad(operand, op::Constant()))))); + auto masked_data = AllOf( + op::Shape("f32[3,4]{1,0}"), + op::Select( + op::Compare(op::Add(op::Iota(), op::Broadcast(op::Multiply( + op::Reshape(), op::Constant()))), + op::Broadcast(op::Constant())), + reshard_operand, op::Broadcast(op::Constant()))); + + EXPECT_THAT(root, + AllOf(op::SelectAndScatter(masked_data, source, op::Constant()), + op::Shape("f32[3,4]{1,0}"))); + EXPECT_EQ(root->window().dimensions(0).padding_low(), 0); + EXPECT_EQ(root->window().dimensions(0).padding_high(), 0); +} + +TEST_F(SpmdPartitioningTest, SelectAndScatterWithOverlap) { + const char* const hlo_string = R"( +HloModule module + +ge { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT compare = pred[] compare(a, b), direction=GE +} + +sum { + c = f32[] parameter(0) + d = f32[] parameter(1) + ROOT add = f32[] add(c, d) +} + +ENTRY entry { + %param = f32[11,4]{1,0} parameter(0) + %param.copy = f32[11,4] copy(%param), + sharding={devices=[4,1]0,1,2,3} + constant = f32[6,2]{1,0} constant({{1,2},{3,4},{1,0},{2,8},{6,6},{1,9}}), + sharding={devices=[4,1]0,1,2,3} + constant.1 = f32[] constant(0), sharding={replicated} + ROOT select-and-scatter = f32[11,4]{1,0} select-and-scatter(param.copy, + constant, constant.1), window={size=3x2 stride=2x2 pad=1_1x0_0}, + select=ge, scatter=sum, sharding={devices=[4,1]0,1,2,3} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + auto root = module->entry_computation()->root_instruction(); + + auto source_shard = + AllOf(op::Shape("f32[2,2]{1,0}"), + op::DynamicSlice(op::Pad(), op::Reshape(), op::Constant())); + // Max halo size is the same as the shard size, so slice is not needed. 
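+  // The remaining matchers mirror the rest of the source pipeline checked
+  // below: the halo is concatenated in front of the local shard, padded,
+  // dynamic-sliced to the dynamically computed window start, and finally
+  // masked against a broadcast constant (the init value) for rows that fall
+  // outside the valid source range.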
+ auto source_left_halo = op::CollectivePermute(source_shard); + auto required_source_shard_start = + op::Divide(op::Multiply(op::Reshape(), op::Constant()), op::Constant()); + auto source_with_halo = op::DynamicSlice( + AllOf(op::Shape("f32[5,2]{1,0}"), + op::Pad(op::Concatenate(source_left_halo, source_shard), + op::Constant())), + op::Subtract(op::Constant(), + op::Subtract(op::Multiply(op::Reshape(), op::Constant()), + required_source_shard_start)), + op::Constant()); + auto masked_source_with_halo = AllOf( + AllOf(op::Shape("f32[3,2]{1,0}")), + op::Select( + op::Compare( + op::Add(op::Iota(), op::Broadcast(required_source_shard_start)), + op::Broadcast(op::Constant())), + source_with_halo, op::Broadcast(op::Constant()))); + + auto data_shard = + AllOf(op::Shape("f32[3,4]{1,0}"), + op::Copy(op::DynamicSlice(op::Pad(op::Parameter(), op::Constant()), + op::Reshape(), op::Constant()))); + auto data_left_halo = AllOf(op::Shape("f32[2,4]{1,0}"), + op::CollectivePermute(op::Slice(data_shard))); + auto data_right_halo = AllOf(op::Shape("f32[2,4]{1,0}"), + op::CollectivePermute(op::Slice(data_shard))); + auto required_data_start_on_padded = + op::Multiply(required_source_shard_start, op::Constant()); + auto left_halo_size = op::Subtract( + op::Add(op::Multiply(op::Reshape(), op::Constant()), op::Constant()), + required_data_start_on_padded); + auto data_with_halo = + AllOf(op::Shape("f32[7,4]{1,0}"), + op::DynamicSlice( + AllOf(op::Shape("f32[8,4]{1,0}"), + op::Pad(op::Concatenate(data_left_halo, data_shard, + data_right_halo), + op::Constant())), + op::Subtract(op::Constant(), left_halo_size), op::Constant())); + auto index_on_padded = + op::Add(op::Iota(), op::Broadcast(required_data_start_on_padded)); + auto masked_data_with_halo = op::Select( + op::And(op::Compare(index_on_padded, op::Broadcast(op::Constant())), + op::Compare(index_on_padded, op::Broadcast(op::Constant()))), + data_with_halo, op::Broadcast(op::Constant())); + + EXPECT_THAT( + root, AllOf(op::DynamicSlice(op::SelectAndScatter(masked_data_with_halo, + masked_source_with_halo, + op::Constant()), + left_halo_size, op::Constant()), + op::Shape("f32[3,4]{1,0}"))); + EXPECT_EQ(root->operand(0)->window().dimensions(0).padding_low(), 0); + EXPECT_EQ(root->operand(0)->window().dimensions(0).padding_high(), 0); +} + +TEST_F(SpmdPartitioningTest, ConvolutionLhsTiledRhsTiled) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[128,56,56,64] parameter(0) + %lhs.copy = f32[128,56,56,64] copy(%lhs), sharding={devices=[1,2,1,1]0,1} + %rhs = f32[128,56,56,256] parameter(1) + %rhs.copy = f32[128,56,56,256] copy(%rhs), sharding={devices=[1,2,1,1]0,1} + ROOT %conv = f32[1,1,64,256] convolution(%lhs.copy, %rhs.copy), + window={size=56x56}, dim_labels=f01b_i01o->01bf, sharding={replicated} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[128,28,56,64]")); + auto rhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[128,28,56,256]")); + + EXPECT_THAT(root, AllOf(op::AllReduce(op::Convolution(lhs, rhs)), + op::Shape("f32[1,1,64,256]"))); +} + +TEST_F(SpmdPartitioningTest, DotLhsTiledRhsTiledWithReshard) { + const char* const 
hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[128,56,56,64] parameter(0) + %lhs.copy = f32[128,56,56,64] copy(%lhs), sharding={devices=[1,2,1,1]0,1} + %rhs = f32[128,56,56,256] parameter(1) + %rhs.copy = f32[128,56,56,256] copy(%rhs), sharding={devices=[2,1,1,1]0,1} + ROOT %conv = f32[1,1,64,256] convolution(%lhs.copy, %rhs.copy), + window={size=56x56}, dim_labels=f01b_i01o->01bf, sharding={replicated} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[128,28,56,64]")); + auto rhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Reshape(), op::Constant(), + op::Constant(), op::Constant())), + op::Shape("f32[64,56,56,256]")); + auto all_to_all = + AllOf(op::AllToAll(op::Reshape(lhs)), op::Shape("f32[2,64,28,56,64]")); + auto reshard = AllOf(op::Reshape(op::Transpose(all_to_all))); + + EXPECT_THAT(root, AllOf(op::AllReduce(op::Convolution(reshard, rhs)), + op::Shape("f32[1,1,64,256]"))); +} + +TEST_F(SpmdPartitioningTest, ConvolutionLhsTiledRhsTiledWithReshard) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[128,56,56,512] parameter(0) + %lhs.copy = f32[128,56,56,512] copy(%lhs), sharding={devices=[1,2,1,1]0,1} + %rhs = f32[128,28,28,64] parameter(1) + %rhs.copy = f32[128,28,28,64] copy(%rhs), sharding={devices=[2,1,1,1]0,1} + ROOT %conv = f32[1,1,512,64] convolution(%lhs.copy, %rhs.copy), + window={size=28x28 pad=0_-1x0_-1 rhs_dilate=2x2}, + dim_labels=f01b_i01o->01bf, sharding={replicated} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[128,28,56,512]")); + auto rhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Reshape(), op::Constant(), + op::Constant(), op::Constant())), + op::Shape("f32[64,28,28,64]")); + auto all_to_all = + AllOf(op::AllToAll(op::Reshape(rhs)), op::Shape("f32[64,2,14,28,64]")); + auto reshard = op::Reshape(op::Transpose(all_to_all)); + + EXPECT_THAT(root, + AllOf(op::AllReduce(op::Convolution(op::Slice(lhs), reshard)), + op::Shape("f32[1,1,512,64]"))); +} + +TEST_F(SpmdPartitioningTest, ConvolutionLhsTiledRhsTiledWithPadding) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[32,28,28,128] parameter(0) + %lhs.copy = f32[32,28,28,128] copy(%lhs), sharding={devices=[1,2,1,1]0,1} + %rhs = f32[32,28,28,64] parameter(1) + %rhs.copy = f32[32,28,28,64] copy(%rhs), sharding={devices=[1,2,1,1]0,1} + ROOT %conv = f32[3,3,128,64] convolution(%lhs.copy, %rhs.copy), + window={size=28x28 pad=1_1x1_1}, dim_labels=f01b_i01o->01bf, sharding={replicated} +})"; + + TF_ASSERT_OK_AND_ASSIGN( + auto module, + PartitionComputation(hlo_string, /*num_devices=*/2, + /*conv_halo_exchange_always_on_lhs=*/false)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[32,14,28,128]")); + auto rhs = AllOf( + 
op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[32,14,28,64]")); + + auto left_halo = AllOf(op::CollectivePermute(op::Slice(rhs)), + op::Shape("f32[32,1,28,64]")); + auto right_halo = AllOf(op::CollectivePermute(op::Slice(rhs)), + op::Shape("f32[32,1,28,64]")); + EXPECT_THAT(root, + AllOf(op::AllReduce(op::Convolution( + lhs, AllOf(op::Concatenate(left_halo, rhs, right_halo), + op::Shape("f32[32,16,28,64]")))), + op::Shape("f32[3,3,128,64]"))); +} + +TEST_F(SpmdPartitioningTest, ConvolutionLhsTiledRhsTiledWindowDilate) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[128,224,224,3] parameter(0) + %lhs.copy = f32[128,224,224,3] copy(%lhs), sharding={devices=[1,2,1,1]0,1} + %rhs = f32[128,112,112,64] parameter(1) + %rhs.copy = f32[128,112,112,64] copy(%rhs), sharding={devices=[1,2,1,1]0,1} + ROOT %conv = f32[7,7,3,64] convolution(%lhs.copy, %rhs.copy), + window={size=112x112 pad=3_2x3_2 rhs_dilate=2x2}, dim_labels=f01b_i01o->01bf, sharding={replicated} +})"; + + TF_ASSERT_OK_AND_ASSIGN( + auto module, + PartitionComputation(hlo_string, /*num_devices=*/2, + /*conv_halo_exchange_always_on_lhs=*/false)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[128,112,224,3]")); + auto rhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[128,56,112,64]")); + + auto left_halo = AllOf(op::CollectivePermute(op::Slice(rhs)), + op::Shape("f32[128,2,112,64]")); + auto right_halo = AllOf(op::CollectivePermute(op::Slice(rhs)), + op::Shape("f32[128,2,112,64]")); + EXPECT_THAT(root, + AllOf(op::AllReduce(op::Convolution( + lhs, AllOf(op::Concatenate(left_halo, rhs, right_halo), + op::Shape("f32[128,60,112,64]")))), + op::Shape("f32[7,7,3,64]"))); +} + +TEST_F(SpmdPartitioningTest, + ConvolutionLhsTiledRhsTiledWindowDilateNegativeRhsPadding) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[128,56,56,256] parameter(0) + %lhs.copy = f32[128,56,56,256] copy(%lhs), sharding={devices=[1,2,1,1]0,1} + %rhs = f32[128,28,28,512] parameter(1) + %rhs.copy = f32[128,28,28,512] copy(%rhs), sharding={devices=[1,2,1,1]0,1} + ROOT %conv = f32[1,1,256,512] convolution(%lhs.copy, %rhs.copy), + window={size=28x28 pad=0_-1x0_-1 rhs_dilate=2x2}, dim_labels=f01b_i01o->01bf, sharding={replicated} +})"; + + TF_ASSERT_OK_AND_ASSIGN( + auto module, + PartitionComputation(hlo_string, /*num_devices=*/2, + /*conv_halo_exchange_always_on_lhs=*/false)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[128,28,56,256]")); + auto rhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[128,14,28,512]")); + + EXPECT_THAT(root, AllOf(op::AllReduce(op::Convolution(lhs, rhs)), + op::Shape("f32[1,1,256,512]"))); +} + +TEST_F(SpmdPartitioningTest, ConvolutionLhsTiledRhsTiledWindowDilateUneven) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[128,14,14,512] parameter(0) + %lhs.copy = f32[128,14,14,512] 
copy(%lhs), sharding={devices=[1,2,1,1]0,1} + %rhs = f32[128,7,7,512] parameter(1) + %rhs.copy = f32[128,7,7,512] copy(%rhs), sharding={devices=[1,2,1,1]0,1} + ROOT %conv = f32[3,3,512,512] convolution(%lhs.copy, %rhs.copy), + window={size=7x7 pad=1_0x1_0 rhs_dilate=2x2}, dim_labels=f01b_i01o->01bf, sharding={replicated} +})"; + + TF_ASSERT_OK_AND_ASSIGN( + auto module, + PartitionComputation(hlo_string, /*num_devices=*/2, + /*conv_halo_exchange_always_on_lhs=*/false)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[128,7,14,512]")); + auto rhs = AllOf( + op::Select(op::Compare(), + op::Copy(op::DynamicSlice( + op::Pad(op::Parameter(), op::Constant()), op::Constant(), + op::Reshape(), op::Constant(), op::Constant())), + op::Broadcast()), + op::Shape("f32[128,4,7,512]")); + + auto left_halo = AllOf(op::CollectivePermute(op::Slice(rhs)), + op::Shape("f32[128,1,7,512]")); + EXPECT_THAT(root, + AllOf(op::AllReduce(op::Convolution( + AllOf(op::DynamicSlice(op::Pad(lhs, op::Constant()), + op::Constant(), op::Subtract(), + op::Constant(), op::Constant()), + op::Shape("f32[128,10,14,512]")), + AllOf(op::Concatenate(left_halo, rhs), + op::Shape("f32[128,5,7,512]")))), + op::Shape("f32[3,3,512,512]"))); +} + +TEST_F(SpmdPartitioningTest, ConvolutionLhsTiledRhsTiledWithPadding_HaloOnLhs) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[32,28,28,128] parameter(0) + %lhs.copy = f32[32,28,28,128] copy(%lhs), sharding={devices=[1,2,1,1]0,1} + %rhs = f32[32,28,28,64] parameter(1) + %rhs.copy = f32[32,28,28,64] copy(%rhs), sharding={devices=[1,2,1,1]0,1} + ROOT %conv = f32[3,3,128,64] convolution(%lhs.copy, %rhs.copy), + window={size=28x28 pad=1_1x1_1}, dim_labels=f01b_i01o->01bf, sharding={replicated} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[32,14,28,128]")); + auto rhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[32,14,28,64]")); + + auto left_halo = AllOf(op::CollectivePermute(op::Slice(lhs)), + op::Shape("f32[32,1,28,128]")); + auto right_halo = AllOf(op::CollectivePermute(op::Slice(lhs)), + op::Shape("f32[32,1,28,128]")); + EXPECT_THAT(root, AllOf(op::AllReduce(op::Convolution( + AllOf(op::Concatenate(left_halo, lhs, right_halo), + op::Shape("f32[32,16,28,128]")), + rhs)), + op::Shape("f32[3,3,128,64]"))); +} + +TEST_F(SpmdPartitioningTest, + ConvolutionLhsTiledRhsTiledWindowDilate_HaloOnLhs) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[128,224,224,3] parameter(0) + %lhs.copy = f32[128,224,224,3] copy(%lhs), sharding={devices=[1,2,1,1]0,1} + %rhs = f32[128,112,112,64] parameter(1) + %rhs.copy = f32[128,112,112,64] copy(%rhs), sharding={devices=[1,2,1,1]0,1} + ROOT %conv = f32[7,7,3,64] convolution(%lhs.copy, %rhs.copy), + window={size=112x112 pad=3_2x3_2 rhs_dilate=2x2}, dim_labels=f01b_i01o->01bf, sharding={replicated} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) 
<< module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[128,112,224,3]")); + auto rhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[128,56,112,64]")); + + auto left_halo = AllOf(op::CollectivePermute(op::Slice(lhs)), + op::Shape("f32[128,3,224,3]")); + auto right_halo = AllOf(op::CollectivePermute(op::Slice(lhs)), + op::Shape("f32[128,2,224,3]")); + EXPECT_THAT(root, AllOf(op::AllReduce(op::Convolution( + AllOf(op::Concatenate(left_halo, lhs, right_halo), + op::Shape("f32[128,117,224,3]")), + rhs)), + op::Shape("f32[7,7,3,64]"))); +} + +TEST_F(SpmdPartitioningTest, + ConvolutionLhsTiledRhsTiledWindowDilateNegativeRhsPadding_HaloOnLhs) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[128,56,56,256] parameter(0) + %lhs.copy = f32[128,56,56,256] copy(%lhs), sharding={devices=[1,2,1,1]0,1} + %rhs = f32[128,28,28,512] parameter(1) + %rhs.copy = f32[128,28,28,512] copy(%rhs), sharding={devices=[1,2,1,1]0,1} + ROOT %conv = f32[1,1,256,512] convolution(%lhs.copy, %rhs.copy), + window={size=28x28 pad=0_-1x0_-1 rhs_dilate=2x2}, dim_labels=f01b_i01o->01bf, sharding={replicated} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[128,28,56,256]")); + auto rhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[128,14,28,512]")); + + EXPECT_THAT(root, AllOf(op::AllReduce(op::Convolution(op::Slice(lhs), rhs)), + op::Shape("f32[1,1,256,512]"))); +} + +TEST_F(SpmdPartitioningTest, + ConvolutionLhsTiledRhsTiledWindowDilateUneven_HaloOnLhs) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[128,14,14,512] parameter(0) + %lhs.copy = f32[128,14,14,512] copy(%lhs), sharding={devices=[1,2,1,1]0,1} + %rhs = f32[128,7,7,512] parameter(1) + %rhs.copy = f32[128,7,7,512] copy(%rhs), sharding={devices=[1,2,1,1]0,1} + ROOT %conv = f32[3,3,512,512] convolution(%lhs.copy, %rhs.copy), + window={size=7x7 pad=1_0x1_0 rhs_dilate=2x2}, dim_labels=f01b_i01o->01bf, sharding={replicated} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[128,7,14,512]")); + auto rhs = AllOf( + op::Select(op::Compare(), + op::Copy(op::DynamicSlice( + op::Pad(op::Parameter(), op::Constant()), op::Constant(), + op::Reshape(), op::Constant(), op::Constant())), + op::Broadcast()), + op::Shape("f32[128,4,7,512]")); + + auto right_halo = AllOf(op::CollectivePermute(op::Slice(lhs)), + op::Shape("f32[128,1,14,512]")); + EXPECT_THAT( + root, AllOf(op::AllReduce(op::Convolution( + AllOf(op::DynamicSlice( + AllOf(op::Pad(op::Concatenate(lhs, right_halo), + op::Constant()), + op::Shape("f32[128,10,14,512]")), + op::Constant(), op::Reshape(), 
op::Constant(), + op::Constant()), + op::Shape("f32[128,9,14,512]")), + rhs)), + op::Shape("f32[3,3,512,512]"))); +} + +TEST_F(SpmdPartitioningTest, ConcatenateAlongNonPartitionedDimension) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param0 = f32[14,257] parameter(0) + %param0.copy = f32[14,257] copy(%param0), sharding={devices=[2,1]0,1} + %param1 = f32[14,116] parameter(1) + %param1.copy = f32[14,116] copy(%param1), sharding={devices=[2,1]0,1} + ROOT %concatenate = f32[14,373] concatenate(%param0.copy, %param1.copy), + dimensions={1}, sharding={devices=[2,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto param0 = AllOf(op::Copy(op::DynamicSlice(op::Parameter(), op::Reshape(), + op::Constant())), + op::Shape("f32[7,257]")); + auto param1 = AllOf(op::Copy(op::DynamicSlice(op::Parameter(), op::Reshape(), + op::Constant())), + op::Shape("f32[7,116]")); + EXPECT_THAT(root, + AllOf(op::Concatenate(param0, param1), op::Shape("f32[7,373]"))); +} + +TEST_F(SpmdPartitioningTest, ConcatenateAlongPartitionedDimension) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param0 = f32[14,257] parameter(0) + %param0.copy = f32[14,257] copy(%param0), sharding={devices=[1,2]0,1} + %param1 = f32[14,116] parameter(1) + %param1.copy = f32[14,116] copy(%param1), sharding={devices=[1,2]0,1} + ROOT %concatenate = f32[14,373] concatenate(%param0.copy, %param1.copy), + dimensions={1}, sharding={devices=[1,2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto param0 = + AllOf(op::Copy(op::DynamicSlice(op::Pad(op::Parameter(), op::Constant()), + op::Constant(), op::Reshape())), + op::Shape("f32[14,129]")); + auto param1 = AllOf(op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), + op::Reshape())), + op::Shape("f32[14,58]")); + EXPECT_THAT(root, AllOf(op::DynamicSlice( + AllOf(op::AllReduce(op::DynamicUpdateSlice( + op::DynamicUpdateSlice( + op::Broadcast(), param0, + op::Constant(), op::Multiply()), + param1, op::Constant(), op::Add())), + op::Shape("f32[14,374]")), + op::Constant(), op::Multiply()), + op::Shape("f32[14,187]"))); +} + +TEST_F(SpmdPartitioningTest, PadAlongNonPartitionedDimension) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param0 = f32[128,14,257] parameter(0) + %param0.copy = f32[128,14,257] copy(%param0), sharding={devices=[1,1,2]0,1} + %const = f32[] constant(0) + ROOT %pad = f32[128,17,257] pad(%param0.copy, %const), padding=0_0x1_2x0_0, + sharding={devices=[1,1,2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto param0 = AllOf( + op::Copy(op::DynamicSlice(op::Pad(op::Parameter(), op::Constant()), + op::Constant(), op::Constant(), op::Reshape())), + op::Shape("f32[128,14,129]")); + EXPECT_THAT(root, AllOf(op::Pad(param0, op::Constant()), + op::Shape("f32[128,17,129]"))); +} + +TEST_F(SpmdPartitioningTest, SliceAlongNonPartitionedDimension) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param0 = f32[128,14,257] parameter(0) + %param0.copy = f32[128,14,257] copy(%param0), 
sharding={devices=[1,1,2]0,1} + ROOT %slice = f32[128,11,257] slice(%param0.copy), + slice={[0:128:1], [2:13:1], [0:257:1]}, sharding={devices=[1,1,2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto param0 = AllOf( + op::Copy(op::DynamicSlice(op::Pad(op::Parameter(), op::Constant()), + op::Constant(), op::Constant(), op::Reshape())), + op::Shape("f32[128,14,129]")); + EXPECT_THAT(root, AllOf(op::Slice(param0), op::Shape("f32[128,11,129]"))); +} + +TEST_F(SpmdPartitioningTest, SliceAlongPartitionedDimension) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param0 = f32[128,14,257] parameter(0) + %param0.copy = f32[128,14,257] copy(%param0), sharding={devices=[1,1,2]0,1} + ROOT %slice = f32[63,14,251] slice(%param0.copy), + slice={[2:128:2], [0:14:1], [5:256:1]}, sharding={devices=[1,1,2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto param0 = AllOf( + op::Copy(op::DynamicSlice(op::Pad(op::Parameter(), op::Constant()), + op::Constant(), op::Constant(), op::Reshape())), + op::Shape("f32[128,14,129]")); + EXPECT_THAT( + root, + AllOf(op::Slice(AllOf( + op::DynamicSlice( + AllOf(op::Concatenate( + param0, + AllOf(op::CollectivePermute(op::Slice(param0)), + op::Shape("f32[128,14,2]"))), + op::Shape("f32[128,14,131]")), + op::Constant(), op::Constant(), op::Add()), + op::Shape("f32[128,14,126]"))), + op::Shape("f32[63,14,126]"))); +} + +TEST_F(SpmdPartitioningTest, SortAlongNonPartitionedDimension) { + const char* const hlo_string = R"( +HloModule module + +ge { + p.0.lhs.1247 = f32[]{:T(256)} parameter(0), sharding={replicated} + bitcast-convert = s32[]{:T(256)} bitcast-convert(p.0.lhs.1247), sharding={replicated} + constant = s32[]{:T(256)} constant(0), sharding={replicated} + compare = pred[]{:T(256)E(32)} compare(bitcast-convert, constant), direction=LT, sharding={replicated} + constant.1 = u32[]{:T(256)} constant(2147483647), sharding={replicated} + bitcast-convert.1 = u32[]{:T(256)} bitcast-convert(p.0.lhs.1247), sharding={replicated} + subtract = u32[]{:T(256)} subtract(constant.1, bitcast-convert.1), sharding={replicated} + bitcast-convert.2 = s32[]{:T(256)} bitcast-convert(subtract), sharding={replicated} + select = s32[]{:T(256)} select(compare, bitcast-convert.2, bitcast-convert), sharding={replicated} + p.0.rhs.1248 = f32[]{:T(256)} parameter(1), sharding={replicated} + bitcast-convert.3 = s32[]{:T(256)} bitcast-convert(p.0.rhs.1248), sharding={replicated} + compare.1 = pred[]{:T(256)E(32)} compare(bitcast-convert.3, constant), direction=LT, sharding={replicated} + bitcast-convert.4 = u32[]{:T(256)} bitcast-convert(p.0.rhs.1248), sharding={replicated} + subtract.1 = u32[]{:T(256)} subtract(constant.1, bitcast-convert.4), sharding={replicated} + bitcast-convert.5 = s32[]{:T(256)} bitcast-convert(subtract.1), sharding={replicated} + select.1 = s32[]{:T(256)} select(compare.1, bitcast-convert.5, bitcast-convert.3), sharding={replicated} + compare.2 = pred[]{:T(256)E(32)} compare(select, select.1), direction=GT, sharding={replicated} + compare.258 = pred[]{:T(256)E(32)} compare(select.1, select), direction=GT, sharding={replicated} + compare.259 = pred[]{:T(256)E(32)} compare(compare.2, compare.258), direction=EQ, sharding={replicated} + 
p.1.lhs.1249 = s32[]{:T(256)} parameter(2), sharding={replicated} + p.1.rhs.1250 = s32[]{:T(256)} parameter(3), sharding={replicated} + compare.260 = pred[]{:T(256)E(32)} compare(p.1.lhs.1249, p.1.rhs.1250), direction=LT, sharding={replicated} + ROOT select.86 = pred[]{:T(256)E(32)} select(compare.259, compare.260, compare.2), sharding={replicated} +} + +ENTRY entry { + %param0 = f32[128,14,257] parameter(0) + %param0.copy = f32[128,14,257] copy(%param0), sharding={devices=[1,2,1]0,1} + %param1 = s32[128,14,257] parameter(1) + %param1.copy = s32[128,14,257] copy(%param1), sharding={devices=[1,2,1]0,1} + ROOT %sort.6 = (f32[128,14,257]{2,1,0:T(8,128)}, s32[128,14,257]{2,1,0:T(8,128)}) + sort(%param0.copy, %param1.copy), dimensions={2}, is_stable=true, + to_apply=%ge, sharding={{devices=[1,2,1]0,1},{devices=[1,2,1]0,1}} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto param0 = + AllOf(op::Copy(op::DynamicSlice(op::Parameter(0), op::Constant(), + op::Reshape(), op::Constant())), + op::Shape("f32[128,7,257]")); + auto param1 = + AllOf(op::Copy(op::DynamicSlice(op::Parameter(1), op::Constant(), + op::Reshape(), op::Constant())), + op::Shape("s32[128,7,257]")); + EXPECT_THAT(root, AllOf(op::Sort(param0, param1), + op::Shape("(f32[128,7,257], s32[128,7,257])"))); +} + +TEST_F(SpmdPartitioningTest, PartitionCustomCall) { + const char* const hlo_string = R"( +HloModule cluster_2013453984438090939__.47 + +ENTRY %cluster_2013453984438090939__.47 + (arg_tuple.1: ()) -> (bf16[2,2000], s32[2,2000]) { + %arg_tuple.1 = bf16[2,209664] parameter(0) + %copy.arg_tuple.1 = bf16[2,209664] copy(%arg_tuple.1), sharding={devices=[1,2]0,1} + %custom-call = (bf16[2,2000]{1,0}, s32[2,2000]{1,0}) + custom-call(bf16[2,209664]{1,0} %copy.arg_tuple.1), custom_call_target="TopK" + %get-tuple-element = bf16[2,2000]{1,0} + get-tuple-element((bf16[2,2000]{1,0}, s32[2,2000]{1,0}) %custom-call), + index=0, sharding={replicated} + %get-tuple-element.1 = s32[2,2000]{1,0} get-tuple-element((bf16[2,2000]{1,0}, + s32[2,2000]{1,0}) %custom-call), index=1, sharding={replicated} + ROOT %tuple.46 = (bf16[2,2000]{1,0}, s32[2,2000]{1,0}) + tuple(bf16[2,2000]{1,0} %get-tuple-element, s32[2,2000]{1,0} + %get-tuple-element.1), sharding={{replicated}, {replicated}}, + metadata={op_name="XLA_Retvals"} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + auto custom_call = FindInstruction(module.get(), "custom-call.1"); + EXPECT_EQ(custom_call->operand(0)->shape().dimensions(1), 104832); + auto sort = FindInstruction(module.get(), "sort"); + EXPECT_EQ(sort->operand(0)->shape().dimensions(1), 4000); + EXPECT_EQ(sort->operand(1)->shape().dimensions(1), 4000); +} + +TEST_F(SpmdPartitioningTest, ShardableTranspose) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param0 = f32[16,38,38,4] parameter(0) + %param0.copy = f32[16,38,38,4] copy(%param0), sharding={devices=[1,2,1,1]0,1} + ROOT %transpose = f32[16,4,38,38] transpose(%param0.copy), + dimensions={0,3,1,2}, sharding={devices=[1,1,2,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto param0 = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), 
op::Reshape(),
+                                op::Constant(), op::Constant())),
+      op::Shape("f32[16,19,38,4]"));
+  EXPECT_THAT(root, AllOf(op::Transpose(param0), op::Shape("f32[16,4,19,38]")));
+}
+
+TEST_F(SpmdPartitioningTest, MultiDimensionShardedTranspose) {
+  const char* const hlo_string = R"(
+HloModule module
+
+ENTRY entry {
+  %param0 = f32[16,38,38,4] parameter(0)
+  %param0.copy = f32[16,38,38,4] copy(%param0),
+    sharding={devices=[4,2,1,1]0,1,2,3,4,5,6,7}
+  ROOT %transpose = f32[38,4,16,38] transpose(%param0.copy),
+    dimensions={1,3,0,2}, sharding={devices=[2,1,4,1]0,2,4,6,1,3,5,7}
+})";
+
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          PartitionComputation(hlo_string, /*num_devices=*/8));
+  VLOG(1) << module->ToString();
+
+  auto root = module->entry_computation()->root_instruction();
+  auto param0 = AllOf(
+      op::Copy(op::DynamicSlice(op::Parameter(), op::Reshape(), op::Reshape(),
+                                op::Constant(), op::Constant())),
+      op::Shape("f32[4,19,38,4]"));
+  EXPECT_THAT(root, AllOf(op::Transpose(param0), op::Shape("f32[19,4,4,38]")));
+}
+
+TEST_F(SpmdPartitioningTest, NonShardableTranspose) {
+  const char* const hlo_string = R"(
+HloModule module
+
+ENTRY entry {
+  %param0 = f32[16,38,38,4] parameter(0)
+  %param0.copy = f32[16,38,38,4] copy(%param0), sharding={devices=[1,2,1,1]0,1}
+  ROOT %transpose = f32[16,4,38,38] transpose(%param0.copy),
+    dimensions={0,3,1,2}, sharding={devices=[1,2,1,1]0,1}
+})";
+
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          PartitionComputation(hlo_string, /*num_devices=*/2));
+  VLOG(1) << module->ToString();
+
+  auto root = module->entry_computation()->root_instruction();
+  auto reshard = AllOf(op::Reshape(op::Transpose(op::Reshape(op::AllToAll()))),
+                       op::Shape("f32[16,38,38,2]"));
+  EXPECT_THAT(root, AllOf(op::Transpose(), op::Shape("f32[16,2,38,38]")));
+}
+
+TEST_F(SpmdPartitioningTest, ShardableReshape) {
+  const char* const hlo_string = R"(
+HloModule module
+
+ENTRY entry {
+  %param0 = f32[38,38,324] parameter(0)
+  %param0.copy = f32[38,38,324] copy(%param0), sharding={devices=[2,1,1]0,1}
+  ROOT %reshape = f32[38,38,4,81] reshape(%param0.copy),
+    sharding={devices=[2,1,1,1]0,1}
+})";
+
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          PartitionComputation(hlo_string, /*num_devices=*/2));
+  VLOG(1) << module->ToString();
+
+  auto root = module->entry_computation()->root_instruction();
+  auto param0 =
+      AllOf(op::Copy(op::DynamicSlice(op::Parameter(), op::Reshape(),
+                                      op::Constant(), op::Constant())),
+            op::Shape("f32[19,38,324]"));
+  EXPECT_THAT(root, AllOf(op::Reshape(param0), op::Shape("f32[19,38,4,81]")));
+}
+
+TEST_F(SpmdPartitioningTest, NonShardableReshape) {
+  const char* const hlo_string = R"(
+HloModule module
+
+ENTRY entry {
+  %param0 = f32[38,38,324] parameter(0)
+  %param0.copy = f32[38,38,324] copy(%param0), sharding={devices=[1,1,2]0,1}
+  ROOT %reshape = f32[38,38,4,81] reshape(%param0.copy),
+    sharding={devices=[1,1,1,2]0,1}
+})";
+
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          PartitionComputation(hlo_string, /*num_devices=*/2));
+  VLOG(1) << module->ToString();
+
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(
+      root,
+      AllOf(op::DynamicSlice(
+                AllOf(op::Pad(
+                          AllOf(op::Reshape(AllOf(op::AllReduce(),
+                                                  op::Shape("f32[38,38,324]"))),
+                                op::Shape("f32[38,38,4,81]")),
+                          op::Constant()),
+                      op::Shape("f32[38,38,4,82]")),
+                op::Constant(), op::Constant(), op::Constant(), op::Reshape()),
+            op::Shape("f32[38,38,4,41]")));
+}
+
+TEST_F(SpmdPartitioningTest, ReshapeMergeDimsWithHaloExchange) {
+  const char* const hlo_string = R"(
+HloModule module
+
+ENTRY entry {
+
%input = s32[2,3,7,10] parameter(0), sharding={devices=[1,1,2,1]0,1} + ROOT %reshape = s32[3,2,1,14,5] reshape(%input), + sharding={devices=[1,1,1,2,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto reshape = + AllOf(op::Reshape(op::Parameter(0)), op::Shape("s32[3,2,1,8,5]")); + auto halo = op::CollectivePermute(op::Slice(reshape)); + auto exchanged = + op::DynamicSlice(op::Concatenate(halo, reshape), _, _, _, _, _); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(exchanged, op::Shape("s32[3,2,1,7,5]"))); +} + +// Produces an invalid module after transformation. +TEST_F(SpmdPartitioningTest, InceptionV3_4_way_ReduceWindowDilated) { + const char* const hlo_string = R"( +HloModule module + +sum { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT add = f32[] add(a, b) +} + +ENTRY entry { + %param0 = f32[128,5,5,768] parameter(0) + %param0.copy = f32[128,5,5,768] copy(%param0), + sharding={devices=[1,4,1,1]0,1,2,3} + %constant.1 = f32[] constant(0), sharding={replicated} + ROOT %rw = f32[128,17,17,768] reduce-window(%param0.copy, %constant.1), + window={size=1x5x5x1 pad=0_0x4_4x4_4x0_0 lhs_dilate=1x3x3x1}, + to_apply=sum, sharding={devices=[1,4,1,1]0,1,2,3} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + + auto input_shard = op::Copy(op::DynamicSlice( + op::Pad(op::Parameter(0), op::Constant()), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())); + auto id_mul4_add1 = + op::Add(op::Multiply(op::Reshape(), op::Constant()), op::Constant()); + auto id_mul5 = op::Multiply(op::Reshape(), op::Constant()); + auto id_mul5_add1_div3 = + op::Divide(op::Add(id_mul5, op::Constant()), op::Constant()); + auto before_masking = AllOf( + op::Shape("f32[128,3,5,768]"), + op::DynamicSlice( + AllOf( + op::Shape("f32[128,4,5,768]"), + op::Concatenate(op::CollectivePermute(input_shard), input_shard)), + op::Constant(), + op::Subtract(op::Constant(), + op::Subtract(id_mul4_add1, id_mul5_add1_div3)), + op::Constant(), op::Constant())); + auto masked = op::Select( + op::And(op::Compare(op::Add(op::Iota(), op::Broadcast(id_mul5_add1_div3)), + op::Broadcast(op::Constant())), + op::Compare(op::Add(op::Iota(), op::Broadcast(id_mul5_add1_div3)), + op::Broadcast(op::Constant()))), + before_masking, op::Broadcast(op::Constant())); + auto rw = AllOf(op::Shape("f32[128,7,17,768]"), + op::ReduceWindow(masked, op::Constant())); + auto final_slice_index = op::Subtract( + id_mul5, + op::Add(op::Multiply(id_mul5_add1_div3, op::Constant()), op::Constant())); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, + AllOf(op::Shape("f32[128,5,17,768]"), + op::DynamicSlice(rw, op::Constant(), final_slice_index, + op::Constant(), op::Constant()))); +} + +TEST_F(SpmdPartitioningTest, TiledToTiledReduce) { + const char* const hlo_string = R"( +HloModule module + +sum { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT add = f32[] add(a, b) +} + +ENTRY entry { + %param0 = f32[4,32,32,128] parameter(0) + %param0.copy = f32[4,32,32,128] copy(%param0), + sharding={devices=[1,1,1,2]0,1} + %constant.1 = f32[] constant(0), sharding={replicated} + %reduce = f32[128] reduce(%param0.copy, %constant.1), dimensions={0,1,2}, + to_apply=%sum, sharding={devices=[2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, 
/*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto param0 = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Constant(), + op::Constant(), op::Reshape())), + op::Shape("f32[4,32,32,64]")); + + EXPECT_THAT(root, + AllOf(op::Reduce(param0, op::Constant()), op::Shape("f32[64]"))); +} + +TEST_F(SpmdPartitioningTest, TiledToTiledTupleReduce) { + const char* const hlo_string = R"( +HloModule module + +%minmax_func { + %lhs_value = f32[] parameter(0) + %rhs_value = f32[] parameter(2) + %compare.2 = pred[] compare(%lhs_value, %rhs_value), direction=GT + %select.4 = f32[] select(%compare.2, %lhs_value, %rhs_value) + %lhs_index = s32[] parameter(1) + %rhs_index = s32[] parameter(3) + %select.5 = s32[] select(%compare.2, %lhs_index, %rhs_index) + ROOT %tuple.2 = (f32[], s32[]) tuple(%select.4, %select.5) +} + +ENTRY %main { + %param0 = f32[28,10] parameter(0), sharding={devices=[2,1]0,1} + %param1 = s32[28,10] parameter(1), sharding={devices=[2,1]0,1} + %init0 = f32[] parameter(2) + %init1 = s32[] parameter(3) + ROOT %reduce = (f32[28], s32[28]) reduce(%param0, %param1, %init0, %init1), + dimensions={1}, to_apply=%minmax_func, + sharding={{devices=[2]0,1}, {devices=[2]0,1}} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Reduce(op::Parameter(0), op::Parameter(1), + op::Parameter(2), op::Parameter(3)), + op::Shape("(f32[14], s32[14])"))); +} + +TEST_F(SpmdPartitioningTest, TiledToTiledReduceOutputReshard) { + const char* const hlo_string = R"( +HloModule module + +sum { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT add = f32[] add(a, b) +} + +ENTRY entry { + %param0 = f32[4,32,32,128] parameter(0) + %param0.copy = f32[4,32,32,128] copy(%param0), + sharding={devices=[1,2,1,1]0,1} + %constant.1 = f32[] constant(0), sharding={replicated} + %reduce = f32[128] reduce(%param0.copy, %constant.1), dimensions={0,1,2}, + to_apply=%sum, sharding={devices=[2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto param0 = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[4,16,32,128]")); + + EXPECT_THAT(root, + AllOf(op::DynamicSlice( + AllOf(op::AllReduce(op::Reduce(param0, op::Constant())), + op::Shape("f32[128]")), + op::Reshape()), + op::Shape("f32[64]"))); +} + +TEST_F(SpmdPartitioningTest, IotaAlongNonTileDimension) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + ROOT %iota = s32[16,80,91] iota(), iota_dimension=1, + sharding={devices=[1,1,2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Iota(), op::Shape("s32[16,80,46]"))); +} + +TEST_F(SpmdPartitioningTest, IotaAlongTileDimension) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + ROOT %iota = s32[16,80,91] iota(), iota_dimension=2, + sharding={devices=[1,1,2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << 
module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Add(op::Iota(), op::Broadcast()), + op::Shape("s32[16,80,46]"))); +} + +TEST_F(SpmdPartitioningTest, U32IotaAlongTileDimension) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + ROOT %iota = u32[16,80,91] iota(), iota_dimension=2, + sharding={devices=[1,1,2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Add(op::Iota(), op::Broadcast()), + op::Shape("u32[16,80,46]"))); +} + +TEST_F(SpmdPartitioningTest, Conditional) { + const char* const hlo_string = R"( +HloModule module + +Negate { + x = f32[4,5] parameter(0), sharding={replicated} + ROOT negate = f32[4,5] negate(x), sharding={replicated} +} + +Identity { + y = f32[4,5] parameter(0), sharding={devices=[2,1]0,1} + ROOT copy = f32[4,5] copy(y), sharding={devices=[2,1]0,1} +} + +ENTRY entry { + %param.0 = pred[] parameter(0) + %param.0.copy = pred[] copy(%param.0), sharding={maximal device=0} + %param.1 = f32[4,5] parameter(1) + %param.1.copy = f32[4,5] copy(%param.1), sharding={replicated} + %param.2 = f32[4,5] parameter(2) + %param.2.copy = f32[4,5] copy(%param.2), sharding={devices=[2,1]0,1} + ROOT cond = f32[4,5] conditional(%param.0.copy, %param.1.copy, %param.2.copy), + true_computation=Negate, false_computation=Identity, + sharding={devices=[2,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto param0 = AllOf(op::Copy(op::Copy(op::Parameter()), op::Shape("pred[]"))); + auto param1 = AllOf(op::Copy(op::Parameter()), op::Shape("f32[4,5]")); + auto param2 = AllOf(op::Copy(op::DynamicSlice(op::Parameter(), op::Reshape(), + op::Constant())), + op::Shape("f32[2,5]")); + + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Conditional(op::AllReduce(), param1, param2), + op::Shape("f32[2,5]"))); + + auto then_branch_root = root->branch_computation(0)->root_instruction(); + EXPECT_THAT(then_branch_root, + AllOf(op::DynamicSlice(op::Negate(op::Parameter()), op::Reshape(), + op::Constant()), + op::Shape("f32[2,5]"))); + + auto else_branch_root = root->branch_computation(1)->root_instruction(); + EXPECT_THAT(else_branch_root, + AllOf(op::Copy(op::Parameter()), op::Shape("f32[2,5]"))); +} + +TEST_F(SpmdPartitioningTest, SelectAndScatter_RetinaNet) { + const char* const hlo_string = R"( +HloModule module + +ge { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT compare = pred[] compare(a, b), direction=GE +} + +sum { + c = f32[] parameter(0) + d = f32[] parameter(1) + ROOT add = f32[] add(c, d) +} + +ENTRY entry { + %param.0 = f32[32,128,384,64] parameter(0) + %param.0.copy = f32[32,128,384,64] copy(%param.0), + sharding={devices=[1,8,1,1]0,1,2,3,4,5,6,7} + %param.1 = f32[32,64,192,64] parameter(1) + %param.1.copy = f32[32,64,192,64] copy(%param.1), + sharding={devices=[1,8,1,1]0,1,2,3,4,5,6,7} + constant.1 = f32[] constant(0), sharding={replicated} + ROOT select-and-scatter = f32[32,128,384,64] select-and-scatter(param.0.copy, + %param.1.copy, constant.1), window={size=1x1x1x1 stride=1x2x2x1}, + select=ge, scatter=sum, sharding={devices=[1,8,1,1]0,1,2,3,4,5,6,7} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/8)); + VLOG(1) << 
module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto source = AllOf( + op::Shape("f32[32,8,192,64]"), + op::Copy(op::DynamicSlice(op::Parameter(1), op::Constant(), op::Reshape(), + op::Constant(), op::Constant()))); + auto data = AllOf( + op::Shape("f32[32,16,384,64]"), + op::Copy(op::DynamicSlice(op::Parameter(0), op::Constant(), op::Reshape(), + op::Constant(), op::Constant()))); + + EXPECT_THAT(root, op::SelectAndScatter(data, source, op::Constant())); + EXPECT_EQ(root->window().dimensions(0).padding_low(), 0); + EXPECT_EQ(root->window().dimensions(0).padding_high(), 0); +} + +TEST_F(SpmdPartitioningTest, TiledDot) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[128,64] parameter(0) + %lhs.copy = f32[128,64] copy(%lhs), sharding={devices=[1,2]0,1} + %rhs = f32[64,256] parameter(1) + %rhs.copy = f32[64,256] copy(%rhs), sharding={devices=[2,1]0,1} + ROOT %conv = f32[128,256] convolution(%lhs.copy, %rhs.copy), + dim_labels=bf_io->bf, sharding={replicated} +})"; + + TF_ASSERT_OK_AND_ASSIGN( + auto module, + PartitionComputation(hlo_string, /*num_devices=*/2, + /*conv_halo_exchange_always_on_lhs=*/false)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf(op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), + op::Reshape())), + op::Shape("f32[128,32]")); + auto rhs = AllOf(op::Copy(op::DynamicSlice(op::Parameter(), op::Reshape(), + op::Constant())), + op::Shape("f32[32,256]")); + EXPECT_THAT(root, AllOf(op::AllReduce(op::Convolution(lhs, rhs)), + op::Shape("f32[128,256]"))); +} + +TEST_F(SpmdPartitioningTest, TiledDotOutputTiled) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[128,64] parameter(0) + %lhs.copy = f32[128,64] copy(%lhs), sharding={devices=[1,2]0,1} + %rhs = f32[64,256] parameter(1) + %rhs.copy = f32[64,256] copy(%rhs), sharding={devices=[2,1]0,1} + ROOT %conv = f32[128,256] convolution(%lhs.copy, %rhs.copy), + dim_labels=bf_io->bf, sharding={devices=[1,2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf(op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), + op::Reshape())), + op::Shape("f32[128,32]")); + auto rhs = AllOf(op::Copy(op::DynamicSlice(op::Parameter(), op::Reshape(), + op::Constant())), + op::Shape("f32[32,256]")); + EXPECT_THAT(root, AllOf(op::DynamicSlice( + AllOf(op::AllReduce(op::Convolution(lhs, rhs)), + op::Shape("f32[128,256]")), + op::Constant(), op::Reshape()), + op::Shape("f32[128,128]"))); +} + +TEST_F(SpmdPartitioningTest, BatchPartitionedConvolution) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[128,256,256] parameter(0) + %lhs.copy = f32[128,256,256] copy(%lhs), sharding={devices=[1,2,1]0,1} + %rhs = f32[256,8,1] parameter(1) + %rhs.copy = f32[256,8,1] copy(%rhs), sharding={replicated} + ROOT %conv = f32[128,256,8] convolution(%lhs.copy, %rhs.copy), + window={size=1}, dim_labels=0bf_io0->0bf, sharding={devices=[1,2,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf(op::Copy(op::DynamicSlice(op::Parameter(0), op::Constant(), + op::Reshape(), op::Constant())), + 
op::Shape("f32[128,128,256]")); + auto rhs = AllOf(op::Copy(op::Parameter(1)), op::Shape("f32[256,8,1]")); + EXPECT_THAT(root, + AllOf(op::Convolution(lhs, rhs), op::Shape("f32[128,128,8]"))); +} + +TEST_F(SpmdPartitioningTest, DotOutputFeaturePartitioned) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[24,64] parameter(0) + %lhs.copy = f32[24,64] copy(%lhs), sharding={replicated} + %rhs = f32[39296,64] parameter(1) + %rhs.copy = f32[39296,64] copy(%rhs), sharding={devices=[2,1]0,1} + ROOT %dot = f32[24,39296] dot(%lhs.copy, %rhs.copy), + lhs_batch_dims={}, rhs_batch_dims={}, + lhs_contracting_dims={1}, rhs_contracting_dims={1}, + sharding={devices=[1,2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf(op::Copy(op::Parameter(0)), op::Shape("f32[24,64]")); + auto rhs = AllOf(op::Copy(op::DynamicSlice(op::Parameter(1), op::Reshape(), + op::Constant())), + op::Shape("f32[19648,64]")); + EXPECT_THAT(root, AllOf(op::Dot(lhs, rhs), op::Shape("f32[24,19648]"))); +} + +TEST_F(SpmdPartitioningTest, EinsumBatchPartitioned) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[32,24,64] parameter(0) + %lhs.copy = f32[32,24,64] copy(%lhs), sharding={devices=[2,1,1]0,1} + %rhs = f32[32,39296,64] parameter(1) + %rhs.copy = f32[32,39296,64] copy(%rhs), sharding={devices=[2,1,1]0,1} + ROOT %dot = f32[32,24,39296] dot(%lhs.copy, %rhs.copy), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2}, rhs_contracting_dims={2}, + sharding={devices=[2,1,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf(op::Copy(op::DynamicSlice(op::Parameter(0), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[16,24,64]")); + auto rhs = AllOf(op::Copy(op::DynamicSlice(op::Parameter(1), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[16,39296,64]")); + EXPECT_THAT(root, AllOf(op::Dot(lhs, rhs), op::Shape("f32[16,24,39296]"))); +} + +TEST_F(SpmdPartitioningTest, EinsumLHSandOutputBatchPartitioned) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[32,24,64] parameter(0) + %lhs.copy = f32[32,24,64] copy(%lhs), sharding={devices=[2,1,1]0,1} + %rhs = f32[32,39296,64] parameter(1) + %rhs.copy = f32[32,39296,64] copy(%rhs), sharding={replicated} + ROOT %dot = f32[32,24,39296] dot(%lhs.copy, %rhs.copy), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2}, rhs_contracting_dims={2}, + sharding={devices=[2,1,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf(op::Copy(op::DynamicSlice(op::Parameter(0), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[16,24,64]")); + auto rhs = AllOf(op::Copy(op::Parameter(1)), op::Shape("f32[32,39296,64]")); + EXPECT_THAT(root, AllOf(op::Dot(lhs, op::DynamicSlice(rhs, op::Reshape(), + op::Constant(), + op::Constant())), + op::Shape("f32[16,24,39296]"))); +} + +TEST_F(SpmdPartitioningTest, EinsumRHSandOutputBatchPartitioned) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + 
%lhs = f32[32,24,64] parameter(0) + %lhs.copy = f32[32,24,64] copy(%lhs), sharding={devices=[1,2,1]0,1} + %rhs = f32[32,39296,64] parameter(1) + %rhs.copy = f32[32,39296,64] copy(%rhs), sharding={devices=[2,1,1]0,1} + ROOT %dot = f32[32,24,39296] dot(%lhs.copy, %rhs.copy), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2}, rhs_contracting_dims={2}, + sharding={devices=[2,1,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf(op::Copy(op::DynamicSlice(op::Parameter(0), op::Constant(), + op::Reshape(), op::Constant())), + op::Shape("f32[32,12,64]")); + auto rhs = AllOf(op::Copy(op::DynamicSlice(op::Parameter(1), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[16,39296,64]")); + auto lhs_reshard = op::Reshape(op::Transpose(op::AllToAll(op::Reshape(lhs)))); + EXPECT_THAT(root, + AllOf(op::Dot(lhs_reshard, rhs), op::Shape("f32[16,24,39296]"))); +} + +TEST_F(SpmdPartitioningTest, EinsumOutputBatchPartitioned) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[32,24,64] parameter(0) + %lhs.copy = f32[32,24,64] copy(%lhs), sharding={replicated} + %rhs = f32[32,39296,64] parameter(1) + %rhs.copy = f32[32,39296,64] copy(%rhs), sharding={replicated} + ROOT %dot = f32[32,24,39296] dot(%lhs.copy, %rhs.copy), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2}, rhs_contracting_dims={2}, + sharding={devices=[2,1,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs_slice = + AllOf(op::DynamicSlice(op::Copy(op::Parameter(0)), op::Reshape(), + op::Constant(), op::Constant()), + op::Shape("f32[16,24,64]")); + auto rhs_slice = + AllOf(op::DynamicSlice(op::Copy(op::Parameter(1)), op::Reshape(), + op::Constant(), op::Constant()), + op::Shape("f32[16,39296,64]")); + EXPECT_THAT(root, AllOf(op::Dot(lhs_slice, rhs_slice), + op::Shape("f32[16,24,39296]"))); +} + +TEST_F(SpmdPartitioningTest, EinsumContractingDimsPartitioned) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[32,24,64,128] parameter(0) + %lhs.copy = f32[32,24,64,128] copy(%lhs), sharding={devices=[1,1,2,2]0,1,2,3} + %rhs = f32[32,39296,64,128] parameter(1) + %rhs.copy = f32[32,39296,64,128] copy(%rhs), sharding={devices=[1,1,2,2]0,1,2,3} + ROOT %dot = f32[32,24,39296] dot(%lhs.copy, %rhs.copy), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2,3}, rhs_contracting_dims={2,3}, + sharding={replicated} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(0), op::Constant(), + op::Constant(), op::Reshape(), op::Reshape())), + op::Shape("f32[32,24,32,64]")); + auto rhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(1), op::Constant(), + op::Constant(), op::Reshape(), op::Reshape())), + op::Shape("f32[32,39296,32,64]")); + EXPECT_THAT(root, AllOf(op::AllReduce(op::Dot(lhs, rhs)), + op::Shape("f32[32,24,39296]"))); +} + +TEST_F(SpmdPartitioningTest, EinsumLHSNonContractingDimsPartitioned) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = 
f32[32,24,64,128] parameter(0) + %lhs.copy = f32[32,24,64,128] copy(%lhs), sharding={devices=[1,2,1,2]0,1,2,3} + %rhs = f32[32,39296,64] parameter(1) + %rhs.copy = f32[32,39296,64] copy(%rhs), sharding={replicated} + ROOT %dot = f32[32,24,128,39296] dot(%lhs.copy, %rhs.copy), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2}, rhs_contracting_dims={2}, + sharding={devices=[1,2,2,1]0,1,2,3} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(0), op::Constant(), op::Reshape(), + op::Constant(), op::Reshape())), + op::Shape("f32[32,12,64,64]")); + auto rhs = AllOf(op::Copy(op::Parameter(1)), op::Shape("f32[32,39296,64]")); + EXPECT_THAT(root, AllOf(op::Dot(lhs, rhs), op::Shape("f32[32,12,64,39296]"))); +} + +TEST_F(SpmdPartitioningTest, EinsumRHSNonContractingDimsPartitioned) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[32,24,64] parameter(0) + %lhs.copy = f32[32,24,64] copy(%lhs), sharding={replicated} + %rhs = f32[32,39296,64,128] parameter(1) + %rhs.copy = f32[32,39296,64,128] copy(%rhs), sharding={devices=[1,2,1,2]0,1,2,3} + ROOT %dot = f32[32,24,39296,128] dot(%lhs.copy, %rhs.copy), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2}, rhs_contracting_dims={2}, + sharding={devices=[1,1,2,2]0,1,2,3} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf(op::Copy(op::Parameter(0)), op::Shape("f32[32,24,64]")); + auto rhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(1), op::Constant(), op::Reshape(), + op::Constant(), op::Reshape())), + op::Shape("f32[32,19648,64,64]")); + EXPECT_THAT(root, AllOf(op::Dot(lhs, rhs), op::Shape("f32[32,24,19648,64]"))); +} + +TEST_F(SpmdPartitioningTest, EinsumOutputLHSNonContractingDimPartitioned) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[32,24,64,128] parameter(0) + %lhs.copy = f32[32,24,64,128] copy(%lhs), sharding={replicated} + %rhs = f32[32,39296,64,128] parameter(1) + %rhs.copy = f32[32,39296,64,128] copy(%rhs), sharding={replicated} + ROOT %dot = f32[32,24,39296] dot(%lhs.copy, %rhs.copy), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2,3}, rhs_contracting_dims={2,3}, + sharding={devices=[1,2,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf(op::Copy(op::Parameter(0)), op::Shape("f32[32,24,64,128]")); + auto rhs = + AllOf(op::Copy(op::Parameter(1)), op::Shape("f32[32,39296,64,128]")); + EXPECT_THAT( + root, + AllOf(op::Dot(AllOf(op::DynamicSlice(lhs, op::Constant(), op::Reshape(), + op::Constant(), op::Constant()), + op::Shape("f32[32,12,64,128]")), + rhs), + op::Shape("f32[32,12,39296]"))); +} + +TEST_F(SpmdPartitioningTest, EinsumOutputRHSNonContractingDimPartitioned) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[32,24,64,128] parameter(0) + %lhs.copy = f32[32,24,64,128] copy(%lhs), sharding={replicated} + %rhs = f32[32,39296,64,128] parameter(1) + %rhs.copy = f32[32,39296,64,128] copy(%rhs), sharding={replicated} + ROOT 
%dot = f32[32,24,39296] dot(%lhs.copy, %rhs.copy), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2,3}, rhs_contracting_dims={2,3}, + sharding={devices=[1,1,2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf(op::Copy(op::Parameter(0)), op::Shape("f32[32,24,64,128]")); + auto rhs = + AllOf(op::Copy(op::Parameter(1)), op::Shape("f32[32,39296,64,128]")); + EXPECT_THAT(root, + AllOf(op::Dot(lhs, AllOf(op::DynamicSlice( + rhs, op::Constant(), op::Reshape(), + op::Constant(), op::Constant()), + op::Shape("f32[32,19648,64,128]"))), + op::Shape("f32[32,24,19648]"))); +} + +TEST_F(SpmdPartitioningTest, EinsumRHSWindowedNonContracting) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[32,24,64,128] parameter(0) + %lhs.copy = f32[32,24,64,128] copy(%lhs), sharding={devices=[1,2,1,1]0,1} + %rhs = f32[32,39295,64,128] parameter(1) + %rhs.copy = f32[32,39295,64,128] copy(%rhs), sharding={devices=[1,2,1,1]0,1} + ROOT %dot = f32[32,24,39295] dot(%lhs.copy, %rhs.copy), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2,3}, rhs_contracting_dims={2,3}, + sharding={devices=[1,2,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, PartitionComputation(hlo_string, + /*num_devices=*/2)); + VLOG(1) << module->ToString(); + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(0), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[32,12,64,128]")); + auto rhs = + AllOf(op::Copy(op::DynamicSlice(op::Pad(op::Parameter(1), op::Constant()), + op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[32,19648,64,128]")); + EXPECT_THAT( + root, + AllOf(op::Slice(AllOf(op::GetTupleElement(op::While(op::Tuple( + lhs, rhs, op::Broadcast(), op::Constant()))), + op::Shape("f32[32,12,39296]"))), + op::Shape("f32[32,12,39295]"))); + auto while_loop = root->operand(0)->operand(0); + // Check loop condition. + EXPECT_THAT( + while_loop->while_condition()->root_instruction(), + op::Compare(op::GetTupleElement(op::Parameter(0)), op::Constant())); + + // Check loop body. + auto next_i = op::Add(op::GetTupleElement(op::Parameter(0)), op::Constant()); + auto window = op::Conditional(op::Compare(next_i, op::Constant()), + op::GetTupleElement(op::Parameter(0)), + op::GetTupleElement(op::Parameter(0))); + auto partial_output = op::Dot(op::GetTupleElement(op::Parameter(0)), + op::GetTupleElement(op::Parameter(0))); + EXPECT_THAT( + while_loop->while_body()->root_instruction(), + op::Tuple(op::GetTupleElement(op::Parameter(0)), window, + op::DynamicUpdateSlice(op::GetTupleElement(op::Parameter(0)), + partial_output, op::Constant(), + op::Constant(), op::Reshape()), + next_i)); + + // Check the conditional that contains the collective permute. 
+ auto cp_conditional = + while_loop->while_body()->root_instruction()->operand(1); + EXPECT_THAT(cp_conditional->true_computation()->root_instruction(), + op::CollectivePermute(op::Parameter(0))); + EXPECT_THAT(cp_conditional->false_computation()->root_instruction(), + op::Parameter(0)); +} + +TEST_F(SpmdPartitioningTest, EinsumRHSWindowedContracting) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[32,24,63,128] parameter(0) + %lhs.copy = f32[32,24,63,128] copy(%lhs), sharding={devices=[1,2,1,1]0,1} + %rhs = f32[32,39296,63,128] parameter(1) + %rhs.copy = f32[32,39296,63,128] copy(%rhs), sharding={devices=[1,1,2,1]0,1} + ROOT %dot = f32[32,24,39296] dot(%lhs.copy, %rhs.copy), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2,3}, rhs_contracting_dims={2,3}, + sharding={devices=[1,2,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, PartitionComputation(hlo_string, + /*num_devices=*/2)); + VLOG(1) << module->ToString(); + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(0), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[32,12,63,128]")); + auto rhs = + AllOf(op::Copy(op::DynamicSlice(op::Pad(op::Parameter(1), op::Constant()), + op::Constant(), op::Constant(), + op::Reshape(), op::Constant())), + op::Shape("f32[32,39296,32,128]")); + auto masked_rhs = + op::Select(op::Compare(), rhs, op::Broadcast(op::Constant())); + EXPECT_THAT(root, + AllOf(op::GetTupleElement(op::While(op::Tuple( + lhs, masked_rhs, op::Broadcast(), op::Constant()))), + op::Shape("f32[32,12,39296]"))); + auto while_loop = root->operand(0); + // Check loop condition. + EXPECT_THAT( + while_loop->while_condition()->root_instruction(), + op::Compare(op::GetTupleElement(op::Parameter(0)), op::Constant())); + + // Check loop body. + auto next_i = op::Add(op::GetTupleElement(op::Parameter(0)), op::Constant()); + auto window = op::Conditional(op::Compare(next_i, op::Constant()), + op::GetTupleElement(op::Parameter(0)), + op::GetTupleElement(op::Parameter(0))); + auto partial_output = op::Dot( + op::DynamicSlice( + op::Pad(op::GetTupleElement(op::Parameter(0)), op::Constant()), + op::Constant(), op::Constant(), op::Reshape(), op::Constant()), + op::GetTupleElement(op::Parameter(0))); + EXPECT_THAT( + while_loop->while_body()->root_instruction(), + op::Tuple(op::GetTupleElement(op::Parameter(0)), window, + op::Add(op::GetTupleElement(op::Parameter(0)), partial_output), + next_i)); + + // Check the conditional that contains the collective permute. 
+ auto cp_conditional = + while_loop->while_body()->root_instruction()->operand(1); + EXPECT_THAT(cp_conditional->true_computation()->root_instruction(), + op::CollectivePermute(op::Parameter(0))); + EXPECT_THAT(cp_conditional->false_computation()->root_instruction(), + op::Parameter(0)); +} + +TEST_F(SpmdPartitioningTest, EinsumRHSWindowedNonContractingReduce1) { + const char* const hlo_string = R"( +HloModule module + +sum { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT add = f32[] add(a, b) +} + +ENTRY entry { + %lhs = f32[32,24,64,128] parameter(0) + %lhs.copy = f32[32,24,64,128] copy(%lhs), sharding={devices=[1,2,1,1]0,1} + %rhs = f32[32,39295,64,128] parameter(1) + %rhs.copy = f32[32,39295,64,128] copy(%rhs), sharding={devices=[1,2,1,1]0,1} + %dot = f32[32,24,39295] dot(%lhs.copy, %rhs.copy), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2,3}, rhs_contracting_dims={2,3}, + sharding={devices=[1,2,1]0,1} + %constant = f32[] constant(0) + %constant.1 = f32[] constant(2) + %broadcast = f32[32,24,39295] broadcast(%constant.1), dimensions={}, + sharding={devices=[1,2,1]0,1} + %multiply = f32[32,24,39295] multiply(%dot, %broadcast), + sharding={devices=[1,2,1]0,1} + ROOT %reduce = f32[32,24] reduce(%multiply, %constant), dimensions={2}, + to_apply=sum, sharding={devices=[1,2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, PartitionComputation(hlo_string, + /*num_devices=*/2)); + VLOG(1) << module->ToString(); + // Involves loop code motion, skips pattern matching. +} + +TEST_F(SpmdPartitioningTest, EinsumRHSWindowedNonContractingReduce2) { + const char* const hlo_string = R"( +HloModule module + +sum { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT add = f32[] add(a, b) +} + +ENTRY entry { + %lhs = f32[32,24,64,128] parameter(0) + %lhs.copy = f32[32,24,64,128] copy(%lhs), sharding={devices=[1,2,1,1]0,1} + %rhs = f32[32,39295,64,128] parameter(1) + %rhs.copy = f32[32,39295,64,128] copy(%rhs), sharding={devices=[1,2,1,1]0,1} + %dot = f32[32,24,39295] dot(%lhs.copy, %rhs.copy), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2,3}, rhs_contracting_dims={2,3}, + sharding={devices=[1,2,1]0,1} + %constant = f32[] constant(0) + %constant.1 = f32[] constant(2) + %broadcast = f32[32,24,39295] broadcast(%constant.1), dimensions={}, + sharding={devices=[1,2,1]0,1} + %multiply = f32[32,24,39295] multiply(%dot, %broadcast), + sharding={devices=[1,2,1]0,1} + ROOT %reduce = f32[32,39295] reduce(%multiply, %constant), dimensions={1}, + to_apply=sum, sharding={replicated} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, PartitionComputation(hlo_string, + /*num_devices=*/2)); + VLOG(1) << module->ToString(); + // Involves loop code motion, skips pattern matching. 
+} + +TEST_F(SpmdPartitioningTest, EinsumRHSWindowedContractingFromBroadcast) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %rhs = f32[32,39296,63,128] parameter(0) + %rhs.copy = f32[32,39296,63,128] copy(%rhs), sharding={devices=[1,1,2,1]0,1} + %constant.1 = f32[] constant(2) + %broadcast = f32[32,24,63,128] broadcast(%constant.1), dimensions={}, + sharding={devices=[1,2,1,1]0,1} + %add = f32[32,24,63,128] add(%broadcast, %broadcast), + sharding={devices=[1,2,1,1]0,1} + ROOT %dot = f32[32,24,39296] dot(%add, %rhs.copy), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2,3}, rhs_contracting_dims={2,3}, + sharding={devices=[1,2,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, PartitionComputation(hlo_string, + /*num_devices=*/2)); + VLOG(1) << module->ToString(); + // Involves loop code motion, skips pattern matching. +} + +TEST_F(SpmdPartitioningTest, ReplicatedRng) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = s32[] parameter(0) + %lhs.copy = s32[] copy(%lhs), sharding={replicated} + %rhs = s32[] parameter(1) + %rhs.copy = s32[] copy(%rhs), sharding={replicated} + ROOT %rng = s32[4]{0} rng(%lhs.copy, %rhs.copy), + distribution=rng_uniform, sharding={replicated} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf(op::Copy(op::Parameter(0)), op::Shape("s32[]")); + auto rhs = AllOf(op::Copy(op::Parameter(1)), op::Shape("s32[]")); + EXPECT_THAT( + root, + AllOf(op::AllReduce(op::Select( + op::Broadcast(op::Compare(op::PartitionId(), op::Constant())), + op::Rng(), op::Broadcast(op::Constant()))), + op::Shape("s32[4]"))); +} + +TEST_F(SpmdPartitioningTest, PartitionedRng) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = s32[] parameter(0) + %lhs.copy = s32[] copy(%lhs), sharding={replicated} + %rhs = s32[] parameter(1) + %rhs.copy = s32[] copy(%rhs), sharding={maximal device=1} + ROOT %rng = s32[4]{0} rng(%lhs.copy, %rhs.copy), + distribution=rng_uniform, sharding={devices=[2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf(op::Copy(op::Parameter(0)), op::Shape("s32[]")); + auto rhs = AllOf(op::Copy(op::Copy(op::Parameter(1))), op::Shape("s32[]")); + EXPECT_THAT(root, AllOf(op::Rng(lhs, op::AllReduce(op::Select( + op::Broadcast(op::Compare()), rhs, + op::Broadcast(op::Constant())))), + op::Shape("s32[2]"))); +} + +TEST_F(SpmdPartitioningTest, DynamicSliceAlongNonPartitionedDimension) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %input = s32[128,64] parameter(0) + %input.copy = s32[128,64] copy(%input), sharding={devices=[2,1]0,1} + %index = s32[] parameter(1) + %constant = s32[] constant(0) + ROOT %dynamic-slice = s32[128,2] dynamic-slice(%input.copy, %constant, %index), + dynamic_slice_sizes={128,2}, sharding={devices=[2,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto input = AllOf(op::Copy(op::DynamicSlice(op::Parameter(0), op::Reshape(), + op::Constant())), + op::Shape("s32[64,64]")); + EXPECT_THAT(root, + AllOf(op::DynamicSlice(input, 
op::Constant(), op::Parameter(1)), + op::Shape("s32[64,2]"))); +} + +TEST_F(SpmdPartitioningTest, DynamicUpdateSliceAlongNonPartitionedDimension) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %input = s32[128,64] parameter(0) + %input.copy = s32[128,64] copy(%input), sharding={devices=[2,1]0,1} + %index = s32[] parameter(1) + %constant = s32[] constant(0) + %update = s32[128,2] parameter(2) + %update.copy = s32[128,2] copy(%update), sharding={devices=[2,1]0,1} + ROOT %dynamic-update-slice = s32[128,64] + dynamic-update-slice(%input.copy, %update.copy, %constant, %index), + sharding={devices=[2,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto input = AllOf(op::Copy(op::DynamicSlice(op::Parameter(0), op::Reshape(), + op::Constant())), + op::Shape("s32[64,64]")); + auto update = AllOf(op::Copy(op::DynamicSlice(op::Parameter(2), op::Reshape(), + op::Constant())), + op::Shape("s32[64,2]")); + EXPECT_THAT(root, AllOf(op::DynamicUpdateSlice(input, update, op::Constant(), + op::Parameter(1)), + op::Shape("s32[64,64]"))); +} + +TEST_F(SpmdPartitioningTest, PassthroughGather) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %input = f32[2,9] parameter(0), sharding={devices=[1,2]0,1} + %indices = s32[3] parameter(1), sharding={replicated} + ROOT %gather = f32[3,9] gather(%input, %indices), offset_dims={1}, + collapsed_slice_dims={0}, start_index_map={0}, index_vector_dim=1, + slice_sizes={1,9}, sharding={devices=[1,2]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Gather(op::Parameter(0), op::Parameter(1)), + op::Shape("f32[3,5]"))); +} + +TEST_F(SpmdPartitioningTest, GatherPartitionedOnTrivialSliceDims) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %input = f32[17,9] parameter(0), sharding={devices=[2,1]0,1} + %indices = s32[2,3] parameter(1), sharding={replicated} + ROOT %gather = f32[2,3,9] gather(%input, %indices), offset_dims={2}, + collapsed_slice_dims={0}, start_index_map={0}, index_vector_dim=2, + slice_sizes={1,9}, sharding={replicated} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + auto offset = op::Reshape( + op::DynamicSlice(op::Constant(), op::PartitionId(), op::Constant())); + auto min = AllOf(op::Broadcast(offset), op::Shape("s32[2,3]")); + auto max = AllOf(op::Broadcast(op::Add(offset, op::Constant())), + op::Shape("s32[2,3]")); + auto clamp = op::Clamp(min, op::Parameter(1), max); + auto gather = op::Gather(op::Parameter(0), op::Subtract(clamp, min)); + auto mask = + op::Or(op::Lt(op::Parameter(1), min), op::Gt(op::Parameter(1), max)); + auto masked = + op::Select(op::Broadcast(mask), op::Broadcast(op::Constant()), gather); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::AllReduce(masked), op::Shape("f32[2,3,9]"))); +} + +TEST_F(SpmdPartitioningTest, PassthroughScatter) { + const char* const hlo_string = R"( +HloModule module + +add (lhs: f32[], rhs: f32[]) -> f32[] { + lhs = f32[] parameter(0) + rhs = f32[] parameter(1) + ROOT sum = f32[] add(lhs, rhs) +} + +ENTRY entry { + %input = f32[2,9] 
parameter(0), sharding={devices=[1,2]0,1} + %indices = s32[3] parameter(1), sharding={replicated} + %updates = f32[3,9] parameter(2), sharding={devices=[1,2]0,1} + ROOT %scatter = f32[2,9] scatter(%input, %indices, %updates), + to_apply=add, + update_window_dims={1}, + inserted_window_dims={0}, + scatter_dims_to_operand_dims={0}, + index_vector_dim=1, sharding={devices=[1,2]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Scatter(op::Parameter(0), op::Parameter(1), + op::Parameter(2)), + op::Shape("f32[2,5]"))); +} + +TEST_F(SpmdPartitioningTest, ScatterPartitionedOnTrivialSliceDims) { + const char* const hlo_string = R"( +HloModule module + +add (lhs: f32[], rhs: f32[]) -> f32[] { + lhs = f32[] parameter(0) + rhs = f32[] parameter(1) + ROOT sum = f32[] add(lhs, rhs) +} + +ENTRY entry { + %input = f32[17,9] parameter(0), sharding={devices=[2,1]0,1} + %indices = s32[2,3] parameter(1), sharding={replicated} + %updates = f32[2,3,9] parameter(2), sharding={replicated} + ROOT %scatter = f32[17,9] scatter(%input, %indices, %updates), + to_apply=add, + update_window_dims={2}, + inserted_window_dims={0}, + scatter_dims_to_operand_dims={0}, + index_vector_dim=2, sharding={devices=[2,1]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + auto offset = op::Reshape( + op::DynamicSlice(op::Constant(), op::PartitionId(), op::Constant())); + auto indices = op::Subtract( + op::Parameter(1), AllOf(op::Broadcast(offset), op::Shape("s32[2,3]"))); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, + AllOf(op::Scatter(op::Parameter(0), indices, op::Parameter(2)), + op::Shape("f32[9,9]"))); +} + +TEST_F(SpmdPartitioningTest, TiledReverse) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + constant = f32[3,3]{1,0} constant({{1,1,1},{1,1,1},{1,1,1}}), + sharding={devices=[2,1]0,1} + ROOT reverse = f32[3,3]{1,0} reverse(constant), dimensions={1}, + sharding={devices=[2,1]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Shape("f32[2,3]{1,0}"), + op::Reverse(op::DynamicSlice( + op::Pad(op::Constant(), op::Constant()), + op::Reshape(), op::Constant())))); +} + +TEST_F(SpmdPartitioningTest, MixWithManualPartitioning) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + param = f32[8,2] parameter(0), sharding={devices=[2,1]0,1} + to_shard = f32[4,2] custom-call(param), custom_call_target="SPMDFullToShardShape", sharding={replicated} + add = f32[4,2] add(to_shard, to_shard), sharding={replicated} + to_full = f32[8,2] custom-call(add), custom_call_target="SPMDShardToFullShape", sharding={devices=[2,1]0,1} + ROOT mul = f32[8,2] multiply(to_full, param), sharding={devices=[2,1]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + auto to_shard = op::Copy(op::Parameter(0)); + EXPECT_THAT(root, AllOf(op::Shape("f32[4,2]"), + op::Multiply(op::Copy(op::Add(to_shard, to_shard)), + op::Parameter(0)))); +} + +} // 
namespace +} // namespace spmd +} // namespace xla diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.cc b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.cc new file mode 100644 index 00000000000..207f854cd9f --- /dev/null +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.cc @@ -0,0 +1,662 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.h" + +#include "absl/types/optional.h" +#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_sharding.h" +#include "tensorflow/compiler/xla/service/spmd/spmd_partitioner.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" + +namespace xla { +namespace spmd { + +bool HasReplicatedSharding(const HloSharding& sharding) { + if (sharding.IsTuple()) { + return absl::c_any_of(sharding.tuple_elements(), HasReplicatedSharding); + } + return sharding.IsReplicated(); +} + +HloInstruction* CreateZero(const Shape& shape, SpmdBuilder* b) { + if (shape.IsTuple()) { + std::vector elements; + for (int64 i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) { + elements.push_back( + CreateZero(ShapeUtil::GetTupleElementShape(shape, i), b)); + } + return b->AddInstruction(HloInstruction::CreateTuple(elements)); + } + + if (shape.IsToken()) { + return b->AddInstruction(HloInstruction::CreateToken()); + } + auto zero = b->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::Zero(shape.element_type()))); + return b->AddInstruction(HloInstruction::CreateBroadcast(shape, zero, {})); +} + +HloComputation* MakeBinaryAdd(PrimitiveType type, HloModule* module) { + HloComputation::Builder sum_b("add"); + auto x = sum_b.AddInstruction(HloInstruction::CreateParameter( + /*parameter_number=*/0, ShapeUtil::MakeShape(type, {}), "x")); + auto y = sum_b.AddInstruction(HloInstruction::CreateParameter( + /*parameter_number=*/1, ShapeUtil::MakeShape(type, {}), "y")); + if (type == PRED) { + sum_b.AddInstruction(HloInstruction::CreateBinary( + ShapeUtil::MakeShape(type, {}), HloOpcode::kOr, x, y)); + } else { + sum_b.AddInstruction(HloInstruction::CreateBinary( + ShapeUtil::MakeShape(type, {}), HloOpcode::kAdd, x, y)); + } + HloComputation* reduction = module->AddEmbeddedComputation(sum_b.Build()); + return reduction; +} + +bool EvenlyPartitions(const Shape& shape, const HloSharding& sharding) { + if (sharding.IsTuple()) { + for (int64 i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) { + if (!EvenlyPartitions(ShapeUtil::GetTupleElementShape(shape, i), + sharding.GetSubSharding(shape, {i}))) { + return false; + } + } + } + + if (sharding.IsTileMaximal()) { + return 
sharding.IsReplicated(); + } + for (int64 i = 0; i < shape.dimensions_size(); ++i) { + if (shape.dimensions(i) % sharding.tile_assignment().dim(i) != 0) { + return false; + } + } + return true; +} + +Shape MakePartitionedShape(const Shape& shape, const HloSharding& sharding) { + if (sharding.IsTuple()) { + std::vector subshapes; + for (int64 i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) { + subshapes.push_back( + MakePartitionedShape(ShapeUtil::GetTupleElementShape(shape, i), + sharding.GetSubSharding(shape, {i}))); + } + return ShapeUtil::MakeTupleShape(subshapes); + } + return sharding.TileShape(shape); +} + +Shape MakeNonPaddedShapeForGivenPartition(const Shape& shape, + const HloSharding& sharding, + int64 partition_id) { + if (sharding.IsTuple()) { + std::vector subshapes; + for (int64 i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) { + subshapes.push_back(MakeNonPaddedShapeForGivenPartition( + ShapeUtil::GetTupleElementShape(shape, i), + sharding.GetSubSharding(shape, {i}), partition_id)); + } + return ShapeUtil::MakeTupleShape(subshapes); + } + + auto partition_shape = shape; + std::vector tile_offset = + sharding.TileOffsetForDevice(shape, partition_id); + std::vector tile_limit = + sharding.TileLimitForDevice(shape, partition_id); + for (int64 i = 0; i < tile_offset.size(); ++i) { + if (sharding.UsesDevice(partition_id)) { + partition_shape.set_dimensions(i, tile_limit[i] - tile_offset[i]); + } else { + partition_shape.set_dimensions(i, 0); + } + } + return partition_shape; +} + +std::vector MakePartitionOffsets(const Shape& shape, + const HloSharding& sharding, + HloInstruction* partition_id, + SpmdBuilder* b) { + CHECK(!shape.IsTuple()); + + Array2D offset_array( + {sharding.tile_assignment().num_elements(), shape.rank()}); + offset_array.Each([&](int64 i, int64 j, int32* value) { + *value = sharding.TileOffsetForDevice(shape, i)[j]; + }); + auto offset_table = b->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR2FromArray2D(offset_array))); + std::vector offsets; + for (int64 i = 0; i < shape.rank(); ++i) { + if (sharding.tile_assignment().dim(i) == 1) { + offsets.push_back(b->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::Zero(S32)))); + } else { + auto index = b->AddInstruction(HloInstruction::CreateDynamicSlice( + ShapeUtil::MakeShape(S32, {1, 1}), offset_table, + {partition_id, b->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR0(i)))}, + {1, 1})); + offsets.push_back(b->AddInstruction( + HloInstruction::CreateReshape(ShapeUtil::MakeShape(S32, {}), index))); + } + } + return offsets; +} + +std::vector MakeTiledPartitionOrdinals( + const HloSharding& sharding, HloInstruction* partition_id, SpmdBuilder* b) { + CHECK(!sharding.IsTileMaximal()); + auto table_shape = + ShapeUtil::MakeShape(S32, sharding.tile_assignment().dimensions()); + return MakePartitionOffsets(table_shape, sharding, partition_id, b); +} + +HloInstruction* PadToShape(HloInstruction* hlo, const Shape& padded_shape, + SpmdBuilder* b, HloComputation* computation) { + CHECK(b == nullptr || computation == nullptr); + if (ShapeUtil::Compatible(hlo->shape(), padded_shape)) { + return hlo; + } + PaddingConfig padding_config; + for (int64 i = 0; i < padded_shape.rank(); ++i) { + auto padding_config_dim = padding_config.add_dimensions(); + padding_config_dim->set_edge_padding_low(0); + padding_config_dim->set_interior_padding(0); + padding_config_dim->set_edge_padding_high(padded_shape.dimensions(i) - + hlo->shape().dimensions(i)); + } + auto add_hlo 
= [&](std::unique_ptr to_add) { + if (b == nullptr) { + return computation->AddInstruction(std::move(to_add)); + } + return b->AddInstruction(std::move(to_add)); + }; + auto zero = add_hlo(HloInstruction::CreateConstant( + LiteralUtil::Zero(hlo->shape().element_type()))); + return add_hlo( + HloInstruction::CreatePad(padded_shape, hlo, zero, padding_config)); +} + +Shape GetPaddedShapeForUnevenPartitioning(const Shape& base_shape, + const HloSharding& sharding) { + if (sharding.IsTileMaximal()) { + return base_shape; + } + if (EvenlyPartitions(base_shape, sharding)) { + return base_shape; + } + auto shard_shape = MakePartitionedShape(base_shape, sharding); + Shape padded_base_shape = base_shape; + for (int64 i = 0; i < padded_base_shape.rank(); ++i) { + padded_base_shape.set_dimensions( + i, shard_shape.dimensions(i) * sharding.tile_assignment().dim(i)); + } + return padded_base_shape; +} + +HloInstruction* PadBaseShapeBeforeUnevenTiledSharding( + HloInstruction* hlo, const HloSharding& sharding, SpmdBuilder* b) { + auto padded_base_shape = + GetPaddedShapeForUnevenPartitioning(hlo->shape(), sharding); + if (ShapeUtil::Compatible(padded_base_shape, hlo->shape())) { + return hlo; + } + return PadToShape(hlo, padded_base_shape, b); +} + +absl::optional UniqueTiledDim(const HloSharding& sharding) { + if (sharding.IsTileMaximal()) { + return absl::nullopt; + } + int64 dim = -1; + for (int64 i = 0; i < sharding.tile_assignment().num_dimensions(); ++i) { + if (sharding.tile_assignment().dim(i) > 1) { + if (dim != -1) { + return absl::nullopt; + } + dim = i; + } + } + CHECK_NE(dim, -1); + return dim; +} + +MultiplyAddDivideOffsetCalculation::MultiplyAddDivideOffsetCalculation( + int64 multiplier, int64 offset, int64 divisor) + : multiplier_(multiplier), offset_(offset), divisor_(divisor) { + CHECK_GT(divisor_, 0); + Simplify(); +} + +OffsetCalculation MultiplyAddDivideOffsetCalculation::operator-( + const MultiplyAddDivideOffsetCalculation& other) const { + if (divisor_ == 1 && other.divisor_ == 1) { + return OffsetCalculation(MultiplyAddDivideOffsetCalculation( + multiplier_ - other.multiplier_, offset_ - other.offset_, 1)); + } + return OffsetCalculation(HloOpcode::kSubtract, *this, other); +} + +void MultiplyAddDivideOffsetCalculation::Simplify() { + // We could simplify the calculation when multiplier is a multiple of + // divisor_. However, when offset_ is not a multiple of divisor_, we must + // make sure that offset_ and multiplier_ are both non-negative or both + // non-positive. E.g., (3 * i - 1) / 3 is not equivalent to i or i - 1. 
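+  // (At shard ordinal 0 that expression evaluates to 0, matching i, but at
+  // shard ordinal 1 it evaluates to 0, matching i - 1, so mixed-sign cases
+  // are left unsimplified.)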
+ if (divisor_ != 1 && multiplier_ % divisor_ == 0 && + (offset_ % divisor_ == 0 || offset_ * multiplier_ > 0)) { + multiplier_ /= divisor_; + offset_ /= divisor_; + divisor_ = 1; + } +} + +int64 MultiplyAddDivideOffsetCalculation::Calculate(int64 shard_ordinal) const { + return (shard_ordinal * multiplier_ + offset_) / divisor_; +} + +HloInstruction* MultiplyAddDivideOffsetCalculation::Calculate( + HloInstruction* shard_ordinal, SpmdBuilder* b) const { + auto scalar_shape = ShapeUtil::MakeShape(S32, {}); + if (multiplier_ == 0) { + return b->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR0(offset_ / divisor_))); + } + HloInstruction* result = shard_ordinal; + if (multiplier_ != 1) { + result = b->AddInstruction(HloInstruction::CreateBinary( + scalar_shape, HloOpcode::kMultiply, shard_ordinal, + b->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR0(multiplier_))))); + } + if (offset_ != 0) { + auto offset = b->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(offset_))); + result = b->AddInstruction(HloInstruction::CreateBinary( + scalar_shape, HloOpcode::kAdd, result, offset)); + } + if (divisor_ != 1) { + auto divisor = b->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(divisor_))); + result = b->AddInstruction(HloInstruction::CreateBinary( + scalar_shape, HloOpcode::kDivide, result, divisor)); + } + return result; +} + +int64 MultiplyAddDivideOffsetCalculation::MaxInRange( + int64 start_ordinal, int64 limit_ordinal) const { + int64 max = Calculate(start_ordinal); + for (int64 i = start_ordinal + 1; i < limit_ordinal; ++i) { + max = std::max(max, Calculate(i)); + } + return max; +} + +OffsetCalculation& OffsetCalculation::operator=( + const OffsetCalculation& other) { + opcode_ = other.opcode_; + copy_from_ = other.copy_from_; + if (opcode_ != HloOpcode::kCopy) { + lhs_ = absl::make_unique(*other.lhs_); + rhs_ = absl::make_unique(*other.rhs_); + } + return *this; +} + +bool OffsetCalculation::IsConstant() const { + if (opcode_ == HloOpcode::kCopy) { + return copy_from_.IsConstant(); + } + if (opcode_ == HloOpcode::kSubtract && *lhs_ == *rhs_) { + return true; + } + return lhs_->IsConstant() && rhs_->IsConstant(); +} + +OffsetCalculation OffsetCalculation::operator-( + const OffsetCalculation& other) const { + if (opcode_ == HloOpcode::kCopy && other.opcode_ == HloOpcode::kCopy) { + return copy_from_ - other.copy_from_; + } + return OffsetCalculation(HloOpcode::kSubtract, *this, other); +} + +bool OffsetCalculation::operator==(const OffsetCalculation& other) const { + if (opcode_ != other.opcode_) { + return false; + } + if (opcode_ == HloOpcode::kCopy) { + return copy_from_ == other.copy_from_; + } + return *lhs_ == *other.lhs_ && *rhs_ == *other.rhs_; +} + +int64 OffsetCalculation::Calculate(int64 shard_ordinal) const { + switch (opcode_) { + case HloOpcode::kCopy: + return copy_from_.Calculate(shard_ordinal); + case HloOpcode::kSubtract: + return lhs_->Calculate(shard_ordinal) - rhs_->Calculate(shard_ordinal); + case HloOpcode::kMultiply: + return lhs_->Calculate(shard_ordinal) * rhs_->Calculate(shard_ordinal); + default: + LOG(FATAL) << "Should not happen"; + } +} + +HloInstruction* OffsetCalculation::Calculate(HloInstruction* shard_ordinal, + SpmdBuilder* b) const { + if (opcode_ == HloOpcode::kCopy) { + return copy_from_.Calculate(shard_ordinal, b); + } + auto lhs = lhs_->Calculate(shard_ordinal, b); + auto rhs = rhs_->Calculate(shard_ordinal, b); + return b->AddInstruction( + 
HloInstruction::CreateBinary(lhs->shape(), opcode_, lhs, rhs));
+}
+
+int64 OffsetCalculation::MaxInRange(int64 start_ordinal,
+                                    int64 limit_ordinal) const {
+  if (IsConstant()) {
+    return Calculate(start_ordinal);
+  }
+  if (opcode_ == HloOpcode::kCopy) {
+    return std::max(Calculate(start_ordinal), Calculate(limit_ordinal - 1));
+  }
+  int64 max = Calculate(start_ordinal);
+  for (int64 i = start_ordinal + 1; i < limit_ordinal; ++i) {
+    max = std::max(max, Calculate(i));
+  }
+  return max;
+}
+
+absl::optional<HloInstruction*> ExchangeHalo(
+    HloInstruction* hlo, const OffsetCalculation& left_halo_size_function,
+    const OffsetCalculation& right_halo_size_function, int64 dim,
+    const HloSharding& target,
+    const SPMDCollectiveOpsCreator& collective_ops_creator,
+    int64* next_channel_id, SpmdBuilder* b) {
+  int64 input_shard_size = hlo->shape().dimensions(dim);
+  int64 shard_count = target.tile_assignment().dim(dim);
+
+  std::vector<HloInstruction*> concat_pieces;
+
+  int64 max_left_halo_size = left_halo_size_function.MaxInRange(1, shard_count);
+  if (max_left_halo_size > input_shard_size) {
+    VLOG(1) << "ExchangeHalo failed: halo is beyond the left neighbor.";
+    return absl::nullopt;
+  }
+  if (max_left_halo_size > 0) {
+    std::vector<std::pair<int64, int64>> source_target_pairs;
+    target.tile_assignment().Each(
+        [&](absl::Span<const int64> indices, int64 device) {
+          if (indices[dim] > 0) {
+            std::vector<int64> source_indices(indices.begin(), indices.end());
+            source_indices[dim] -= 1;
+            source_target_pairs.emplace_back(
+                target.tile_assignment()(source_indices), device);
+          }
+        });
+    auto halo_shape = hlo->shape();
+    auto source_halo_slice = hlo;
+    if (max_left_halo_size != hlo->shape().dimensions(dim)) {
+      halo_shape.set_dimensions(dim, max_left_halo_size);
+      std::vector<int64> halo_start_indices(halo_shape.rank(), 0);
+      halo_start_indices[dim] =
+          hlo->shape().dimensions(dim) - max_left_halo_size;
+      std::vector<int64> halo_slice_strides(halo_shape.rank(), 1);
+
+      source_halo_slice = b->AddInstruction(
+          hlo->CreateSlice(halo_shape, hlo, halo_start_indices,
+                           hlo->shape().dimensions(), halo_slice_strides));
+    }
+    auto left_halo =
+        collective_ops_creator.create_cross_partition_collective_permute(
+            b, source_halo_slice, source_target_pairs, (*next_channel_id)++);
+    concat_pieces.push_back(left_halo);
+  }
+
+  concat_pieces.push_back(hlo);
+
+  // Right halo.
+  int64 max_right_halo_size =
+      right_halo_size_function.MaxInRange(0, shard_count - 1);
+  if (max_right_halo_size > input_shard_size) {
+    VLOG(1) << "ExchangeHalo failed: halo is beyond the right neighbor.";
+    return absl::nullopt;
+  }
+  if (max_right_halo_size > 0) {
+    std::vector<std::pair<int64, int64>> source_target_pairs;
+    target.tile_assignment().Each(
+        [&](absl::Span<const int64> indices, int64 device) {
+          if (indices[dim] > 0) {
+            std::vector<int64> target_indices(indices.begin(), indices.end());
+            target_indices[dim] -= 1;
+            source_target_pairs.emplace_back(
+                device, target.tile_assignment()(target_indices));
+          }
+        });
+    auto halo_shape = hlo->shape();
+    halo_shape.set_dimensions(dim, max_right_halo_size);
+    std::vector<int64> halo_start_indices(halo_shape.rank(), 0);
+    std::vector<int64> halo_slice_strides(halo_shape.rank(), 1);
+
+    auto source_halo_slice = b->AddInstruction(
+        hlo->CreateSlice(halo_shape, hlo, halo_start_indices,
+                         halo_shape.dimensions(), halo_slice_strides));
+    auto right_halo =
+        collective_ops_creator.create_cross_partition_collective_permute(
+            b, source_halo_slice, source_target_pairs, (*next_channel_id)++);
+    concat_pieces.push_back(right_halo);
+  }
+
+  auto concat = hlo;
+  // Concat with halos/padding.
+ if (concat_pieces.size() > 1) { + auto concat_shape = hlo->shape(); + int64 concat_dim_size = 0; + for (auto piece : concat_pieces) { + concat_dim_size += piece->shape().dimensions(dim); + } + concat_shape.set_dimensions(dim, concat_dim_size); + concat = b->AddInstruction( + HloInstruction::CreateConcatenate(concat_shape, concat_pieces, dim)); + } + + return concat; +} + +absl::optional ExchangeHalo( + HloInstruction* hlo, + std::vector left_halo_size_functions, + std::vector right_halo_size_functions, + const HloSharding& target, + const SPMDCollectiveOpsCreator& collective_ops_creator, + int64* next_channel_id, SpmdBuilder* b) { + CHECK(left_halo_size_functions.size() == hlo->shape().rank()); + CHECK(right_halo_size_functions.size() == hlo->shape().rank()); + + HloInstruction* visiting_hlo = hlo; + for (int dim = 0; dim < hlo->shape().rank(); ++dim) { + auto concat = ExchangeHalo(visiting_hlo, left_halo_size_functions[dim], + right_halo_size_functions[dim], dim, target, + collective_ops_creator, next_channel_id, b); + if (!concat) { + return absl::nullopt; + } + visiting_hlo = *concat; + } + return visiting_hlo; +} + +absl::optional ExchangeHaloAndGetValidData( + HloInstruction* hlo, const Shape& base_shape, + const OffsetCalculation& left_halo_size_function, + const OffsetCalculation& right_halo_size_function, + int64 explicit_left_padding_on_full_shape, int64 padded_full_shape_size, + int64 shard_size_with_halo, int64 dim, const HloSharding& target, + HloInstruction* offset_on_padded_shape, HloInstruction* pad_value, + HloInstruction* partition_ordinal, + const SPMDCollectiveOpsCreator& collective_ops_creator, + int64* next_channel_id, SpmdBuilder* b, bool mask_invalid_region) { + auto halo_exchange_result = + ExchangeHalo(hlo, left_halo_size_function, right_halo_size_function, dim, + target, collective_ops_creator, next_channel_id, b); + if (!halo_exchange_result) { + return absl::nullopt; + } + auto concat = *halo_exchange_result; + int64 shard_count = target.tile_assignment().dim(dim); + int64 max_left_halo_size = left_halo_size_function.MaxInRange(1, shard_count); + + // Now we determine if we need extra padding after the concat. + // + // The max of halo size or the first shard's explicit left padding. + int64 max_left_halo_or_padding_size = + std::max(std::max(int64{0}, max_left_halo_size), + explicit_left_padding_on_full_shape); + // The calculation that returns the dynamic slice index for a shard on the + // padded concat, which is the difference between + // max_left_halo_or_padding_size and its left halo size. + auto start_offset_on_padded_concat_calculation = + OffsetCalculation(MultiplyAddDivideOffsetCalculation( + 0, max_left_halo_or_padding_size, 1)) - + left_halo_size_function; + + // See if we need to pad the concat before dynamic slice. 
+ int64 extra_left_padding = + std::max(int64{0}, max_left_halo_or_padding_size - + std::max(int64{0}, max_left_halo_size)); + int64 extra_right_padding = + start_offset_on_padded_concat_calculation.MaxInRange(0, shard_count) + + shard_size_with_halo - concat->shape().dimensions(dim) - + extra_left_padding; + extra_right_padding = std::max(int64{0}, extra_right_padding); + if (extra_left_padding > 0 || extra_right_padding > 0) { + PaddingConfig padding_config; + auto padded_concat_shape = concat->shape(); + for (int64 i = 0; i < base_shape.rank(); ++i) { + auto padding_config_dim = padding_config.add_dimensions(); + padding_config_dim->set_interior_padding(0); + padding_config_dim->set_edge_padding_low(0); + padding_config_dim->set_edge_padding_high(0); + if (i != dim) { + continue; + } + padding_config_dim->set_edge_padding_low(extra_left_padding); + padding_config_dim->set_edge_padding_high(extra_right_padding); + padded_concat_shape.set_dimensions(dim, concat->shape().dimensions(dim) + + extra_left_padding + + extra_right_padding); + } + concat = b->AddInstruction(HloInstruction::CreatePad( + padded_concat_shape, concat, pad_value, padding_config)); + } + + auto valid_slice = concat; + if (shard_size_with_halo != concat->shape().dimensions(dim)) { + // Concat is bigger than the shard shape, so we need a dynamic slice. + CHECK_LT(shard_size_with_halo, concat->shape().dimensions(dim)); + auto slice_shape = concat->shape(); + slice_shape.set_dimensions(dim, shard_size_with_halo); + + if (left_halo_size_function.IsConstant() && + left_halo_size_function.Calculate(0) == + explicit_left_padding_on_full_shape) { + std::vector start_indices(slice_shape.rank(), 0); + std::vector strides(slice_shape.rank(), 1); + valid_slice = b->AddInstruction( + HloInstruction::CreateSlice(slice_shape, concat, start_indices, + slice_shape.dimensions(), strides)); + } else { + auto zero = b->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::Zero(S32))); + std::vector slice_offsets(base_shape.rank(), zero); + slice_offsets[dim] = start_offset_on_padded_concat_calculation.Calculate( + partition_ordinal, b); + valid_slice = b->AddInstruction(HloInstruction::CreateDynamicSlice( + slice_shape, concat, slice_offsets, slice_shape.dimensions())); + } + } + + if (!mask_invalid_region) { + return valid_slice; + } + + int64 total_right_padding = padded_full_shape_size - + base_shape.dimensions(dim) - + explicit_left_padding_on_full_shape; + // Mask off garbage data due to uneven partition or low/high padding. 
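+  // For example, a base dimension of size 5 split into 2 shards is padded to
+  // size 6; on the second shard, the element at padded index 5 falls outside
+  // the valid range [0, 5) and is replaced with pad_value below.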
+ if (explicit_left_padding_on_full_shape > 0 || total_right_padding > 0) { + auto index_shape = ShapeUtil::ChangeElementType(valid_slice->shape(), S32); + auto iota = b->AddInstruction(HloInstruction::CreateIota(index_shape, dim)); + auto broadcast_start_index_in_padded_shape = + b->AddInstruction(HloInstruction::CreateBroadcast( + index_shape, offset_on_padded_shape, {})); + auto index_in_padded_shape = b->AddInstruction( + HloInstruction::CreateBinary(index_shape, HloOpcode::kAdd, iota, + broadcast_start_index_in_padded_shape)); + auto mask_shape = ShapeUtil::ChangeElementType(index_shape, PRED); + std::vector predicates; + if (explicit_left_padding_on_full_shape > 0) { + auto valid_index_start = + b->AddInstruction(HloInstruction::CreateBroadcast( + index_shape, + b->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0( + explicit_left_padding_on_full_shape))), + {})); + predicates.push_back(b->AddInstruction(HloInstruction::CreateCompare( + mask_shape, index_in_padded_shape, valid_index_start, + ComparisonDirection::kGe))); + } + if (total_right_padding > 0) { + auto valid_index_limit = + b->AddInstruction(HloInstruction::CreateBroadcast( + index_shape, + b->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0( + base_shape.dimensions(dim) + + explicit_left_padding_on_full_shape))), + {})); + predicates.push_back(b->AddInstruction(HloInstruction::CreateCompare( + mask_shape, index_in_padded_shape, valid_index_limit, + ComparisonDirection::kLt))); + } + CHECK(!predicates.empty()); + auto is_valid = + predicates.size() == 2 + ? b->AddInstruction(HloInstruction::CreateBinary( + mask_shape, HloOpcode::kAnd, predicates[0], predicates[1])) + : predicates[0]; + auto masking_value = b->AddInstruction( + HloInstruction::CreateBroadcast(valid_slice->shape(), pad_value, {})); + valid_slice = b->AddInstruction( + HloInstruction::CreateTernary(valid_slice->shape(), HloOpcode::kSelect, + is_valid, valid_slice, masking_value)); + } + return valid_slice; +} + +} // namespace spmd +} // namespace xla diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.h b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.h new file mode 100644 index 00000000000..f96b23d7073 --- /dev/null +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.h @@ -0,0 +1,229 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_SPMD_SPMD_PARTITIONER_UTIL_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_SPMD_SPMD_PARTITIONER_UTIL_H_
+
+#include <memory>
+#include <string>
+
+#include "absl/types/optional.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_sharding.h"
+#include "tensorflow/compiler/xla/service/spmd/spmd_partitioner.h"
+
+namespace xla {
+namespace spmd {
+
+// Returns true if the given sharding contains any replicated sharding.
+bool HasReplicatedSharding(const HloSharding& sharding);
+
+// Creates instructions that produce a zero value of the given shape.
+HloInstruction* CreateZero(const Shape& shape, SpmdBuilder* b);
+
+template <typename NativeT>
+HloInstruction* CreateR0WithType(PrimitiveType type, NativeT value,
+                                 SpmdBuilder* b) {
+  auto literal = LiteralUtil::CreateR0<NativeT>(value)
+                     .ConvertToShape(ShapeUtil::MakeShape(type, {}))
+                     .ValueOrDie();
+  return b->AddInstruction(HloInstruction::CreateConstant(std::move(literal)));
+}
+
+// Creates a binary add computation of the given type and adds it to the
+// module.
+HloComputation* MakeBinaryAdd(PrimitiveType type, HloModule* module);
+
+// Returns true if the shape can be evenly partitioned for the given sharding.
+// All tile-sharded dimensions must be evenly divisible and there must be no
+// single-device sharding. Replicated sharding is considered evenly
+// partitioned.
+bool EvenlyPartitions(const Shape& shape, const HloSharding& sharding);
+
+// Returns the shard shape of the given shape when it is partitioned for the
+// target sharding.
+Shape MakePartitionedShape(const Shape& shape, const HloSharding& sharding);
+
+// Returns the shard shape for a partition without padding due to uneven
+// sharding.
+Shape MakeNonPaddedShapeForGivenPartition(const Shape& shape,
+                                          const HloSharding& sharding,
+                                          int64 partition_id);
+
+// Generates the HLO instructions that represent the dimension offsets on any
+// device. The size of the returned vector is the rank of the given shape.
+std::vector<HloInstruction*> MakePartitionOffsets(const Shape& shape,
+                                                  const HloSharding& sharding,
+                                                  HloInstruction* partition_id,
+                                                  SpmdBuilder* b);
+
+// Returns the offsets of the partition in the tile assignment.
+std::vector<HloInstruction*> MakeTiledPartitionOrdinals(
+    const HloSharding& sharding, HloInstruction* partition_id, SpmdBuilder* b);
+
+// Pads hlo to the desired shape using high padding. Either a builder or a
+// computation needs to be supplied, but not both.
+HloInstruction* PadToShape(HloInstruction* hlo, const Shape& padded_shape,
+                           SpmdBuilder* b,
+                           HloComputation* computation = nullptr);
+
+// Returns the padded shape when combining all partitions.
+Shape GetPaddedShapeForUnevenPartitioning(const Shape& base_shape,
+                                          const HloSharding& sharding);
+
+// Pads the HLO (with base shape) for uneven tiled partition to make it evenly
+// partitionable.
+HloInstruction* PadBaseShapeBeforeUnevenTiledSharding(
+    HloInstruction* hlo, const HloSharding& sharding, SpmdBuilder* b);
+
+// Returns the index of the unique tile dimension. Returns absl::nullopt if the
+// given sharding is not tiled or is tiled along multiple dimensions.
+absl::optional<int64> UniqueTiledDim(const HloSharding& sharding);
+
+// Utilities for symbolic offset calculation and halo exchange.
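+//
+// For example, MultiplyAddDivideOffsetCalculation(1, -2, 1) maps shard
+// ordinal i to (i * 1 + (-2)) / 1 = i - 2, a typical symbolic form for a
+// per-shard offset or halo size.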
+class OffsetCalculation;
+
+// Represents a calculation over integers:
+// (shard_ordinal * multiplier + offset) / divisor
+class MultiplyAddDivideOffsetCalculation {
+ public:
+  MultiplyAddDivideOffsetCalculation()
+      : multiplier_(0), offset_(0), divisor_(1) {}
+  MultiplyAddDivideOffsetCalculation(int64 multiplier, int64 offset,
+                                     int64 divisor);
+
+  OffsetCalculation operator-(
+      const MultiplyAddDivideOffsetCalculation& other) const;
+
+  bool operator==(const MultiplyAddDivideOffsetCalculation& other) const {
+    return multiplier_ == other.multiplier_ && offset_ == other.offset_ &&
+           divisor_ == other.divisor_;
+  }
+
+  bool IsConstant() const { return multiplier_ == 0; }
+  void Simplify();
+  int64 Calculate(int64 shard_ordinal) const;
+  HloInstruction* Calculate(HloInstruction* shard_ordinal,
+                            SpmdBuilder* b) const;
+
+  // Returns the maximum result for shard ordinals in the range
+  // [start_ordinal, limit_ordinal).
+  int64 MaxInRange(int64 start_ordinal, int64 limit_ordinal) const;
+
+ private:
+  int64 multiplier_;
+  int64 offset_;
+  int64 divisor_;
+};
+
+// Represents a calculation over integers that combines the results of other
+// calculations with an opcode. If the opcode is kCopy, it simply wraps a
+// MultiplyAddDivideOffsetCalculation.
+class OffsetCalculation {
+ public:
+  OffsetCalculation() : opcode_(HloOpcode::kCopy), copy_from_() {}
+  explicit OffsetCalculation(
+      const MultiplyAddDivideOffsetCalculation& copy_from)
+      : opcode_(HloOpcode::kCopy), copy_from_(copy_from) {}
+  OffsetCalculation(const OffsetCalculation& copy_from) { *this = copy_from; }
+  OffsetCalculation(HloOpcode opcode,
+                    const MultiplyAddDivideOffsetCalculation& lhs,
+                    const MultiplyAddDivideOffsetCalculation& rhs)
+      : opcode_(opcode),
+        lhs_(absl::make_unique<OffsetCalculation>(lhs)),
+        rhs_(absl::make_unique<OffsetCalculation>(rhs)) {}
+  OffsetCalculation(HloOpcode opcode, const OffsetCalculation& lhs,
+                    const OffsetCalculation& rhs)
+      : opcode_(opcode),
+        lhs_(absl::make_unique<OffsetCalculation>(lhs)),
+        rhs_(absl::make_unique<OffsetCalculation>(rhs)) {}
+
+  OffsetCalculation& operator=(const OffsetCalculation& other);
+
+  // Returns whether the calculation returns the same value for all shards.
+  // This is conservative and could return false even if it is actually
+  // constant.
+  bool IsConstant() const;
+
+  OffsetCalculation operator-(const OffsetCalculation& other) const;
+  bool operator==(const OffsetCalculation& other) const;
+  int64 Calculate(int64 shard_ordinal) const;
+  HloInstruction* Calculate(HloInstruction* shard_ordinal,
+                            SpmdBuilder* b) const;
+
+  // Returns the maximum result for shard ordinals in the range
+  // [start_ordinal, limit_ordinal).
+  int64 MaxInRange(int64 start_ordinal, int64 limit_ordinal) const;
+
+ private:
+  HloOpcode opcode_;
+  std::unique_ptr<OffsetCalculation> lhs_;
+  std::unique_ptr<OffsetCalculation> rhs_;
+  MultiplyAddDivideOffsetCalculation copy_from_;
+};
+
+// Performs halo exchange on the given dimension based on the provided
+// left/right halo size functions. Returns nullopt if the halo is beyond the
+// direct neighbor of the shard.
+absl::optional<HloInstruction*> ExchangeHalo(
+    HloInstruction* hlo, const OffsetCalculation& left_halo_size_function,
+    const OffsetCalculation& right_halo_size_function, int64 dim,
+    const HloSharding& target,
+    const SPMDCollectiveOpsCreator& collective_ops_creator,
+    int64* next_channel_id, SpmdBuilder* b);
+
+// Exchanges halos on all dimensions of the HLO. Returns nullopt if any one of
+// the dimensions fails to exchange halo (halo is beyond the neighbor shard).
+absl::optional ExchangeHalo( + HloInstruction* hlo, + std::vector left_halo_size_functions, + std::vector right_halo_size_functions, + const HloSharding& target, + const SPMDCollectiveOpsCreator& collective_ops_creator, + int64* next_channel_id, SpmdBuilder* b); + +// Exchanges halos and performs pad/dynamic-slice on the concatenated data such +// that the result starts with the first needed element on each shard. It also +// masks off invalid data due to padding. +// Arguments: +// hlo: the HLO op before halo exchange +// explicit_left_padding_on_full_shape: the amount of left padding to be added +// explicitly by this function on the base shape before partitioning. Without +// base dilation, this is usually set to the window's padding_low so that the +// sharded op do not need to add padding_low on the window; however, with base +// dilation, this could only be set to a custom size. +// padded_full_shape_size: the size of the padded full shape on the given +// dimension, which includes explicit_left_padding_on_full_shape and required +// right padding to make the shape evenly shardable. +// shard_size_with_halo: the shard size on the dimension after halo exchange. +// If different shards have different sizes, use the maximum size. +// offset_on_padded_shape: the offset HLO (S32) that represents the start of +// each shard on the padded full shape. +// pad_value: the padding value used on the full shape. +absl::optional ExchangeHaloAndGetValidData( + HloInstruction* hlo, const Shape& base_shape, + const OffsetCalculation& left_halo_size_function, + const OffsetCalculation& right_halo_size_function, + int64 explicit_left_padding_on_full_shape, int64 padded_full_shape_size, + int64 shard_size_with_halo, int64 dim, const HloSharding& target, + HloInstruction* offset_on_padded_shape, HloInstruction* pad_value, + HloInstruction* partition_ordinal, + const SPMDCollectiveOpsCreator& collective_ops_creator, + int64* next_channel_id, SpmdBuilder* b, bool mask_invalid_region = true); + +} // namespace spmd +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_SPMD_SPMD_PARTITIONER_UTIL_H_ diff --git a/tensorflow/compiler/xla/service/transfer_manager.h b/tensorflow/compiler/xla/service/transfer_manager.h index e5fa8ebae53..e3f8ceacc42 100644 --- a/tensorflow/compiler/xla/service/transfer_manager.h +++ b/tensorflow/compiler/xla/service/transfer_manager.h @@ -31,6 +31,7 @@ limitations under the License. #include "tensorflow/core/platform/stream_executor_no_cuda.h" #include "tensorflow/core/platform/thread_annotations.h" #include "tensorflow/core/platform/types.h" +#include "tensorflow/stream_executor/device_memory.h" namespace xla { @@ -256,6 +257,13 @@ class TransferManager { return false; } + // Equivalent to CanShapedBufferBeAccessedNow but for a single device buffer. + virtual bool CanBufferBeAccessedNow( + se::StreamExecutor* executor, + const se::DeviceMemoryBase& device_buffer) const { + return false; + } + ///// // The TransferManager class also serves as a point to register objects for // the various platforms. 
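A minimal sketch of how the OffsetCalculation utilities declared above compose when deriving halo sizes. The function name ExampleHaloSizes and the concrete numbers are illustrative assumptions rather than code from this change, and CHECK_EQ is assumed to be available through the usual TensorFlow logging headers.

// Illustrative only: symbolic left-halo size for a 1-D sharded dimension.
#include "tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.h"

namespace xla {
namespace spmd {

void ExampleHaloSizes() {
  // A dimension of 32 elements is split into 4 shards of size 8. Suppose a
  // windowed op makes each shard also read 2 elements owned by its left
  // neighbor (e.g., the window extends 2 elements to the left).
  OffsetCalculation start_of_shard(
      MultiplyAddDivideOffsetCalculation(/*multiplier=*/8, /*offset=*/0,
                                         /*divisor=*/1));
  OffsetCalculation first_needed_element(
      MultiplyAddDivideOffsetCalculation(/*multiplier=*/8, /*offset=*/-2,
                                         /*divisor=*/1));
  // left_halo(i) = start_of_shard(i) - first_needed_element(i) = 2 for all i.
  OffsetCalculation left_halo = start_of_shard - first_needed_element;
  CHECK_EQ(left_halo.Calculate(/*shard_ordinal=*/1), 2);
  // The maximum over shards 1..3 is what ExchangeHalo uses to size the
  // collective-permuted slice taken from the left neighbor.
  CHECK_EQ(left_halo.MaxInRange(/*start_ordinal=*/1, /*limit_ordinal=*/4), 2);
}

}  // namespace spmd
}  // namespace xla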
diff --git a/tensorflow/compiler/xla/service/tuple_util.h b/tensorflow/compiler/xla/service/tuple_util.h index bc5aac09f27..ee7b8be0818 100644 --- a/tensorflow/compiler/xla/service/tuple_util.h +++ b/tensorflow/compiler/xla/service/tuple_util.h @@ -39,6 +39,13 @@ class TupleUtil { static HloInstruction* AppendSuffix( HloInstruction* input_tuple, absl::Span trailing_values); + + // Generates HLO instructions that duplicates the tuple by inserting + // get-tuple-elements and a new tuple instruction. Returns the root of the + // graph of instructions generated. + static HloInstruction* Duplicate(HloInstruction* input_tuple) { + return ExtractPrefix(input_tuple, input_tuple->shape().tuple_shapes_size()); + } }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc index 2d33184b7d0..1111811d3a3 100644 --- a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc +++ b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc @@ -300,7 +300,7 @@ WhileLoopInvariantCodeMotion::TryHoistingInvariantInstructionsFromWhileBody( } StatusOr WhileLoopInvariantCodeMotion::Run(HloModule* module) { - VLOG(2) << "HLO module before WhileLoopConstantSinking:"; + VLOG(2) << "HLO module before WhileLoopInvariantCodeMotion:"; XLA_VLOG_LINES(2, module->ToString()); bool changed = false; @@ -332,10 +332,10 @@ StatusOr WhileLoopInvariantCodeMotion::Run(HloModule* module) { } if (changed) { - VLOG(2) << "HLO module after WhileLoopConstantSinking:"; + VLOG(2) << "HLO module after WhileLoopInvariantCodeMotion:"; XLA_VLOG_LINES(2, module->ToString()); } else { - VLOG(2) << "HLO module unchanged after WhileLoopConstantSinking"; + VLOG(2) << "HLO module unchanged after WhileLoopInvariantCodeMotion"; } return changed; diff --git a/tensorflow/compiler/xla/service/while_loop_simplifier.cc b/tensorflow/compiler/xla/service/while_loop_simplifier.cc index 1b29da0660a..c80123bcd50 100644 --- a/tensorflow/compiler/xla/service/while_loop_simplifier.cc +++ b/tensorflow/compiler/xla/service/while_loop_simplifier.cc @@ -496,14 +496,43 @@ static StatusOr TryRemoveWhileLoop(HloInstruction* while_op) { // Transform while loops with static trip count of 1 into a call op, then // inline the call. if (trip_count && *trip_count == 1) { - auto computation = while_op->parent(); - auto call_op = computation->AddInstruction(HloInstruction::CreateCall( - while_op->shape(), while_op->operands(), while_op->while_body())); - TF_RETURN_IF_ERROR(computation->ReplaceInstruction(while_op, call_op)); - TF_ASSIGN_OR_RETURN(auto inlined_instructions_map, - CallInliner::Inline(call_op)); - (void)inlined_instructions_map; - return true; + // Do not simplify the loop away when there is a side-effectful op, + // otherwise the infeed op may not inherit the data dependency from + // the while loop. + // + // Example: while_body (param_a) { + // param_a = parameter(0) + // infeed2 = infeed() + // } + // + // infeed1 = ... + // while = while(infeed1), body=while_body // infeed2 has implicit + // dependency on infeed1. + // + // After simplification: + // + // infeed1 = ... + // infeed2 = infeed() // no dependency between infeed1 and infeed2. infeed1 + // // can be scheduled after infeed2. 
+ // + bool has_side_effects = absl::c_any_of( + while_op->called_computations(), [](const HloComputation* computation) { + return computation->HasSideEffect(); + }); + if (!has_side_effects) { + auto computation = while_op->parent(); + auto call_op = computation->AddInstruction(HloInstruction::CreateCall( + while_op->shape(), while_op->operands(), while_op->while_body())); + TF_RETURN_IF_ERROR(computation->ReplaceInstruction(while_op, call_op)); + TF_ASSIGN_OR_RETURN(auto inlined_instructions_map, + CallInliner::Inline(call_op)); + (void)inlined_instructions_map; + return true; + } else { + VLOG(2) << "Not attempting to simplify while loop because it contains a " + "side-effecting node: " + << while_op->ToShortString(); + } } return false; } @@ -1014,35 +1043,6 @@ StatusOr WhileLoopSimplifier::Run(HloModule* module) { continue; } - // Do not simplify the loop away when there is a side-effectful op, - // otherwise the infeed op may not inherit the data dependency from - // the while loop. - // - // Example: while_body (param_a) { - // param_a = parameter(0) - // infeed2 = infeed() - // } - // - // infeed1 = ... - // while = while(infeed1), body=while_body // infeed2 has implicit - // dependency on infeed1. - // - // After simplification: - // - // infeed1 = ... - // infeed2 = infeed() // no dependency between infeed1 and infeed2. infeed1 - // // can be scheduled after infeed2. - // - bool has_side_effects = absl::c_any_of( - while_op->called_computations(), [](const HloComputation* computation) { - return computation->HasSideEffect(); - }); - if (has_side_effects) { - VLOG(2) << "Not attempting to simplify while loop because it contains a " - "side-effecting node: " - << while_op->ToShortString(); - continue; - } TF_ASSIGN_OR_RETURN(bool result, TryPropagateConstant(while_op)); changed |= result; diff --git a/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc b/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc index b5f9d0ce9de..d715fb3857a 100644 --- a/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc +++ b/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc @@ -444,6 +444,47 @@ TEST_F(WhileLoopSimplifierTest, RemoveUnusedLoopOperands) { op::GetTupleElement(op::Parameter(0), /*tuple_index=*/1))); } +// Check that we can remove unused loop operands even if the loop contains a +// side-effecting instruction. +TEST_F(WhileLoopSimplifierTest, + RemoveUnusedLoopOperandsDespiteSideEffectingOps) { + const string hlo_string = R"( + HloModule RemoveUnusedOperands + body { + loop_var = (s32[]) parameter(0) + gte0 = s32[] get-tuple-element(loop_var), index=0 + token0 = token[] after-all() + unused = ((s32[], pred[]), token[]) infeed(token0) + ROOT tuple = (s32[]) tuple(gte0) + } + cond { + loop_var = (s32[]) parameter(0) + ROOT constant = pred[] constant(true) + } + ENTRY RemoveUnusedOperands { + x = s32[] parameter(0) + tuple.1 = (s32[]) tuple(s32[] x) + ROOT while = (s32[]) while((s32[]) tuple.1), + condition=cond, body=body + } + )"; + + auto m = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); + EXPECT_TRUE(WhileLoopSimplifier().Run(m.get()).ValueOrDie()); + + // The original while instruction is still left in the module as a dead + // instruction, find a while instruction with a different name as the new + // while instruction. 
+ const auto& instrs = m->entry_computation()->instructions(); + HloInstruction* new_while_op = + *absl::c_find_if(instrs, [&](const HloInstruction* instr) { + return (instr->opcode() == HloOpcode::kWhile && + instr->name() != "while"); + }); + EXPECT_TRUE(ShapeUtil::IsEmptyTuple(new_while_op->shape())) + << new_while_op->shape().ToString(); +} + TEST_F(WhileLoopSimplifierTest, LoopWithNonTupleBodyShapeNotSimplified) { const string hlo_string = R"( HloModule BodyHasNonTupleRoot diff --git a/tensorflow/compiler/xla/service/while_util.cc b/tensorflow/compiler/xla/service/while_util.cc index e5d64b20f0f..f2c4f7ffed2 100644 --- a/tensorflow/compiler/xla/service/while_util.cc +++ b/tensorflow/compiler/xla/service/while_util.cc @@ -125,8 +125,9 @@ WhileUtil::MakeInstructionsLiveIn( // We want to get rid of the old while instruction even if it has side // effecting operations so we do a manual HloComputation::RemoveInstruction // instead of relying on HloComputation::ReplaceInstruction. - TF_RETURN_IF_ERROR(while_instr->ReplaceAllUsesWith(TupleUtil::ExtractPrefix( - new_while, while_instr->shape().tuple_shapes_size()))); + HloInstruction* replacement_instr = TupleUtil::ExtractPrefix( + new_while, while_instr->shape().tuple_shapes_size()); + TF_RETURN_IF_ERROR(while_instr->ReplaceAllUsesWith(replacement_instr)); TF_RETURN_IF_ERROR(containing_computation->RemoveInstruction(while_instr)); HloInstruction* while_body_param = new_while_body->parameter_instruction(0); @@ -142,6 +143,7 @@ WhileUtil::MakeInstructionsLiveIn( WhileUtil::MakeInstructionsLiveInResult result; result.new_while_instr = new_while; + result.replacement_instr = replacement_instr; result.while_body_live_in_values = std::move(live_in_instructions); result.while_body_instruction_map = std::move(inlined_instructions_map); diff --git a/tensorflow/compiler/xla/service/while_util.h b/tensorflow/compiler/xla/service/while_util.h index cba41ccd8b1..b4b9d296974 100644 --- a/tensorflow/compiler/xla/service/while_util.h +++ b/tensorflow/compiler/xla/service/while_util.h @@ -29,6 +29,10 @@ class WhileUtil { // The new while operation that has the requested values live in. HloInstruction* new_while_instr; + // The new tuple instruction that replaced the original while instruction + // with the same shape. + HloInstruction* replacement_instr; + // The i'th element of `while_body_live_in_values` is an instruction in the // while body that holds the i'th *newly added* live in value at runtime. std::vector while_body_live_in_values; diff --git a/tensorflow/compiler/xla/shape.h b/tensorflow/compiler/xla/shape.h index 2793ddfc1ae..dfaac677724 100644 --- a/tensorflow/compiler/xla/shape.h +++ b/tensorflow/compiler/xla/shape.h @@ -63,6 +63,8 @@ class Shape { // shapes are traversed recursively. bool is_static() const; + bool is_dynamic() const { return !is_static(); } + // Returns true if the given dimension is dynamically-sized. bool is_dynamic_dimension(int dimension) const { return dynamic_dimensions_.at(dimension); diff --git a/tensorflow/compiler/xla/shape_util.cc b/tensorflow/compiler/xla/shape_util.cc index 22ee5a16a30..52cbb8f95ac 100644 --- a/tensorflow/compiler/xla/shape_util.cc +++ b/tensorflow/compiler/xla/shape_util.cc @@ -22,6 +22,7 @@ limitations under the License. 
#include #include +#include "absl/algorithm/container.h" #include "absl/container/inlined_vector.h" #include "absl/strings/ascii.h" #include "absl/strings/numbers.h" @@ -150,6 +151,19 @@ StatusOr MakeShapeWithLayoutInternal( return equal; } +/* static */ bool ShapeUtil::EqualStructure(const Shape& lhs, + const Shape& rhs) { + bool equal = true; + ForEachSubshape(lhs, [&](const Shape& /*subshape*/, const ShapeIndex& index) { + equal &= IndexIsValid(rhs, index); + }); + ForEachSubshape(rhs, [&](const Shape& /*subshape*/, const ShapeIndex& index) { + equal &= IndexIsValid(lhs, index); + }); + + return equal; +} + /* static */ int64 ShapeUtil::TrueRank(const Shape& shape) { int64 accum = 0; for (int64 dimension : shape.dimensions()) { @@ -261,6 +275,12 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout( return ValidateShape(*shape); } +/* static */ Shape ShapeUtil::MakeStaticShape(const Shape& original) { + Shape result = original; + result.clear_dynamic_dimensions(); + return result; +} + /* static */ Shape ShapeUtil::MakeTupleShape(absl::Span shapes) { Shape result; result.set_element_type(TUPLE); @@ -626,8 +646,7 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout( if (shape.element_type() == TUPLE) { return ByteSizeOfTupleIndexTable(shape, pointer_size); } else if (shape.IsArray()) { - int64 byte_size = ByteSizeOfElements(shape); - return byte_size; + return ByteSizeOfElements(shape); } else if (shape.element_type() == TOKEN) { return 0; } else if (shape.element_type() == OPAQUE_TYPE) { @@ -1441,6 +1460,19 @@ ShapeUtil::ReshapeLeavesDimensionsUnmodified( return shape; } +/* static */ bool ShapeUtil::DynamicShapeIsCompatible( + const xla::Shape& dynamic_shape, const xla::Shape& bounded_shape) { + if (dynamic_shape.rank() != bounded_shape.rank()) { + return false; + } + for (int64 i = 0; i < dynamic_shape.rank(); ++i) { + if (dynamic_shape.dimensions(i) > bounded_shape.dimensions(i)) { + return false; + } + } + return true; +} + /* static */ Shape ShapeUtil::FilterDimensions( const std::function& p, Shape shape) { CHECK(shape.IsArray()); diff --git a/tensorflow/compiler/xla/shape_util.h b/tensorflow/compiler/xla/shape_util.h index 7e05e17865d..dde56587482 100644 --- a/tensorflow/compiler/xla/shape_util.h +++ b/tensorflow/compiler/xla/shape_util.h @@ -298,6 +298,16 @@ class ShapeUtil { // As Equal, but allow one of lhs and rhs to be F16 while the other is F32. static bool EqualIgnoringFpPrecision(const Shape& lhs, const Shape& rhs); + // Two shapes have the same structure if all subshape indices of lhs are + // present in rhs and vice versa. + // A nested tuple shape of (F32, (S32[2], F32[2, 2])) is structurally equal to + // (S32, (F32[3], S32[2])) as their structures are both (,(,)). + // + // In contrast, (F32, (F32, F32)) is structurally different from + // ((F32, F32), F32) as the former has structure (,(,)) while the latter has + // ((,),). + static bool EqualStructure(const Shape& lhs, const Shape& rhs); + // Returns the number of dimensions for which the dimension is not (trivially) // 1. e.g., f32[2x1x1] has a true rank of 1D, the other dimensions are just // fluff. Note that zero dimensions are included in the true rank, e.g., @@ -339,6 +349,9 @@ class ShapeUtil { // element type changed to type. static Shape ChangeElementType(const Shape& original, PrimitiveType type); + // Returns a shape with the same dimensions but with all dimensions set to static. 
+ static Shape MakeStaticShape(const Shape& original); + // Creates a tuple shape from a slice of element shapes within the tuple. static Shape MakeTupleShape(absl::Span shapes); @@ -643,12 +656,16 @@ class ShapeUtil { static Shape FilterDimensions(const std::function& p, Shape shape); - // Iterates through all the shape indexes, in minor to major order, starting - // from the base indexes, incrementing by the incr steps, up to count - // (index[i] < base[i] + count[i]), and calls the visitor_function with the - // current index. - // The visitor_function visitor function should return true if it wants to - // continue, or false otherwise. + // Returns true if `dynamic_shape` has dimensions that are less-equal to the + // "bounded_shape". + static bool DynamicShapeIsCompatible(const xla::Shape& dynamic_shape, + const xla::Shape& bounded_shape); + + // Iterates through all the shape indexes, in minor to major order, + // starting from the base indexes, incrementing by the incr steps, up to + // count (index[i] < base[i] + count[i]), and calls the visitor_function + // with the current index. The visitor_function visitor function should + // return true if it wants to continue, or false otherwise. // // visitor_function must be a callable of type // StatusOr(absl::Span) or compatible. diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD index 2d692183338..c8a242c156a 100644 --- a/tensorflow/compiler/xla/tests/BUILD +++ b/tensorflow/compiler/xla/tests/BUILD @@ -1104,6 +1104,7 @@ xla_test( shard_count = 40, tags = [ "no_rocm", + "nozapfhahn", "optonly", ], deps = CONVOLUTION_TEST_DEPS + [ @@ -1304,6 +1305,7 @@ xla_test( xla_test( name = "slice_test", + timeout = "long", srcs = ["slice_test.cc"], shard_count = 40, deps = [ @@ -1499,6 +1501,7 @@ xla_test( srcs = ["select_and_scatter_test.cc"], tags = [ "no_rocm", + "nozapfhahn", "optonly", ], deps = [ @@ -2539,7 +2542,9 @@ xla_test( tags = [ "enable_for_xla_interpreter", "noasan", # sometimes times out, http://b/78650012 + "nomsan", # sometimes times out, http://b/78650012 "notsan", # sometimes times out, http://b/78650012 + "optonly", ], deps = [ ":test_macros_header", diff --git a/tensorflow/compiler/xla/tests/batch_normalization_test.cc b/tensorflow/compiler/xla/tests/batch_normalization_test.cc old mode 100755 new mode 100644 diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.h b/tensorflow/compiler/xla/tests/client_library_test_base.h index 5b83186ffa4..790497f888e 100644 --- a/tensorflow/compiler/xla/tests/client_library_test_base.h +++ b/tensorflow/compiler/xla/tests/client_library_test_base.h @@ -76,6 +76,7 @@ class ClientLibraryTestBase : public ::testing::Test { void SetFastMathDisabled(bool disabled) { auto* opts = execution_options_.mutable_debug_options(); opts->set_xla_cpu_enable_fast_math(!disabled); + opts->set_xla_cpu_enable_fast_min_max(!disabled); opts->set_xla_gpu_enable_fast_min_max(!disabled); } diff --git a/tensorflow/compiler/xla/tests/convolution_test.cc b/tensorflow/compiler/xla/tests/convolution_test.cc index 15d3f7f1cbb..c63f1d0edf3 100644 --- a/tensorflow/compiler/xla/tests/convolution_test.cc +++ b/tensorflow/compiler/xla/tests/convolution_test.cc @@ -2008,6 +2008,47 @@ ENTRY Test { EXPECT_TRUE(RunAndCompare(kHlo, ErrorSpec{0.01, 0.01})); } +XLA_TEST_F(ConvolutionHloTest, SwappedOperandConvolve) { + constexpr char kHlo[] = R"( +HloModule TestModule + +ENTRY Test { + %lhs = f32[3,3,7,7] parameter(0) + %rhs = f32[5,11,11,7] parameter(1) + ROOT %convolution = 
f32[5,21,2,7] convolution(lhs, rhs), + window={size=11x11 pad=3_25x3_6}, + dim_labels=01bf_o01i->f01b +})"; + EXPECT_TRUE(RunAndCompare(kHlo, ErrorSpec{0.01, 0.01})); +} + +XLA_TEST_F(ConvolutionHloTest, SwappedOperandConvolveWithStride) { + constexpr char kHlo[] = R"( +HloModule TestModule + +ENTRY Test { + %lhs = f32[3,3,7,7] parameter(0) + %rhs = f32[5,11,11,7] parameter(1) + ROOT %convolution = f32[5,11,2,7] convolution(lhs, rhs), + window={size=11x11 pad=3_26x3_6 stride=2x1}, + dim_labels=01bf_o01i->f01b +})"; + EXPECT_TRUE(RunAndCompare(kHlo, ErrorSpec{0.01, 0.01})); +} +XLA_TEST_F(ConvolutionHloTest, SwappedOperandConvolve2) { + constexpr char kHlo[] = R"( +HloModule TestModule + +ENTRY Test { + %lhs = f32[3,3,7,7] parameter(0) + %rhs = f32[5,11,11,7] parameter(1) + ROOT %convolution = f32[5,11,4,7] convolution(lhs, rhs), + window={size=11x11 pad=3_25x3_6 lhs_dilate=1x2 rhs_dilate=2x1}, + dim_labels=01bf_o01i->f01b +})"; + EXPECT_TRUE(RunAndCompare(kHlo, ErrorSpec{0.01, 0.01})); +} + XLA_TEST_F(ConvolutionHloTest, TestConv0D) { constexpr char kHlo[] = R"( HloModule TestModule diff --git a/tensorflow/compiler/xla/tests/exhaustive_unary_test_f32_or_smaller.cc b/tensorflow/compiler/xla/tests/exhaustive_unary_test_f32_or_smaller.cc index 0ed79fa0ad8..44e1b7b5a6f 100644 --- a/tensorflow/compiler/xla/tests/exhaustive_unary_test_f32_or_smaller.cc +++ b/tensorflow/compiler/xla/tests/exhaustive_unary_test_f32_or_smaller.cc @@ -352,6 +352,17 @@ UNARY_TEST_FLOAT_32_BITS_OR_LESS(Sqrt, { Run(Sqrt, std::sqrt, error_spec_gen); }) +UNARY_TEST_FLOAT_32_BITS_OR_LESS(Cbrt, { + if (platform_ == "Host" || platform_ == "CUDA") { + ErrorSpecGen error_spec_gen = +[](NativeT x) { + return ErrorSpec{0.01, 0.01}; + }; + Run(Cbrt, std::cbrt, error_spec_gen); + } else { + Run(Cbrt, std::cbrt); + } +}) + // TODO(jlebar): Test trig functions over complex inputs. XLA_TEST_P(ExhaustiveF32UnaryTest, Acosh) { // Error inherited from Log, which our implementation of Acosh uses. diff --git a/tensorflow/compiler/xla/tests/half_test.cc b/tensorflow/compiler/xla/tests/half_test.cc index 74333d66610..566f6559c21 100644 --- a/tensorflow/compiler/xla/tests/half_test.cc +++ b/tensorflow/compiler/xla/tests/half_test.cc @@ -34,7 +34,7 @@ class HalfTestBase : public ClientLibraryTestBase { protected: const ErrorSpec error_spec_{0.001, 0.001}; // Number of elements in the input buffers. 
- static const int kNumElements = 4; + static constexpr int kNumElements = 4; }; using UnaryBuildFuncTy = std::function; diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.cc b/tensorflow/compiler/xla/tests/hlo_test_base.cc old mode 100755 new mode 100644 index 64d586a9514..7b64be5597b --- a/tensorflow/compiler/xla/tests/hlo_test_base.cc +++ b/tensorflow/compiler/xla/tests/hlo_test_base.cc @@ -117,16 +117,18 @@ std::unique_ptr HloTestBase::CreateNewUnverifiedModule( } std::unique_ptr HloTestBase::CreateNewVerifiedModule( - const string& name) { + const string& name, int64 replica_count) { return absl::make_unique( - name, GetModuleConfigForTest(), verifier_layout_sensitive_, + name, GetModuleConfigForTest(replica_count), verifier_layout_sensitive_, allow_mixed_precision_in_hlo_verifier_, backend().compiler()->ShapeSizeBytesFunction()); } StatusOr> -HloTestBase::ParseAndReturnVerifiedModule(absl::string_view hlo_text) { - return ParseAndReturnVerifiedModule(hlo_text, GetModuleConfigForTest()); +HloTestBase::ParseAndReturnVerifiedModule(absl::string_view hlo_text, + int64 replica_count) { + return ParseAndReturnVerifiedModule(hlo_text, + GetModuleConfigForTest(replica_count)); } StatusOr> @@ -163,6 +165,16 @@ PrecisionConfig HloTestBase::DefaultPrecisionConfig(int operands) { return precision_config; } +void HloTestBase::SetAotFastMathDebugOptions(DebugOptions* options) { + options->set_xla_cpu_enable_fast_math(true); + options->set_xla_gpu_enable_fast_min_max(true); + options->set_xla_cpu_enable_fast_min_max(true); + options->set_xla_cpu_fast_math_honor_nans(false); + options->set_xla_cpu_fast_math_honor_infs(false); + options->set_xla_cpu_fast_math_honor_functions(false); + options->set_xla_cpu_fast_math_honor_division(false); +} + DebugOptions HloTestBase::GetDebugOptionsForTest() { auto debug_options = GetDebugOptionsFromFlags(); // TODO(b/38354253): Change tests to use Parameters instead of Constants. diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.h b/tensorflow/compiler/xla/tests/hlo_test_base.h old mode 100755 new mode 100644 index 0b1801ebe23..85b1876dd3c --- a/tensorflow/compiler/xla/tests/hlo_test_base.h +++ b/tensorflow/compiler/xla/tests/hlo_test_base.h @@ -84,11 +84,11 @@ class HloTestBase : public ::testing::Test { // Like CreateNewUnverifiedModule, except the HloModule returned here runs the // HLO verifier on destruction. std::unique_ptr CreateNewVerifiedModule( - const string& name = TestName()); + const string& name = TestName(), int64 replica_count = 1); // Parses the given string and returns module as a VerifiedHloModule. StatusOr> ParseAndReturnVerifiedModule( - absl::string_view hlo_text); + absl::string_view hlo_text, int64 replica_count = 1); StatusOr> ParseAndReturnVerifiedModule( absl::string_view hlo_text, const HloModuleConfig& config); @@ -100,6 +100,10 @@ class HloTestBase : public ::testing::Test { static PrecisionConfig DefaultPrecisionConfig(int operands); + // Sets most fast math options to be enabled to model the fast math flags + // generally used for CPU:AOT compilation. + static void SetAotFastMathDebugOptions(DebugOptions* options); + protected: // This uses the interpreter backend as the reference backend and // automatically finds another supported backend as the test backend. If the @@ -130,9 +134,10 @@ class HloTestBase : public ::testing::Test { virtual DebugOptions GetDebugOptionsForTest(); // Gets an HloModuleConfig with options appropriate for tests. 
- HloModuleConfig GetModuleConfigForTest() { + HloModuleConfig GetModuleConfigForTest(int64 replica_count = 1) { HloModuleConfig config; config.set_debug_options(GetDebugOptionsForTest()); + config.set_replica_count(replica_count); return config; } diff --git a/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc b/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc index 3407a68f709..40e226f9902 100644 --- a/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc +++ b/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc @@ -310,8 +310,7 @@ XLA_TEST_F(VecOpsSimpleTest, ClampTenValuesConstantNonzeroLower) { XLA_TEST_F(VecOpsSimpleTest, ClampFloatEdgeCases) { XlaBuilder builder(TestName()); - mutable_debug_options()->set_xla_cpu_enable_fast_math(false); - mutable_debug_options()->set_xla_gpu_enable_fast_min_max(false); + SetFastMathDisabled(true); auto low = ConstantR1(&builder, {NAN, 1, 1}); auto high = ConstantR1(&builder, {3, NAN, 3}); auto x = ConstantR1(&builder, {2, 2, NAN}); diff --git a/tensorflow/compiler/xla/tests/while_test.cc b/tensorflow/compiler/xla/tests/while_test.cc index 5a482305513..d575bbb1f3e 100644 --- a/tensorflow/compiler/xla/tests/while_test.cc +++ b/tensorflow/compiler/xla/tests/while_test.cc @@ -863,7 +863,7 @@ XLA_TEST_F(WhileTest, WhileWithDynamicUpdateSlice) { // Starts = iteration * 2; auto starts = Mul(iteration, ConstantR0(&builder, 2)); // UpdateSlice. - auto out1 = DynamicUpdateSlice(input, update, starts); + auto out1 = DynamicUpdateSlice(input, update, {starts}); Tuple(&builder, {out0, out1}); body = builder.Build().ConsumeValueOrDie(); diff --git a/tensorflow/compiler/xla/xla.proto b/tensorflow/compiler/xla/xla.proto index f8bd7a0750e..9374b1fca6a 100644 --- a/tensorflow/compiler/xla/xla.proto +++ b/tensorflow/compiler/xla/xla.proto @@ -148,9 +148,20 @@ message DebugOptions { // xla_cpu_enable_fast_math is false. bool xla_cpu_fast_math_honor_functions = 129; + // When false we lower the Minimum and Maximum hlos in the CPU backend such + // that Min(NotNaN, NaN) = Min(NaN, NotNaN) = NaN. In other words, if this + // flag is false we always propagate NaNs through Min and Max. + // + // Note, this does not correspond to the exact same behavior as the gpu flag + // below! + bool xla_cpu_enable_fast_min_max = 140; + // When true we lower the Minimum and Maximum hlos in the GPU backend such // that Min(NotNaN, NaN) = Min(NaN, NotNaN) = NotNaN. In other words, if flag // this is true we don't propagate NaNs through Min and Max. + // + // Note, this does not correspond to the exact same behavior as the cpu flag + // above! bool xla_gpu_enable_fast_min_max = 100; // Allows xla to increase the output precision of floating point operations. @@ -269,7 +280,18 @@ message DebugOptions { bool xla_tpu_detect_nan = 135; bool xla_tpu_detect_inf = 136; - // Next id: 137 + // True if TraceMe annotations are enabled for XLA:CPU. + bool xla_cpu_enable_xprof_traceme = 137; + + // It is usually preferable to not fall back to the driver; it can consume more + // memory or have bugs. + bool xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found = 138; + + // It is usually preferable to not fall back to the driver; it can consume more + // memory or have bugs. + bool xla_gpu_unsafe_fallback_to_driver_on_ptxas_error = 139; + + // Next id: 141 // Extra options to pass to the compilation backend (e.g. LLVM); specific // interpretation of these values is left to the backend. 
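Aside: a standalone sketch of the semantic difference the fast min/max comments above describe. MinNaNPropagating and MinSelect are illustrative stand-ins, not XLA lowerings: the first always propagates NaN, while a bare compare+select drops a NaN or keeps it depending on operand order, which is one reason the CPU and GPU flags are documented as not having exactly the same behavior.

#include <cmath>
#include <iostream>

// NaN-propagating lowering: any NaN input yields NaN (what the CPU backend
// does when xla_cpu_enable_fast_min_max is false).
float MinNaNPropagating(float a, float b) {
  return (std::isnan(a) || std::isnan(b)) ? NAN : (a < b ? a : b);
}

// Plain compare+select ("fast" min): the comparison is false whenever a NaN
// is involved, so the result depends on which operand holds the NaN.
float MinSelect(float a, float b) { return a < b ? a : b; }

int main() {
  std::cout << MinNaNPropagating(NAN, 1.0f) << " "
            << MinNaNPropagating(1.0f, NAN) << "\n";  // nan nan
  std::cout << MinSelect(NAN, 1.0f) << " "
            << MinSelect(1.0f, NAN) << "\n";          // 1 nan
}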
@@ -319,6 +341,13 @@ message ExecutionOptions { // Number of partitions of the computation to run (model parallelism). // If zero, uses the default number of partitions for the XLA service. int32 num_partitions = 9; + + // Used to identify a set of programs that should be launched together. + int32 launch_id = 10; + + // Indicates whether to use SPMD (true) or MPMD (false) partitioning when + // num_partitions > 1 and XLA is requested to partition the input program. + bool use_spmd_partitioning = 11; } message GetDeviceHandlesRequest { diff --git a/tensorflow/compiler/xrt/BUILD b/tensorflow/compiler/xrt/BUILD index d1445144b76..332c8ff9a14 100644 --- a/tensorflow/compiler/xrt/BUILD +++ b/tensorflow/compiler/xrt/BUILD @@ -58,6 +58,7 @@ cc_library( "xrt_state.h", "xrt_util.h", ], + visibility = ["//visibility:public"], deps = [ ":xrt_proto_cc", "//tensorflow/compiler/jit:xla_device", diff --git a/tensorflow/compiler/xrt/kernels/BUILD b/tensorflow/compiler/xrt/kernels/BUILD index 309b4f4c85a..494ba29e981 100644 --- a/tensorflow/compiler/xrt/kernels/BUILD +++ b/tensorflow/compiler/xrt/kernels/BUILD @@ -49,6 +49,7 @@ cc_library( deps = [ ":xrt_state_ops", "//tensorflow/compiler/tf2xla:xla_compiler", + "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", @@ -59,6 +60,7 @@ cc_library( "//tensorflow/compiler/xla/service:compiler", "//tensorflow/compiler/xla/service:computation_placer", "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service/gpu:gpu_executable_run_options", "//tensorflow/compiler/xrt:xrt_compile_ops_op_lib", "//tensorflow/compiler/xrt:xrt_execute_op_op_lib", "//tensorflow/compiler/xrt:xrt_proto_cc", diff --git a/tensorflow/compiler/xrt/kernels/xrt_compile_ops.cc b/tensorflow/compiler/xrt/kernels/xrt_compile_ops.cc index 83b1b4c8a05..ba6e6a093d6 100644 --- a/tensorflow/compiler/xrt/kernels/xrt_compile_ops.cc +++ b/tensorflow/compiler/xrt/kernels/xrt_compile_ops.cc @@ -51,6 +51,46 @@ namespace tensorflow { namespace { +Status GenerateXlaDeviceAssignment( + const xrt::DeviceAssignment& xrt_device_assignment, int num_replicas, + int num_cores_per_replica, xla::DeviceAssignment* device_assignment) { + if (num_cores_per_replica != + xrt_device_assignment.computation_devices_size()) { + return errors::InvalidArgument( + "Device assignment does not have the correct number of " + "computation_devices: num_cores_per_replica=", + num_cores_per_replica, " computation_devices=", + xrt_device_assignment.computation_devices_size()); + } + for (int64 c = 0; c < xrt_device_assignment.computation_devices_size(); ++c) { + const auto& computation_devices = + xrt_device_assignment.computation_devices(c); + if (num_replicas != computation_devices.replica_devices_size()) { + return errors::InvalidArgument( + "Device assignment does not have the correct number of " + "replica_device_ids: num_replicas=", + num_replicas, + " replica_devices=", computation_devices.replica_devices_size()); + } + for (int64 r = 0; r < computation_devices.replica_devices_size(); ++r) { + const auto& coords = computation_devices.replica_devices(r); + if (coords.value_size() != 4) { + return errors::InvalidArgument( + "Device assignment mesh coordinates must have 4 entries, got ", + coords.value_size()); + } + for (int n = 0; n < 3; ++n) { + if (coords.value(n) != 0) { + return errors::InvalidArgument("Mesh coordinate at index ", n, + " must be 0, got ", coords.value(n)); + } + } + 
(*device_assignment)(r, c) = coords.value(3); + } + } + return Status::OK(); +} + class XRTCompileOp : public OpKernel { public: explicit XRTCompileOp(OpKernelConstruction* ctx); @@ -83,14 +123,13 @@ Status XRTCompileOp::Compile(OpKernelContext* ctx, const xrt::XLAComputation& computation_proto, std::unique_ptr* program) { const xrt::XLAComputationConfig& config = computation_proto.config(); + // Sanity checks for options not yet supported. + int num_cores_per_replica = std::max(config.num_cores_per_replica(), 1); + TF_RET_CHECK(num_cores_per_replica == 1); + TF_RET_CHECK(config.per_core_program_shape_size() == 0); // The default config value is 0; treat it as 1 for convenience. int num_replicas = config.num_replicas() ? config.num_replicas() : 1; - TF_RET_CHECK(num_replicas == 1); - int num_cores_per_replica = - config.num_cores_per_replica() ? config.num_cores_per_replica() : 1; - TF_RET_CHECK(num_cores_per_replica == 1); - TF_RET_CHECK(config.per_core_program_shape_size() == 0); // We are guaranteed that the underlying device object won't be deleted out // from under us, while the ScopedRef is live. @@ -119,13 +158,22 @@ Status XRTCompileOp::Compile(OpKernelContext* ctx, argument_layout_ptrs[i] = &argument_layouts[i]; } xla::ExecutableBuildOptions build_options; - build_options.set_device_ordinal(client->default_device_ordinal()); + build_options.set_device_ordinal(device_ref.device_ordinal()); + build_options.set_num_replicas(num_replicas); build_options.set_result_layout(xla::Shape(config.program_shape().result())); build_options.set_device_allocator(device_ref.backend()->memory_allocator()); if (config.has_debug_options()) { *build_options.mutable_debug_options() = BuildXlaDebugOptions(config.debug_options()); } + if (config.has_device_assignment()) { + xla::DeviceAssignment device_assignment(num_replicas, + num_cores_per_replica); + TF_RETURN_IF_ERROR( + GenerateXlaDeviceAssignment(config.device_assignment(), num_replicas, + num_cores_per_replica, &device_assignment)); + build_options.set_device_assignment(device_assignment); + } VLOG(1) << "Building executable"; TF_ASSIGN_OR_RETURN( @@ -158,7 +206,8 @@ void XRTCompileOp::Compute(OpKernelContext* ctx) { OP_REQUIRES_OK(ctx, CompilationCacheKey(computation_proto, &key)); // Process-wide cache of XLA executables. - auto cache_or = GetOrCreateCompilationCache(rm, /*max_number_of_entries=*/0); + auto cache_or = XRTGenericDeviceAccessor::GetOrCreateCompilationCache( + ctx, /*max_number_of_entries=*/0); OP_REQUIRES_OK(ctx, cache_or.status()); auto cache = cache_or.ConsumeValueOrDie(); @@ -211,15 +260,11 @@ void XRTReleaseCompilationRefOp::Compute(OpKernelContext* ctx) { VLOG(1) << "XRTReleaseCompilationRefOp::Compute"; auto timed = monitoring::MakeTimed(xrt_metrics::GetReleaseCompilationCell()); - ResourceMgr* rm; - OP_REQUIRES_OK(ctx, XRTGenericDeviceAccessor::GetResourceManager(ctx, &rm)); - // Process-wide cache of XLA executables. 
- XRTCompilationCache* cache; - OP_REQUIRES_OK(ctx, rm->Lookup( - rm->default_container(), - kXRTCompilationCacheResourceName, &cache)); - core::ScopedUnref cache_unref(cache); + auto cache_or = XRTGenericDeviceAccessor::GetOrCreateCompilationCache( + ctx, /*max_number_of_entries=*/0); + OP_REQUIRES_OK(ctx, cache_or.status()); + auto cache = cache_or.ConsumeValueOrDie(); const Tensor& keys_tensor = ctx->input(0); auto flat_keys = keys_tensor.flat(); diff --git a/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc b/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc index 45c8e1ad59a..2fc599e42df 100644 --- a/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc +++ b/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc @@ -18,7 +18,9 @@ limitations under the License. #include #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/service/computation_placer.h" +#include "tensorflow/compiler/xla/service/gpu/gpu_executable_run_options.h" #include "tensorflow/compiler/xla/service/hlo_input_output_alias_config.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" @@ -37,7 +39,11 @@ limitations under the License. #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/gtl/cleanup.h" #include "tensorflow/core/lib/monitoring/timed.h" +#include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/types.h" +#include "tensorflow/stream_executor/device_memory.h" +#include "tensorflow/stream_executor/device_memory_allocator.h" +#include "tensorflow/stream_executor/platform.h" #include "tensorflow/stream_executor/stream_executor.h" #include "tensorflow/stream_executor/stream_executor_internal.h" @@ -145,31 +151,301 @@ xla::StatusOr GetChainedOpInputs( return std::move(input_buffers); } +// Given a shape, returns a byte array representing the shape metadata of the +// shape. The shape metadata contains dimensions sizes stored as contiguous S32. +std::vector PrepareMetadata(const xla::Shape& shape) { + DCHECK(shape.is_static()); + DCHECK(shape.IsArray()); + // Each dimension size is stored as a S32. + std::vector result(shape.dimensions_size()); + for (int64 i = 0; i < shape.dimensions_size(); ++i) { + result[i] = shape.dimensions(i); + } + return result; +} + +// Given a buffer with dynamic shape, update buffer metadata at the correct +// offset starting from that buffer. +// +// +-----------+ +// |Payload | +// +-----------+ +// | Padding | +// +-----------+ +// |dim_size_0 | (each dim_size is a S32): +// +-----------+ +// |dim_size_1 | +// +-----------+ +// .......... 
+// +-----------+ +// +// Size of payload = ByteSizeOf(runtime_shape) +// Size of payload + padding = ByteSizeOf(compile_time_shape_static) +// Size of payload + padding + metadata = ByteSizeOf(compile_time_shape) +Status UpdateMetadata(se::Stream* stream, se::DeviceMemory* buffer, + const xla::Shape& compile_time_shape, + const xla::Shape& runtime_shape) { + TF_ASSIGN_OR_RETURN(auto compiler, xla::Compiler::GetForPlatform( + stream->parent()->platform())); + TF_ASSIGN_OR_RETURN( + auto transfer_manager, + xla::TransferManager::GetForPlatform(stream->parent()->platform())); + auto shape_size_fn = compiler->ShapeSizeBytesFunction(); + xla::Shape compile_time_shape_static = + xla::ShapeUtil::MakeStaticShape(compile_time_shape); + uint64 offset = shape_size_fn(compile_time_shape_static); + uint64 metadata_size = shape_size_fn(compile_time_shape) - offset; + auto metadata_buffer = + stream->parent()->GetSubBuffer(buffer, offset, metadata_size); + + auto metadata_literal = std::make_shared( + xla::LiteralUtil::CreateR1(PrepareMetadata(runtime_shape))); + TF_RETURN_IF_ERROR(transfer_manager->TransferArrayToDeviceAsync( + stream, *metadata_literal, metadata_buffer)); + // Retain the literal until the end of the transfer. + stream->ThenDoHostCallback([metadata_literal]() { return Status::OK(); }); + return Status::OK(); +} + +// Given a static input buffer, convert it to dynamic form by expanding it to +// the bounded size and attaching metadata filled with dimension sizes. +// +// From: +// +--------+ +// |Payload | +// +--------+ +// +// To: +// +// +--------+ +// |Payload | +// +--------+ +// | Padding| +// +--------+ +// |Metadata| +// +--------+ +// +// As we can't expand the size of an existing memory allocation, a reallocation +// is required. A list of new allocations is returned by this function. The +// caller is responsible for maintaining those allocations. +xla::StatusOr> UpdateDynamicInputs( + se::Stream* stream, se::DeviceMemoryAllocator* allocator, + std::vector runtime_inputs, + const std::vector& compile_time_shapes) { + std::vector new_allocations; + TF_RET_CHECK(runtime_inputs.size() == compile_time_shapes.size()); + TF_ASSIGN_OR_RETURN(auto compiler, xla::Compiler::GetForPlatform( + stream->parent()->platform())); + auto shape_size_fn = compiler->ShapeSizeBytesFunction(); + for (int64 i = 0; i < compile_time_shapes.size(); i++) { + const xla::Shape& compile_time_shape = compile_time_shapes[i].shape(); + if (compile_time_shape.is_static()) { + continue; + } + auto* runtime_input = runtime_inputs[i]; + + bool element_modified = false; + TF_RETURN_IF_ERROR(xla::ShapeUtil::ForEachSubshapeWithStatus( + compile_time_shape, + [&](const xla::Shape& compile_time_shape, + const xla::ShapeIndex& index) -> Status { + if (compile_time_shape.IsTuple() || compile_time_shape.is_static()) { + return Status::OK(); + } + const xla::Shape& runtime_shape = xla::ShapeUtil::GetSubshape( + runtime_input->on_device_shape(), index); + TF_RET_CHECK(!runtime_shape.IsTuple()); + TF_RET_CHECK(xla::ShapeUtil::DynamicShapeIsCompatible( + runtime_shape, compile_time_shape)); + se::DeviceMemoryBase* static_input = + runtime_input->buffers().mutable_element(index); + TF_ASSIGN_OR_RETURN( + auto dynamic_input, + allocator->Allocate(stream->parent()->device_ordinal(), + shape_size_fn(compile_time_shape))); + new_allocations.emplace_back(std::move(dynamic_input)); + se::DeviceMemory* dynamic_input_base = + new_allocations.back().ptr(); + // Send the original data to the new location. 
+ stream->ThenMemcpyD2D(dynamic_input_base, *static_input, + static_input->size()); + TF_RETURN_IF_ERROR(UpdateMetadata(stream, dynamic_input_base, + compile_time_shape, runtime_shape)); + // Modify the memory location in the input shape tree to point to the + // new input. + runtime_input->set_buffer(*dynamic_input_base, index); + element_modified = true; + return Status::OK(); + })); + if (element_modified) { + runtime_input->set_shapes(compile_time_shape, compile_time_shape); + // The input location has been modified, need to fix tuple table to + // point to the correct address. + TF_ASSIGN_OR_RETURN( + auto transfer_manager, + xla::TransferManager::GetForPlatform(stream->parent()->platform())); + TF_RETURN_IF_ERROR( + transfer_manager->WriteTupleIndexTablesAsync(stream, *runtime_input)); + } + } + return std::move(new_allocations); +} + +xla::StatusOr ReadMetadataLiteral( + se::Stream* stream, se::DeviceMemoryBase* buffer, + const xla::Shape& buffer_shape, xla::TransferManager* transfer_manager) { + TF_ASSIGN_OR_RETURN(auto compiler, xla::Compiler::GetForPlatform( + stream->parent()->platform())); + auto shape_size_fn = compiler->ShapeSizeBytesFunction(); + xla::Shape buffer_shape_static = + xla::ShapeUtil::MakeStaticShape(buffer_shape); + const int64 offset = shape_size_fn(buffer_shape_static); + int64 metadata_size = shape_size_fn(buffer_shape) - offset; + TF_RET_CHECK(metadata_size != 0); + auto buffer_8 = se::DeviceMemory(*buffer); + auto metadata_buffer = + stream->parent()->GetSubBuffer(&buffer_8, offset, metadata_size); + return transfer_manager->TransferArrayFromDevice( + stream, + xla::ShapeUtil::MakeShape(xla::S32, {buffer_shape.dimensions_size()}), + metadata_buffer); +} + +// For each subshape in the result buffer that's dynamic, read the dynamic +// dimension sizes from the metadata, and update output shapes. The result shape +// is a static and concrete shape. +xla::Status UpdateDynamicOutputs(se::Stream* stream, + xla::ShapedBuffer* shaped_buffer, + xla::Shape* output_host_shape, + xla::Shape* output_device_shape) { + DCHECK(output_device_shape->is_dynamic()); + TF_ASSIGN_OR_RETURN( + auto transfer_manager, + xla::TransferManager::GetForPlatform(stream->parent()->platform())); + TF_RETURN_IF_ERROR(stream->BlockHostUntilDone()); + TF_RETURN_IF_ERROR(shaped_buffer->buffers().ForEachMutableElementWithStatus( + [&](const xla::ShapeIndex& index, se::DeviceMemoryBase* buffer) { + const xla::Shape& buffer_shape = + xla::ShapeUtil::GetSubshape(*output_device_shape, index); + if (buffer_shape.IsTuple()) { + return Status::OK(); + } + xla::Shape& host_shape = + *xla::ShapeUtil::GetMutableSubshape(output_host_shape, index); + xla::Shape& device_shape = + *xla::ShapeUtil::GetMutableSubshape(output_device_shape, index); + if (device_shape.is_static()) { + return Status::OK(); + } + TF_ASSIGN_OR_RETURN(auto metadata, + ReadMetadataLiteral(stream, buffer, buffer_shape, + transfer_manager)); + // Update shape size from metadata. + for (int64 i = 0; i < metadata.element_count(); ++i) { + host_shape.mutable_dimensions()[i] = metadata.Get({i}); + device_shape.mutable_dimensions()[i] = metadata.Get({i}); + } + return Status::OK(); + })); + output_host_shape->clear_dynamic_dimensions(); + output_device_shape->clear_dynamic_dimensions(); + return Status::OK(); +} + +// Create output tuple from run_result. 
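Aside: a standalone sketch of the dynamic-shape buffer layout that UpdateMetadata and ReadMetadataLiteral above rely on, for a hypothetical f32 array bounded at [4] whose runtime size is [2]: payload bytes, padding up to the static bound, then one S32 per dimension. It ignores tuples, layouts and alignment and uses no XLA types.

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  const int64_t element_size = sizeof(float);      // f32
  const std::vector<int64_t> bounded_dims = {4};   // compile-time bound
  const std::vector<int64_t> runtime_dims = {2};   // actual sizes at runtime

  int64_t payload_bytes = element_size;
  int64_t padded_bytes = element_size;
  for (size_t i = 0; i < bounded_dims.size(); ++i) {
    payload_bytes *= runtime_dims[i];
    padded_bytes *= bounded_dims[i];
  }
  // Metadata (one int32 per dimension) starts right after the padded payload,
  // mirroring: offset = ByteSizeOf(static shape), metadata = total - offset.
  const int64_t metadata_offset = padded_bytes;
  const int64_t metadata_bytes =
      static_cast<int64_t>(sizeof(int32_t)) * bounded_dims.size();

  std::cout << "payload: " << payload_bytes << " bytes, padding: "
            << (padded_bytes - payload_bytes) << " bytes\n";
  std::cout << "metadata at offset " << metadata_offset << ", "
            << metadata_bytes << " bytes, values:";
  for (int64_t d : runtime_dims) std::cout << " " << static_cast<int32_t>(d);
  std::cout << "\n";
}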
+xla::StatusOr> CreateOutputTuple( + se::Stream* stream, xla::ScopedShapedBuffer run_result, + xla::Backend* backend, int device_ordinal) { + XRTTupleAllocation* output_tuple; + xla::ShapedBuffer shaped_buffer = run_result.release(); + if (shaped_buffer.on_device_shape().is_dynamic()) { + // Update dynamic shapes from output buffer, and create a XRT tensor with + // dimension sizes read from metadata. + xla::Shape output_host_shape = shaped_buffer.on_host_shape(); + xla::Shape output_device_shape = shaped_buffer.on_device_shape(); + TF_RETURN_IF_ERROR(UpdateDynamicOutputs( + stream, &shaped_buffer, &output_host_shape, &output_device_shape)); + TF_RETURN_IF_ERROR(XRTTupleAllocation::CreateFromBuffer( + shaped_buffer, output_host_shape, output_device_shape, backend, + device_ordinal, &output_tuple)); + } else { + // Fast-path: Don't copy shapes of output buffer. + TF_RETURN_IF_ERROR(XRTTupleAllocation::CreateFromBuffer( + shaped_buffer, backend, device_ordinal, &output_tuple)); + } + return RefPtr(output_tuple); +} + xla::StatusOr> RunExecutable( OpKernelContext* context, XRTGenericDeviceAccessor::ScopedRef* device_ref, xla::LocalExecutable* executable, const InputBuffers& input_buffers, - se::Stream* stream, int rng_seed) { + se::Stream* stream, int rng_seed, + const xrt::CommonExecutionConfig& config) { VLOG(2) << "Executing computation."; xla::ExecutableRunOptions run_options; run_options.set_stream(stream); run_options.set_allocator(device_ref->backend()->memory_allocator()); run_options.set_intra_op_thread_pool(&context->eigen_cpu_device()); run_options.set_rng_seed(rng_seed); + if (config.run_id() != 0) { + run_options.set_run_id(xla::RunId(config.run_id())); + } + if (executable->executable() + ->module_config() + .has_static_device_assignment()) { + run_options.set_device_assignment( + &executable->executable()->module_config().static_device_assignment()); + } + xla::GpuExecutableRunOptions gpu_options; + std::vector gpu_global_ids; + if (config.local_replica_mapping_size() > 0) { + gpu_global_ids.reserve(config.local_replica_mapping_size()); + for (auto& gid : config.local_replica_mapping()) { + gpu_global_ids.emplace_back(xla::GlobalDeviceId(gid)); + } + gpu_options.set_gpu_global_device_ids(gpu_global_ids); + } + std::shared_ptr nccl_factory = GetNcclUniqueIdFactory(); + if (nccl_factory != nullptr) { + auto uid_callback = + [&](const xla::NcclCliqueKey& key) -> xla::StatusOr { + std::vector replicas; + for (auto& device : key.devices()) { + replicas.push_back(device.value()); + } + return nccl_factory->GetUniqueId(replicas); + }; + gpu_options.set_nccl_unique_id_callback(uid_callback); + } + run_options.set_gpu_executable_run_options(&gpu_options); Env* env = Env::Default(); auto start_time = env->NowMicros(); + const std::vector& shape_layouts = + executable->executable() + ->module_config() + .entry_computation_layout() + .parameter_layouts(); + TF_ASSIGN_OR_RETURN(auto new_allocations, + UpdateDynamicInputs(stream, run_options.allocator(), + input_buffers.input_pointers, + shape_layouts)); + auto new_allocations_ptr = + std::make_shared>( + std::move(new_allocations)); TF_ASSIGN_OR_RETURN( xla::ScopedShapedBuffer run_result, executable->Run(input_buffers.input_pointers, run_options)); + // Retain the new allocation for input memory until the end of execution. 
+ stream->ThenDoHostCallback([new_allocations_ptr]() { return Status::OK(); }); + auto elapsed = env->NowMicros() - start_time; VLOG(2) << "Elapsed time: " << elapsed << "us"; - auto shaped_buffer = run_result.release(); - XRTTupleAllocation* output_tuple; - TF_RETURN_IF_ERROR(XRTTupleAllocation::CreateFromBuffer( - shaped_buffer, device_ref->backend(), device_ref->device_ordinal(), - &output_tuple)); - RefPtr output_tuple_ptr(output_tuple); + TF_ASSIGN_OR_RETURN( + RefPtr output_tuple_ptr, + CreateOutputTuple(stream, std::move(run_result), device_ref->backend(), + device_ref->device_ordinal())); // The ScopedShapedBuffer returned by the executable Run() API, in case of // input/output buffer aliasing, might have holes in it, which need to be @@ -182,7 +458,7 @@ xla::StatusOr> RunExecutable( const xla::HloInputOutputAliasConfig::Alias& alias) -> Status { TF_RET_CHECK(alias.parameter_number < input_buffers.input_tuples.size()); return alias.kind == xla::HloInputOutputAliasConfig::AliasKind::kUserAlias - ? output_tuple->AliasBufferFrom( + ? output_tuple_ptr->AliasBufferFrom( *input_buffers.input_tuples[alias.parameter_number], alias.parameter_index, output_index) : Status::OK(); @@ -196,10 +472,11 @@ xla::StatusOr> ExecuteComputation( OpKernelContext* context, XRTMemoryManager* memory_manager, XRTGenericDeviceAccessor::ScopedRef* device_ref, xla::LocalExecutable* executable, const InputBuffers& input_buffers, - se::Stream* stream, int rng_seed) { + se::Stream* stream, int rng_seed, + const xrt::CommonExecutionConfig& config) { auto runfn = [&]() { return RunExecutable(context, device_ref, executable, input_buffers, stream, - rng_seed); + rng_seed, config); }; // We pass zero as requested_free_size as there is no simple way to get the @@ -215,13 +492,15 @@ xla::StatusOr> ExecuteComputation( XRTGenericDeviceAccessor::ScopedRef* device_ref, xla::LocalExecutable* executable, const std::vector& input_coords, bool release_inputs, - se::Stream* stream, int rng_seed) { + se::Stream* stream, int rng_seed, + const xrt::CommonExecutionConfig& config) { XRTMemoryManager::WorkingSet working_set(memory_manager); TF_ASSIGN_OR_RETURN(InputBuffers input_buffers, GetInputBuffers(&working_set, device_ref->backend(), input_coords, release_inputs)); return ExecuteComputation(context, memory_manager.get(), device_ref, - executable, input_buffers, stream, rng_seed); + executable, input_buffers, stream, rng_seed, + config); } // XRTExecuteOp @@ -270,8 +549,9 @@ Status XRTExecuteOp::DoWork(OpKernelContext* context) { bool release_inputs = config_proto.release_input_handles(); bool release_compilation = config_proto.release_compilation_handle(); - TF_ASSIGN_OR_RETURN( - auto cache, GetOrCreateCompilationCache(rm, /*max_number_of_entries=*/0)); + TF_ASSIGN_OR_RETURN(auto cache, + XRTGenericDeviceAccessor::GetOrCreateCompilationCache( + context, /*max_number_of_entries=*/0)); // We are guaranteed that the underlying device object won't be deleted out // from under us, while the ScopedRef is live. 
class XRTGenericDeviceAccessor::ScopedRef device_ref; @@ -302,7 +582,8 @@ Status XRTExecuteOp::DoWork(OpKernelContext* context) { TF_ASSIGN_OR_RETURN( RefPtr output_tuple, ExecuteComputation(context, memory_manager, &device_ref, executable, - input_coords, release_inputs, stream, rng_seed)); + input_coords, release_inputs, stream, rng_seed, + config_proto.common_config())); return CreateExecuteOutput(context, memory_manager.get(), std::move(output_tuple), @@ -351,8 +632,9 @@ Status XRTExecuteChainedOp::DoWork(OpKernelContext* context) { xrt::XRTChainedExecuteConfig config; TF_RET_CHECK(ParseFromTString(execution_config.scalar()(), &config)); - TF_ASSIGN_OR_RETURN( - auto cache, GetOrCreateCompilationCache(rm, /*max_number_of_entries=*/0)); + TF_ASSIGN_OR_RETURN(auto cache, + XRTGenericDeviceAccessor::GetOrCreateCompilationCache( + context, /*max_number_of_entries=*/0)); // We are guaranteed that the underlying device object won't be deleted out // from under us, while the ScopedRef is live. class XRTGenericDeviceAccessor::ScopedRef device_ref; @@ -379,7 +661,8 @@ Status XRTExecuteChainedOp::DoWork(OpKernelContext* context) { xla::LocalExecutable* executable = entry->get().get_executable(); return ExecuteComputation(context, memory_manager.get(), &device_ref, - executable, input_buffers, stream, rng_seed); + executable, input_buffers, stream, rng_seed, + config.common_config()); }; return ExecuteChained(context, memory_manager, device_ref.backend(), diff --git a/tensorflow/compiler/xrt/tests/raw_api_test.cc b/tensorflow/compiler/xrt/tests/raw_api_test.cc index 243289c8821..fbf9dfd0a17 100644 --- a/tensorflow/compiler/xrt/tests/raw_api_test.cc +++ b/tensorflow/compiler/xrt/tests/raw_api_test.cc @@ -49,6 +49,67 @@ limitations under the License. namespace tensorflow { namespace { +xla::XlaComputation ReturnDynamicR1() { + xla::XlaBuilder builder("ReturnDynamicR1"); + auto p0 = xla::Parameter(&builder, 0, + xla::ShapeUtil::MakeShape(xla::F32, {4}), "P0"); + auto p1 = xla::Parameter(&builder, 1, + xla::ShapeUtil::MakeShape(xla::F32, {4}), "P1"); + auto p2 = xla::Parameter(&builder, 2, xla::ShapeUtil::MakeShape(xla::S32, {}), + "P2"); + auto sum = xla::Add(p0, p1); + auto pad_sum = xla::SetDimensionSize(sum, p2, 0); + return builder.Build(pad_sum).ValueOrDie(); +} + +xla::XlaComputation AcceptDynamicR1() { + xla::XlaBuilder builder("AcceptDynamicR1"); + xla::Shape dyn_shape = xla::ShapeUtil::MakeShape(xla::F32, {4}); + dyn_shape.set_dynamic_dimension(0, true); + auto p0 = xla::Parameter(&builder, 0, dyn_shape, "P0"); + auto p1 = xla::Parameter(&builder, 1, dyn_shape, "P1"); + auto sum = xla::Add(p0, p1); + return builder.Build(sum).ValueOrDie(); +} + +xla::XlaComputation ReturnDynamicR1Tuple() { + xla::XlaBuilder builder("ReturnDynamicR1Tuple"); + auto p0 = xla::Parameter(&builder, 0, + xla::ShapeUtil::MakeShape(xla::F32, {4}), "P0"); + auto p1 = xla::Parameter(&builder, 1, + xla::ShapeUtil::MakeShape(xla::F32, {4}), "P1"); + auto p2 = xla::Parameter(&builder, 2, xla::ShapeUtil::MakeShape(xla::S32, {}), + "P2"); + auto sum = xla::Add(p0, p1); + auto sub = xla::Sub(p0, p1); + auto one = xla::One(&builder, xla::S32); + auto pad_sum = xla::SetDimensionSize(sum, p2, 0); + auto pad_sub = xla::SetDimensionSize(sub, p2 + one, 0); + auto tuple = xla::Tuple(&builder, {pad_sum, sum, pad_sub}); + return builder.Build(tuple, /*remove_dynamic_dimensions=*/true).ValueOrDie(); +} + +xla::XlaComputation AcceptDynamicR1Tuple() { + xla::XlaBuilder builder("AcceptDynamicR1"); + xla::Shape dyn_shape = 
xla::ShapeUtil::MakeShape(xla::F32, {4}); + dyn_shape.set_dynamic_dimension(0, true); + xla::Shape tuple_shape = + xla::ShapeUtil::MakeTupleShape({dyn_shape, dyn_shape}); + xla::Shape nest_tuple_shape = + xla::ShapeUtil::MakeTupleShape({dyn_shape, dyn_shape}); + auto p = xla::Parameter(&builder, 0, tuple_shape, "P0"); + auto p0 = xla::GetTupleElement(p, 0); + auto p1 = xla::GetTupleElement(p, 1); + auto sum = xla::Add(p0, p1); + return builder.Build(sum).ValueOrDie(); +} + +template +xla::LiteralProto CreateR0(T v) { + auto array = xla::LiteralUtil::CreateR0(v); + return array.ToProto(); +} + class XrtClientSession : public ClientSession { public: explicit XrtClientSession(const Scope& scope) : ClientSession(scope) { @@ -61,6 +122,11 @@ class XrtClientSession : public ClientSession { string* xla_test_device_ptr; // initial value set in main() string* xla_platform_ptr; // initial value set in main() +bool SupportDynamicShapes() { + // TODO(jackcao): Support dynamic shapes on XLA GPU. + return *xla_test_device_ptr != "XLA_GPU"; +} + string DeviceFromFlag() { string xla_test_device = *xla_test_device_ptr; return absl::StrCat("/device:", xla_test_device, ":0"); @@ -1035,6 +1101,239 @@ TEST(RawApiTest, CompileAndExecute) { EXPECT_EQ(program_shape.parameters_size(), 2); } +TEST(RawApiTest, DynamicR1Test) { + if (!SupportDynamicShapes()) { + return; + } + xrt::XLAAllocation p0; + *p0.mutable_value() = FloatVector({1.0f, 2.0f, 0.5f, -1.0f}); + xrt::XLAAllocation p1; + *p1.mutable_value() = FloatVector({1.0f, -1.0f, 2.5f, 1.17f}); + xrt::XLAAllocation p2; + *p2.mutable_value() = CreateR0(2); + + xrt::XLAComputation c; + auto config = c.mutable_config(); + auto shapes = config->mutable_program_shape(); + *shapes->add_parameters() = + xla::ShapeUtil::MakeShape(xla::F32, {4}).ToProto(); + *shapes->add_parameters() = + xla::ShapeUtil::MakeShape(xla::F32, {4}).ToProto(); + *shapes->add_parameters() = xla::ShapeUtil::MakeShape(xla::S32, {}).ToProto(); + xla::Shape dyn_shape = xla::ShapeUtil::MakeShape(xla::F32, {4}); + dyn_shape.set_dynamic_dimension(0, true); + *shapes->mutable_result() = dyn_shape.ToProto(); + StoreComputationSnapshot(ReturnDynamicR1(), c.mutable_hlo_snapshot()); + + xrt::XRTExecutionConfig e; + e.set_release_input_handles(true); + e.set_release_compilation_handle(true); + + Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag()); + Scope cpu_root = root.WithDevice("/device:CPU:0"); + auto e_config = ops::Const(cpu_root, e.SerializeAsString()); + auto computation = ops::Const(cpu_root, c.SerializeAsString()); + auto c_handle = ops::XRTCompile(root, computation); + auto p0_value = ops::Const(cpu_root, p0.SerializeAsString()); + auto p0_handle = ops::XRTAllocate(root, p0_value); + auto p1_value = ops::Const(cpu_root, p1.SerializeAsString()); + auto p1_handle = ops::XRTAllocate(root, p1_value); + auto p2_value = ops::Const(cpu_root, p2.SerializeAsString()); + auto p2_handle = ops::XRTAllocate(root, p2_value); + auto result = ops::XRTExecute( + root, c_handle.handle, e_config, + {Output(p0_handle), Output(p1_handle), Output(p2_handle)}); + auto read_back = ops::XRTReadLiteralAndRelease(root, result); + TF_ASSERT_OK(root.status()); + + XrtClientSession session(root); + std::vector outputs; + TF_EXPECT_OK(session.Run({read_back, c_handle.program_shape}, &outputs)); + + xla::LiteralProto response; + EXPECT_TRUE(response.ParseFromString(outputs[0].scalar()())); + auto expected = xla::LiteralUtil::CreateR1({2.0f, 1.0f}); + EXPECT_TRUE(CompareLiteralToLiteralProto(expected, response)); 
+} + +TEST(RawApiTest, DynamicR1TupleTest) { + if (!SupportDynamicShapes()) { + return; + } + xrt::XLAAllocation p0; + *p0.mutable_value() = FloatVector({1.0f, 2.0f, 0.5f, -1.0f}); + xrt::XLAAllocation p1; + *p1.mutable_value() = FloatVector({1.0f, -1.0f, -0.5f, 1.0f}); + xrt::XLAAllocation p2; + *p2.mutable_value() = CreateR0(2); + + xrt::XLAComputation c; + auto config = c.mutable_config(); + auto shapes = config->mutable_program_shape(); + *shapes->add_parameters() = + xla::ShapeUtil::MakeShape(xla::F32, {4}).ToProto(); + *shapes->add_parameters() = + xla::ShapeUtil::MakeShape(xla::F32, {4}).ToProto(); + *shapes->add_parameters() = xla::ShapeUtil::MakeShape(xla::S32, {}).ToProto(); + xla::Shape dyn_shape = xla::ShapeUtil::MakeShape(xla::F32, {4}); + dyn_shape.set_dynamic_dimension(0, true); + *shapes->mutable_result() = + xla::ShapeUtil::MakeTupleShape( + {dyn_shape, xla::ShapeUtil::MakeShape(xla::F32, {4}), dyn_shape}) + .ToProto(); + StoreComputationSnapshot(ReturnDynamicR1Tuple(), c.mutable_hlo_snapshot()); + + xrt::XRTExecutionConfig e; + e.set_release_input_handles(true); + e.set_release_compilation_handle(true); + + Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag()); + Scope cpu_root = root.WithDevice("/device:CPU:0"); + auto e_config = ops::Const(cpu_root, e.SerializeAsString()); + auto computation = ops::Const(cpu_root, c.SerializeAsString()); + auto c_handle = ops::XRTCompile(root, computation); + auto p0_value = ops::Const(cpu_root, p0.SerializeAsString()); + auto p0_handle = ops::XRTAllocate(root, p0_value); + auto p1_value = ops::Const(cpu_root, p1.SerializeAsString()); + auto p1_handle = ops::XRTAllocate(root, p1_value); + auto p2_value = ops::Const(cpu_root, p2.SerializeAsString()); + auto p2_handle = ops::XRTAllocate(root, p2_value); + auto result = ops::XRTExecute( + root, c_handle.handle, e_config, + {Output(p0_handle), Output(p1_handle), Output(p2_handle)}); + auto read_back = ops::XRTReadLiteralAndRelease(root, result); + TF_ASSERT_OK(root.status()); + + XrtClientSession session(root); + std::vector outputs; + TF_EXPECT_OK(session.Run({read_back, c_handle.program_shape}, &outputs)); + + xla::LiteralProto response; + EXPECT_TRUE(response.ParseFromString(outputs[0].scalar()())); + + auto expected0 = xla::LiteralUtil::CreateR1({2.0f, 1.0f}); + auto expected1 = xla::LiteralUtil::CreateR1({2.0f, 1.0f, 0.0f, 0.0f}); + auto expected2 = xla::LiteralUtil::CreateR1({0.0f, 3.0f, 1.0f}); + auto expected = + xla::LiteralUtil::MakeTuple({&expected0, &expected1, &expected2}); + EXPECT_TRUE(CompareLiteralToLiteralProto(expected, response)); +} + +TEST(RawApiTest, AcceptDynamicR1TupleTest) { + if (!SupportDynamicShapes()) { + return; + } + xrt::XLAAllocation p0; + *p0.mutable_value() = FloatVector({1.0f, 2.0f, 0.5f}); + xrt::XLAAllocation p1; + *p1.mutable_value() = FloatVector({1.0f, -1.0f, -0.5f}); + + xrt::XLATupleNode tuple_desc; + auto subdesc_10 = tuple_desc.add_tuples(); + auto subdesc_11 = tuple_desc.add_tuples(); + subdesc_10->set_input_index(0); + subdesc_10->set_release_input_handle(true); + subdesc_11->set_input_index(1); + subdesc_11->set_release_input_handle(true); + + xrt::XLAComputation c; + auto config = c.mutable_config(); + auto shapes = config->mutable_program_shape(); + xla::Shape dyn_input_shape = xla::ShapeUtil::MakeShape(xla::F32, {4}); + dyn_input_shape.set_dynamic_dimension(0, true); + xla::Shape dyn_tuple_shape = + xla::ShapeUtil::MakeTupleShape({dyn_input_shape, dyn_input_shape}); + *shapes->add_parameters() = dyn_tuple_shape.ToProto(); + 
xla::Shape dyn_shape = xla::ShapeUtil::MakeShape(xla::F32, {4}); + dyn_shape.set_dynamic_dimension(0, true); + *shapes->mutable_result() = dyn_shape.ToProto(); + StoreComputationSnapshot(AcceptDynamicR1Tuple(), c.mutable_hlo_snapshot()); + + xrt::XRTExecutionConfig e; + e.set_release_input_handles(true); + e.set_release_compilation_handle(true); + + Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag()); + Scope cpu_root = root.WithDevice("/device:CPU:0"); + auto e_config = ops::Const(cpu_root, e.SerializeAsString()); + auto computation = ops::Const(cpu_root, c.SerializeAsString()); + auto c_handle = ops::XRTCompile(root, computation); + auto p0_value = ops::Const(cpu_root, p0.SerializeAsString()); + auto p0_handle = ops::XRTAllocate(root, p0_value); + auto p1_value = ops::Const(cpu_root, p1.SerializeAsString()); + auto p1_handle = ops::XRTAllocate(root, p1_value); + + auto tuple_0 = ops::Const(root.WithDevice("/device:CPU:0"), + tuple_desc.SerializeAsString()); + auto t0_handle = ops::XRTMakeTuple( + root, tuple_0, + {static_cast<Output>(p0_handle), static_cast<Output>(p1_handle)}); + auto result = ops::XRTExecute(root, c_handle.handle, e_config, + {static_cast<Output>(t0_handle)}); + auto read_back = ops::XRTReadLiteralAndRelease(root, result); + TF_ASSERT_OK(root.status()); + + XrtClientSession session(root); + std::vector<Tensor> outputs; + TF_EXPECT_OK(session.Run({read_back, c_handle.program_shape}, &outputs)); + + xla::LiteralProto response; + EXPECT_TRUE(response.ParseFromString(outputs[0].scalar<tstring>()())); + + auto expected = xla::LiteralUtil::CreateR1<float>({2.0f, 1.0f, 0.0f}); + EXPECT_TRUE(CompareLiteralToLiteralProto(expected, response)); +} + +TEST(RawApiTest, AcceptDynamicR1Test) { + if (!SupportDynamicShapes()) { + return; + } + xrt::XLAAllocation p0; + *p0.mutable_value() = FloatVector({1.0f, 2.0f, 0.5f}); + xrt::XLAAllocation p1; + *p1.mutable_value() = FloatVector({1.0f, -1.0f, -0.5f}); + + xrt::XLAComputation c; + auto config = c.mutable_config(); + auto shapes = config->mutable_program_shape(); + xla::Shape dyn_input_shape = xla::ShapeUtil::MakeShape(xla::F32, {4}); + dyn_input_shape.set_dynamic_dimension(0, true); + *shapes->add_parameters() = dyn_input_shape.ToProto(); + *shapes->add_parameters() = dyn_input_shape.ToProto(); + xla::Shape dyn_shape = xla::ShapeUtil::MakeShape(xla::F32, {4}); + dyn_shape.set_dynamic_dimension(0, true); + *shapes->mutable_result() = dyn_shape.ToProto(); + StoreComputationSnapshot(AcceptDynamicR1(), c.mutable_hlo_snapshot()); + + xrt::XRTExecutionConfig e; + e.set_release_input_handles(true); + e.set_release_compilation_handle(true); + + Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag()); + Scope cpu_root = root.WithDevice("/device:CPU:0"); + auto e_config = ops::Const(cpu_root, e.SerializeAsString()); + auto computation = ops::Const(cpu_root, c.SerializeAsString()); + auto c_handle = ops::XRTCompile(root, computation); + auto p0_value = ops::Const(cpu_root, p0.SerializeAsString()); + auto allocate_op_0 = ops::XRTAllocate(root, p0_value); + auto p1_value = ops::Const(cpu_root, p1.SerializeAsString()); + auto allocate_op_1 = ops::XRTAllocate(root, p1_value); + auto result = ops::XRTExecute(root, c_handle.handle, e_config, + {Output(allocate_op_0), Output(allocate_op_1)}); + auto read_back = ops::XRTReadLiteralAndRelease(root, result); + TF_ASSERT_OK(root.status()); + + XrtClientSession session(root); + std::vector<Tensor> outputs; + TF_EXPECT_OK(session.Run({read_back, c_handle.program_shape}, &outputs)); + + xla::LiteralProto response; + 
EXPECT_TRUE(response.ParseFromString(outputs[0].scalar<tstring>()())); + + auto expected = xla::LiteralUtil::CreateR1<float>({2.0f, 1.0f, 0.0f}); + EXPECT_TRUE(CompareLiteralToLiteralProto(expected, response)); +} + TEST(RawApiTest, CompileAndExecuteWithArgumentVector) { xrt::XLAAllocation p0; *p0.mutable_value() = FloatVector({1.0f, 2.0f}); diff --git a/tensorflow/compiler/xrt/xrt.proto b/tensorflow/compiler/xrt/xrt.proto index 47b7cda2760..9a351732c4b 100644 --- a/tensorflow/compiler/xrt/xrt.proto +++ b/tensorflow/compiler/xrt/xrt.proto @@ -111,6 +111,17 @@ message XLATupleNode { repeated XLATupleNode tuples = 3; } +message CommonExecutionConfig { + // The replica index this execute op is driving. + int32 replica_id = 1; + // Mapping local device ordinals to global replica IDs. + // local_replica_mapping[LOCAL_DEVICE_ORDINAL] = GLOBAL_REPLICA_ID + repeated int32 local_replica_mapping = 2; + // The execution run ID used to correlate different XRT execute operations + // happening in parallel from different threads. + int64 run_id = 3; +} + // Options for an XLA execution. message XRTExecutionConfig { // Local device to run on. This is present because the execute Op @@ -133,6 +144,9 @@ message XRTExecutionConfig { // a single tuple allocation the execution will return a vector of // allocations, one for each of the first-level elements of the result tuple. bool return_exploded_tuple = 7; + reserved 8; + // The common configuration for XRT execute operations. + CommonExecutionConfig common_config = 9; } message XRTChainedExecuteConfig { @@ -143,6 +157,9 @@ message XRTChainedExecuteConfig { // Optional key to disambiguate between executions. This is only needed if // multiple host send/recvs may be outstanding concurrently with executions. string execution_instance_key = 3; + reserved 4; + // The common configuration for XRT execute operations. + CommonExecutionConfig common_config = 5; } // A single chained execute operation. An operation can either be a device data diff --git a/tensorflow/compiler/xrt/xrt_device.cc b/tensorflow/compiler/xrt/xrt_device.cc index 1b5557d556d..46954572c5d 100644 --- a/tensorflow/compiler/xrt/xrt_device.cc +++ b/tensorflow/compiler/xrt/xrt_device.cc @@ -17,19 +17,56 @@ limitations under the License.
#include "tensorflow/compiler/xrt/xrt_device.h" +#include + #include "tensorflow/compiler/jit/xla_device.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/resource_mgr.h" #include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/mutex.h" namespace tensorflow { +namespace { + +class ResourceMgrArena { + public: + static ResourceMgrArena* Get() { + static ResourceMgrArena* arena = new ResourceMgrArena(); + return arena; + } + + ResourceMgr* GetResourceMgr(const std::string& platform_name) { + mutex_lock lock(mutex_); + auto it = resource_managers_.find(platform_name); + if (it == resource_managers_.end()) { + it = resource_managers_.emplace(platform_name, new ResourceMgr()).first; + } + return it->second; + } + + private: + mutex mutex_; + std::map resource_managers_; +}; + +} // namespace /*static*/ Status XRTGenericDeviceAccessor::GetResourceManager( OpKernelContext* ctx, ResourceMgr** rm) { - *rm = ctx->resource_manager(); + const XlaDevice::Metadata* metadata; + TF_RETURN_IF_ERROR(XlaDevice::GetMetadata(ctx, &metadata)); + *rm = ResourceMgrArena::Get()->GetResourceMgr(metadata->platform()->Name()); return Status::OK(); } +/* static */ xla::StatusOr> +XRTGenericDeviceAccessor::GetOrCreateCompilationCache( + OpKernelContext* ctx, int64 max_number_of_entries) { + ResourceMgr* rm; + TF_RETURN_IF_ERROR(GetResourceManager(ctx, &rm)); + return tensorflow::GetOrCreateCompilationCache(rm, max_number_of_entries); +} + /*static*/ Status XRTGenericDeviceAccessor::InitScopedRef( OpKernelContext* ctx, int device_ordinal, ScopedRef* scoped_ref) { const XlaDevice::Metadata* metadata; diff --git a/tensorflow/compiler/xrt/xrt_device.h b/tensorflow/compiler/xrt/xrt_device.h index 5ebee7641f0..02fab315830 100644 --- a/tensorflow/compiler/xrt/xrt_device.h +++ b/tensorflow/compiler/xrt/xrt_device.h @@ -19,6 +19,7 @@ limitations under the License. #define TENSORFLOW_COMPILER_XRT_XRT_DEVICE_H_ #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xrt/xrt_compilation_cache.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/resource_mgr.h" @@ -31,6 +32,9 @@ class XRTGenericDeviceAccessor { public: static Status GetResourceManager(OpKernelContext* ctx, ResourceMgr** rm); + static xla::StatusOr> GetOrCreateCompilationCache( + OpKernelContext* ctx, int64 max_number_of_entries); + // We use a ScopedRef pattern here even though it's not strictly necessary, // just so that templated uses of this and the TPU accessor class will be as // similar as possible. 
diff --git a/tensorflow/compiler/xrt/xrt_state.cc b/tensorflow/compiler/xrt/xrt_state.cc index a0daa5c6c23..c2f9a1c62c9 100644 --- a/tensorflow/compiler/xrt/xrt_state.cc +++ b/tensorflow/compiler/xrt/xrt_state.cc @@ -588,7 +588,8 @@ xla::StatusOr XRTTupleAllocation::ToShapedBuffer() { allocator_->platform(), device_ordinal_); for (const auto& index_buffer : buffers_) { if (index_buffer.second == nullptr || - index_buffer.second->allocation().is_null()) { + (index_buffer.second->allocation().is_null() && + index_buffer.second->allocation().size() > 0)) { return errors::InvalidArgument("Literal buffer at index ", index_buffer.first.ToString(), " has been released"); @@ -652,7 +653,8 @@ xla::StatusOr XRTTupleAllocation::ToExecutionInput( xla::ExecutionInput result(on_device_shape()); for (const auto& index_buffer : buffers_) { if (index_buffer.second == nullptr || - index_buffer.second->allocation().is_null()) { + (index_buffer.second->allocation().is_null() && + index_buffer.second->allocation().size() > 0)) { return errors::InvalidArgument("Literal buffer at index ", index_buffer.first.ToString(), " has been released"); diff --git a/tensorflow/compiler/xrt/xrt_util.cc b/tensorflow/compiler/xrt/xrt_util.cc index 4d19d4b1226..b8a0afc92c5 100644 --- a/tensorflow/compiler/xrt/xrt_util.cc +++ b/tensorflow/compiler/xrt/xrt_util.cc @@ -21,10 +21,14 @@ limitations under the License. #include "tensorflow/compiler/xla/debug_options_flags.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/mutex.h" namespace tensorflow { namespace { +mutex nccl_factory_mutex(LINKER_INITIALIZED); +std::shared_ptr* nccl_factory; + // The ScopedHandles data structure is used in the ExecuteChained() API and its // task is to track tuple allocation registrations. It is used both the track // intermediate results of a chained computation, or its final results. Anything @@ -162,6 +166,19 @@ Status PopulateOpWorkingSet(xla::Backend* backend, } // namespace +void SetNcclUniqueIdFactory(std::shared_ptr factory) { + mutex_lock lock(nccl_factory_mutex); + if (nccl_factory == nullptr) { + nccl_factory = new std::shared_ptr(); + } + *nccl_factory = std::move(factory); +} + +std::shared_ptr GetNcclUniqueIdFactory() { + mutex_lock lock(nccl_factory_mutex); + return nccl_factory != nullptr ? *nccl_factory : nullptr; +} + xla::DebugOptions BuildXlaDebugOptions(const xla::DebugOptions& ref_options) { static const bool options_passthrough = DebugOptionsPassThroughEnabled(); if (options_passthrough) { diff --git a/tensorflow/compiler/xrt/xrt_util.h b/tensorflow/compiler/xrt/xrt_util.h index 32244a63081..cc1480fdb00 100644 --- a/tensorflow/compiler/xrt/xrt_util.h +++ b/tensorflow/compiler/xrt/xrt_util.h @@ -18,6 +18,10 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_XRT_XRT_UTIL_H_ #define TENSORFLOW_COMPILER_XRT_XRT_UTIL_H_ +#include +#include +#include + #include "tensorflow/compiler/xla/service/backend.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/statusor.h" @@ -31,6 +35,19 @@ limitations under the License. namespace tensorflow { +// Factory class which creates NCCL unique IDs based on the replicas +// participating to a given communication. This is only used for GPU backends. +struct NcclUniqueIdFactory { + virtual ~NcclUniqueIdFactory() {} + + // Generates the NCCL unique ID for the given set of replica IDs. 
+ virtual std::string GetUniqueId(absl::Span replicas) = 0; +}; + +void SetNcclUniqueIdFactory(std::shared_ptr factory); + +std::shared_ptr GetNcclUniqueIdFactory(); + struct InputCoords { explicit InputCoords(int64 handle) : handle(handle) {} InputCoords(int64 handle, xla::ShapeIndex index) diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index c36664c70fc..6b4874a8393 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -83,7 +83,6 @@ load( "tf_gen_op_libs", "tf_genrule_cmd_append_to_srcs", "tf_opts_nortti_if_lite_protos", - "tf_opts_nortti_if_mobile", "tf_portable_full_lite_protos", "transitive_hdrs", ) @@ -100,28 +99,23 @@ load("//tensorflow:tensorflow.bzl", "tf_cc_test_gpu") # buildifier: disable=same-origin-load load("//tensorflow:tensorflow.bzl", "tf_cc_tests_gpu") -# buildifier: disable=same-origin-load -# Placeholder: load("//tensorflow:tensorflow.bzl", "tf_portable_proto_lib") - # buildifier: disable=same-origin-load load("//tensorflow:tensorflow.bzl", "tf_monitoring_deps") # For platform specific build config load( "//tensorflow/core/platform:build_config.bzl", - "tf_additional_all_protos", "tf_additional_lib_deps", "tf_additional_test_deps", "tf_jspb_proto_library", "tf_kernel_tests_linkstatic", "tf_lib_proto_parsing_deps", "tf_portable_deps_no_runtime", + "tf_portable_proto_lib", "tf_proto_library", - "tf_proto_library_cc", "tf_protos_all_impl", "tf_protos_grappler_impl", "tf_protos_profiler_impl", - "tf_pyclif_proto_library", ) load( "//tensorflow/core/platform:rules_cc.bzl", @@ -184,18 +178,18 @@ package_group(name = "friends") # filegroup; e.g. ones with individual proto_library targets. # LINT.IfChange COMMON_PROTO_SRCS = [ - "protobuf/bfc_memory_map.proto", - "protobuf/config.proto", - "protobuf/cluster.proto", - "protobuf/debug.proto", - "protobuf/device_filters.proto", - "protobuf/device_properties.proto", - "protobuf/graph_debug_info.proto", - "protobuf/queue_runner.proto", - "protobuf/rewriter_config.proto", - "protobuf/tensor_bundle.proto", - "protobuf/saver.proto", - "protobuf/verifier_config.proto", + "//tensorflow/core/protobuf:bfc_memory_map.proto", + "//tensorflow/core/protobuf:config.proto", + "//tensorflow/core/protobuf:cluster.proto", + "//tensorflow/core/protobuf:debug.proto", + "//tensorflow/core/protobuf:device_filters.proto", + "//tensorflow/core/protobuf:device_properties.proto", + "//tensorflow/core/protobuf:graph_debug_info.proto", + "//tensorflow/core/protobuf:queue_runner.proto", + "//tensorflow/core/protobuf:rewriter_config.proto", + "//tensorflow/core/protobuf:tensor_bundle.proto", + "//tensorflow/core/protobuf:saver.proto", + "//tensorflow/core/protobuf:verifier_config.proto", ] EXAMPLE_PROTO_SRCS = [ @@ -242,7 +236,7 @@ PROFILER_PROTO_SRCS = [ ] ERROR_CODES_PROTO_SRCS = [ - "protobuf/error_codes.proto", + "//tensorflow/core/protobuf:error_codes.proto", "//tensorflow/core/lib/core:error_codes.proto", ] # LINT.ThenChange(//tensorflow/core/portable_proto_config.asciipb) @@ -255,11 +249,13 @@ tf_proto_library( cc_api_version = 2, make_default_target_header_only = True, protodeps = [ - ":core_protos", - ":error_codes_proto_impl", "//tensorflow/core/example:protos_all", "//tensorflow/core/framework:protos_all", "//tensorflow/core/lib/core:error_codes_proto", + "//tensorflow/core/profiler/protobuf:xplane_proto", + "//tensorflow/core/profiler:profiler_options_proto", + "//tensorflow/core/protobuf:error_codes_proto_impl", + "//tensorflow/core/protobuf:for_core_protos", "//tensorflow/core/util:protos_all", 
"//tensorflow/core/util:test_log_proto_impl", ], @@ -619,6 +615,7 @@ tf_gen_op_libs( "clustering_ops", "collective_ops", "control_flow_ops", + "count_ops", "ctc_ops", "data_flow_ops", "dataset_ops", @@ -847,6 +844,7 @@ cc_library( ":clustering_ops_op_lib", ":collective_ops_op_lib", ":control_flow_ops_op_lib", + ":count_ops_op_lib", ":ctc_ops_op_lib", ":cudnn_rnn_ops_op_lib", ":data_flow_ops_op_lib", @@ -889,23 +887,29 @@ cc_library( ":state_ops_op_lib", ":stateless_random_ops_op_lib", ":string_ops_op_lib", - ":tpu_configuration_ops_op_lib", - ":tpu_cross_replica_ops_op_lib", - ":tpu_embedding_ops_op_lib", - ":tpu_embedding_load_retrieve_ops_op_lib", - ":tpu_functional_ops_op_lib", - ":tpu_heartbeat_ops_op_lib", - ":tpu_host_compute_ops_op_lib", - ":tpu_infeed_ops_op_lib", - ":tpu_outfeed_ops_op_lib", - ":tpu_ordinal_selector_ops_op_lib", - ":tpu_replication_ops_op_lib", ":training_ops_op_lib", ":user_ops_op_lib", ":word2vec_ops", "//tensorflow/c/kernels:bitcast_op_lib", "//tensorflow/compiler/mlir/tensorflow:mlir_passthrough_op", - ] + if_mkl([ + ] + if_chromiumos( + [], + # Non-tpu platforms don't need tpu dependency. It would be best to guard + # them by if_tpu. But there is no such flag yet. + [ + ":tpu_configuration_ops_op_lib", + ":tpu_cross_replica_ops_op_lib", + ":tpu_embedding_ops_op_lib", + ":tpu_embedding_load_retrieve_ops_op_lib", + ":tpu_functional_ops_op_lib", + ":tpu_heartbeat_ops_op_lib", + ":tpu_host_compute_ops_op_lib", + ":tpu_infeed_ops_op_lib", + ":tpu_outfeed_ops_op_lib", + ":tpu_ordinal_selector_ops_op_lib", + ":tpu_replication_ops_op_lib", + ], + ) + if_mkl([ ":mkl_array_ops_op_lib", ":mkl_nn_ops_op_lib", ]) + if_tensorrt([ @@ -1006,6 +1010,7 @@ cc_library( "//tensorflow/core/kernels:collective_ops", "//tensorflow/core/kernels:constant_op", "//tensorflow/core/kernels:control_flow_ops", + "//tensorflow/core/kernels:count_ops", "//tensorflow/core/kernels:ctc_ops", "//tensorflow/core/kernels:data_flow", "//tensorflow/core/kernels:decode_proto_op", @@ -1140,6 +1145,15 @@ cc_library( ], ) +cc_library( + name = "distributed_tensorflow_dependencies", + visibility = ["//visibility:public"], + deps = [ + "//tensorflow/core/distributed_runtime/rpc:grpc_session", + "//tensorflow/core/kernels:data_service_ops", + ], +) + cc_library( name = "testlib_kernels_impl", deps = [ @@ -1256,7 +1270,7 @@ filegroup( "//tensorflow/core/platform:mobile_srcs_no_runtime", "//tensorflow/core/public:mobile_srcs_no_runtime", "//tensorflow/core/util:mobile_srcs_no_runtime", - "//tensorflow/core/util/ctc:android_srcs", + "//tensorflow/core/util/ctc:mobile_srcs", ] + glob( [ "client/**/*.cc", @@ -1280,17 +1294,18 @@ filegroup( srcs = [ # Sources for which we do not yet have granular targets. 
"//tensorflow/c/eager:srcs", + "//tensorflow/c/experimental/saved_model/core:mobile_srcs_only_runtime", "//tensorflow/c:srcs", "//tensorflow/core/common_runtime:mobile_srcs_only_runtime", "//tensorflow/core/common_runtime/eager:srcs", "//tensorflow/core/framework:mobile_srcs_only_runtime", "//tensorflow/core/graph:mobile_srcs_only_runtime", - "//tensorflow/core/kernels:android_srcs", + "//tensorflow/core/kernels:mobile_srcs", "//tensorflow/core/lib/io:mobile_srcs_only_runtime", "//tensorflow/core/profiler:mobile_srcs", "//tensorflow/core/public:mobile_srcs_only_runtime", "//tensorflow/core/util/sparse:mobile_srcs_only_runtime", - "//tensorflow/core/util/tensor_bundle:android_srcs", + "//tensorflow/core/util/tensor_bundle:mobile_srcs", "//tensorflow/core/util:mobile_srcs_only_runtime", # Sources for which we already have granular targets. @@ -1355,10 +1370,7 @@ cc_library( name = "portable_tensorflow_lib_lite", srcs = if_mobile([":mobile_srcs"]), copts = tf_copts(android_optimization_level_override = None) + tf_opts_nortti_if_lite_protos() + if_ios(["-Os"]), - defines = ["SUPPORT_SELECTIVE_REGISTRATION"] + tf_portable_full_lite_protos( - full = [], - lite = ["TENSORFLOW_LITE_PROTOS"], - ) + if_chromiumos(["IS_MOBILE_PLATFORM"]) + tf_defines_nortti_if_lite_protos(), + defines = ["SUPPORT_SELECTIVE_REGISTRATION"] + if_chromiumos(["IS_MOBILE_PLATFORM"]) + tf_defines_nortti_if_lite_protos(), linkopts = if_android(["-lz"]) + if_ios(["-lz"]), tags = [ "manual", @@ -1366,10 +1378,9 @@ cc_library( ], visibility = ["//visibility:public"], deps = [ - ":protos_all_cc_impl", "//tensorflow/core/util:stats_calculator_portable", "//tensorflow/core:mobile_additional_lib_deps", - ] + tf_portable_deps_no_runtime(), + ] + tf_portable_proto_lib() + tf_portable_deps_no_runtime(), alwayslink = 1, ) @@ -1401,55 +1412,12 @@ cc_library( ], ) -# Native library support for iOS applications. -# -# bazel build --config=ios_x86_64 \ -# :ios_tensorflow_lib -cc_library( - name = "ios_tensorflow_lib", - srcs = if_ios([ - ":portable_op_registrations_and_gradients", - "//tensorflow/core/kernels:android_core_ops", - "//tensorflow/core/kernels:android_extended_ops", - ]), - copts = tf_copts() + tf_opts_nortti_if_lite_protos() + ["-Os"], - visibility = ["//visibility:public"], - deps = [ - ":portable_tensorflow_lib_lite", - ":protos_all_cc_impl", - "//third_party/eigen3", - "//third_party/fft2d:fft2d_headers", - "@com_google_protobuf//:protobuf", - "@fft2d", - "@gemmlowp", - ], - alwayslink = 1, -) - alias( name = "ios_tensorflow_lib_lite", actual = ":portable_tensorflow_lib_lite", visibility = ["//visibility:public"], ) -cc_library( - name = "ios_tensorflow_test_lib", - testonly = 1, - srcs = if_ios([":android_test_srcs"]), - copts = tf_copts() + ["-Os"], - tags = [ - "manual", - "notap", - ], - visibility = ["//visibility:public"], - deps = [ - ":ios_tensorflow_lib", - ":portable_test_proto_lib", - "//tensorflow/core/platform/default/build_config:gtest", - "//third_party/eigen3", - ], -) - # Full TensorFlow library with operator support. Use this unless reducing # binary size (by packaging a reduced operator set) is a concern. 
alias( @@ -1458,10 +1426,16 @@ alias( visibility = ["//visibility:public"], ) +alias( + name = "ios_tensorflow_lib", + actual = ":portable_tensorflow_lib", + visibility = ["//visibility:public"], +) + cc_library( name = "portable_tensorflow_lib", srcs = if_mobile([":portable_op_registrations_and_gradients"]), - copts = tf_copts() + tf_opts_nortti_if_lite_protos(), + copts = tf_copts() + tf_opts_nortti_if_lite_protos() + if_ios(["-Os"]), features = tf_features_nomodules_if_mobile(), tags = [ "manual", @@ -1544,6 +1518,12 @@ alias( visibility = ["//visibility:public"], ) +alias( + name = "ios_tensorflow_test_lib", + actual = ":portable_tensorflow_test_lib", + visibility = ["//visibility:public"], +) + cc_library( name = "portable_tensorflow_test_lib", testonly = 1, @@ -1554,7 +1534,7 @@ cc_library( "//tensorflow/core/framework:android_test_hdrs", "//tensorflow/core/util:android_test_hdrs", ], - copts = tf_copts(android_optimization_level_override = None), + copts = tf_copts(android_optimization_level_override = None) + if_ios(["-Os"]), features = tf_features_nomodules_if_mobile() + tf_opts_nortti_if_lite_protos(), tags = [ "manual", @@ -1622,20 +1602,13 @@ alias( [ alias( name = "protobuf_%s_pyclif%s" % (proto_name, target_suffix), - actual = ":protobuf/%s_pyclif%s" % (proto_name, target_suffix), + actual = "//tensorflow/core/protobuf:%s_pyclif%s" % (proto_name, target_suffix), visibility = ["//visibility:public"], ) for target_suffix in [ "", "_pb2", ] - ] + [ - tf_pyclif_proto_library( - name = "protobuf/%s_pyclif" % proto_name, - proto_lib = ":protos_all", - proto_srcfile = "protobuf/%s.proto" % proto_name, - visibility = ["//visibility:public"], - ), ] for proto_name in [ "config", @@ -1649,77 +1622,74 @@ alias( # ----------------------------------------------------------------------------- # Internal targets -tf_proto_library( +alias( name = "autotuning_proto", - srcs = ["protobuf/autotuning.proto"], - cc_api_version = 2, - make_default_target_header_only = True, + actual = "//tensorflow/core/protobuf:autotuning_proto", visibility = [ "//tensorflow:internal", ], ) -tf_proto_library( +alias( + name = "autotuning_proto_cc", + actual = "//tensorflow/core/protobuf:autotuning_proto_cc", + visibility = [ + "//tensorflow:internal", + ], +) + +alias( name = "conv_autotuning_proto", - srcs = ["protobuf/conv_autotuning.proto"], - cc_api_version = 2, - make_default_target_header_only = True, - protodeps = [ - "//tensorflow/stream_executor:dnn_proto", - ], + actual = "//tensorflow/core/protobuf:conv_autotuning_proto", visibility = [ "//tensorflow:internal", ], ) -tf_proto_library_cc( - name = "worker_proto", - srcs = ["protobuf/worker.proto"], - cc_api_version = 2, - protodeps = tf_additional_all_protos(), - visibility = ["//visibility:public"], -) - -tf_proto_library_cc( - name = "worker_service_proto", - srcs = ["protobuf/worker_service.proto"], - has_services = 1, - cc_api_version = 2, - cc_stubby_versions = ["2"], - protodeps = [":worker_proto"], +alias( + name = "conv_autotuning_proto_cc", + actual = "//tensorflow/core/protobuf:conv_autotuning_proto_cc", visibility = [ "//tensorflow:internal", ], ) -tf_proto_library_cc( - name = "master_proto", - srcs = ["protobuf/master.proto"], - cc_api_version = 2, - protodeps = tf_additional_all_protos(), - visibility = ["//tensorflow:internal"], -) - -tf_proto_library_cc( - name = "master_service_proto", - srcs = ["protobuf/master_service.proto"], - has_services = 1, - cc_api_version = 2, - cc_stubby_versions = ["2"], - protodeps = [":master_proto"], 
+alias( + name = "worker_proto_cc", + actual = "//tensorflow/core/protobuf:worker_proto_cc", visibility = [ "//tensorflow:internal", ], ) -tf_proto_library_cc( - name = "eager_service_proto", - srcs = ["protobuf/eager_service.proto"], - has_services = 1, - cc_api_version = 2, - cc_grpc_version = 1, - cc_stubby_versions = ["2"], - protodeps = tf_additional_all_protos(), +alias( + name = "worker_service_proto_cc", + actual = "//tensorflow/core/protobuf:worker_service_proto_cc", + visibility = [ + "//tensorflow:internal", + ], +) + +alias( + name = "master_proto_cc", + actual = "//tensorflow/core/protobuf:master_proto_cc", + visibility = [ + "//learning/brain/frameworks/uptc:__subpackages__", + "//tensorflow:internal", + ], +) + +alias( + name = "master_service_proto_cc", + actual = "//tensorflow/core/protobuf:master_service_proto_cc", + visibility = [ + "//tensorflow:internal", + ], +) + +alias( + name = "eager_service_proto_cc", + actual = "//tensorflow/core/protobuf:eager_service_proto_cc", visibility = [ "//tensorflow:internal", ], @@ -2057,7 +2027,13 @@ cc_library( "//tensorflow/core/platform/default:logging.h", ], copts = tf_copts(), - linkopts = ["-ldl"], + linkopts = select({ + "//tensorflow:freebsd": [], + "//tensorflow:windows": [], + "//conditions:default": [ + "-ldl", + ], + }), visibility = ["//visibility:public"], deps = [ ":platform_base", @@ -2125,49 +2101,14 @@ cc_library( ], ) -tf_proto_library( +alias( name = "error_codes_proto_impl", - srcs = ["protobuf/error_codes.proto"], - cc_api_version = 2, - make_default_target_header_only = True, + actual = "//tensorflow/core/protobuf:error_codes_proto_impl", ) -tf_proto_library( - name = "core_protos", - srcs = COMMON_PROTO_SRCS + [ - # Protos which are not needed on mobile builds, but should be included - # in protos_all. - # - # Note that some protos are in neither core_proto_srcs nor this - # filegroup; e.g. ones with individual proto_library targets. - "protobuf/control_flow.proto", - # TODO(ebrevdo): Re-enable once CriticalSection is in core. - # "protobuf/critical_section.proto", - "protobuf/data/experimental/snapshot.proto", - "protobuf/debug_event.proto", - "protobuf/meta_graph.proto", - "protobuf/named_tensor.proto", - "protobuf/remote_tensor_handle.proto", - "protobuf/saved_model.proto", - "protobuf/saved_object_graph.proto", - "protobuf/struct.proto", - "protobuf/tensorflow_server.proto", - "protobuf/trackable_object_graph.proto", - "protobuf/transport_options.proto", - ], - cc_api_version = 2, - make_default_target_header_only = True, - protodeps = [ - ":error_codes_proto_impl", - "//tensorflow/core/example:protos_all", - "//tensorflow/core/framework:protos_all", - "//tensorflow/core/lib/core:error_codes_proto", - "//tensorflow/core/profiler/protobuf:xplane_proto", - "//tensorflow/core/profiler:profiler_options_proto", - "//tensorflow/core/util:protos_all", - "//tensorflow/core/util:test_log_proto_impl", - ], - visibility = ["//visibility:private"], +alias( + name = "error_codes_proto_impl_cc", + actual = "//tensorflow/core/protobuf:error_codes_proto_impl_cc", ) alias( @@ -2287,6 +2228,7 @@ tf_cuda_library( "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", + "@com_google_absl//absl/synchronization", "@com_google_absl//absl/time", "//third_party/eigen3", "//tensorflow/core/example:feature_util", @@ -2375,10 +2317,6 @@ alias( # Library containing all of the graph construction code that is # independent of the runtime. 
-# -# TODO(mrry): Refactor graph_constructor.cc so that it does not depend on code -# in "common_runtime/", and then the entire "graph/" directory can be included -# in this library. tf_cuda_library( name = "graph", srcs = ["//tensorflow/core/graph:graph_srcs"], @@ -2462,13 +2400,9 @@ alias( visibility = ["//visibility:public"], ) -tf_proto_library_cc( - name = "replay_log_proto", - srcs = ["protobuf/replay_log.proto"], - cc_api_version = 2, - protodeps = [ - ":master_proto", - ] + tf_additional_all_protos(), +alias( + name = "replay_log_proto_cc", + actual = "//tensorflow/core/protobuf:replay_log_proto_cc", visibility = [ "//tensorflow:internal", ], @@ -2546,7 +2480,6 @@ tf_cc_tests( ], create_named_test_suite = True, deps = [ - ":core_cpu_internal", ":lib", ":lib_internal", ":lib_test_internal", @@ -2725,42 +2658,6 @@ tf_cc_tests( ], ) -tf_cc_tests( - name = "higher_level_tests_needing_kernels", - size = "small", - srcs = [ - "//tensorflow/core/graph:higher_level_tests_needing_kernels", - ], - linkopts = select({ - "//tensorflow:macos": ["-headerpad_max_install_names"], - "//conditions:default": [], - }), - linkstatic = tf_kernel_tests_linkstatic(), - deps = [ - ":all_kernels", - ":core", - ":core_cpu", - ":core_cpu_internal", - ":direct_session_internal", - ":framework", - ":framework_internal", - ":lib", - ":lib_internal", - ":ops", - ":protos_all_cc", - ":test", - ":test_main", - ":testlib", - "//tensorflow/cc:cc_ops", - "//tensorflow/cc:cc_ops_internal", - "//tensorflow/cc:scope", - "//tensorflow/cc:sendrecv_ops", - "//tensorflow/core/kernels:ops_util", - "//tensorflow/core/util:protos_test_cc", - "//third_party/eigen3", - ], -) - tf_cc_test( name = "cudnn_rnn_ops_test_cc", size = "small", @@ -2781,7 +2678,6 @@ tf_cc_test_mkl( name = "mkl_related_tests", size = "small", srcs = [ - "//tensorflow/core/graph:mkl_related_tests", "//tensorflow/core/util:mkl_util_test_srcs", ], linkstatic = 1, @@ -3137,6 +3033,11 @@ alias( actual = "//tensorflow/core/platform:cuda_libdevice_path", ) +# Normalize CORE_PROTO_SRCS to generate valid output file names. +PORTABLE_PROTO_HEADERS_OUT = tf_android_core_proto_headers(CORE_PROTO_SRCS) + [ + "//google/protobuf/any.proto.h", +] + transitive_hdrs( name = "headers", visibility = ["//tensorflow:__subpackages__"], @@ -3149,8 +3050,3 @@ transitive_hdrs( "//tensorflow/core/platform:platform_strings", ], ) - -# Normalize CORE_PROTO_SRCS to generate valid output file names. -PORTABLE_PROTO_HEADERS_OUT = tf_android_core_proto_headers(CORE_PROTO_SRCS) + [ - "//google/protobuf/any.proto.h", -] diff --git a/tensorflow/core/api_def/base_api/api_def_AdjustHue.pbtxt b/tensorflow/core/api_def/base_api/api_def_AdjustHue.pbtxt index bfaf6768601..c34b5c6fbcb 100644 --- a/tensorflow/core/api_def/base_api/api_def_AdjustHue.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_AdjustHue.pbtxt @@ -21,7 +21,7 @@ END summary: "Adjust the hue of one or more images." description: < l1 else 0.0 accum = accum_new diff --git a/tensorflow/core/api_def/base_api/api_def_ApplyFtrlV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_ApplyFtrlV2.pbtxt index 3218ab7776c..1eb33005e91 100644 --- a/tensorflow/core/api_def/base_api/api_def_ApplyFtrlV2.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_ApplyFtrlV2.pbtxt @@ -65,8 +65,8 @@ END summary: "Update \'*var\' according to the Ftrl-proximal scheme." 
description: < l1 else 0.0 diff --git a/tensorflow/core/api_def/base_api/api_def_BeginEpoch.pbtxt b/tensorflow/core/api_def/base_api/api_def_BeginEpoch.pbtxt deleted file mode 100644 index d5fd0d609c8..00000000000 --- a/tensorflow/core/api_def/base_api/api_def_BeginEpoch.pbtxt +++ /dev/null @@ -1,5 +0,0 @@ -op { - graph_op_name: "BeginEpoch" - visibility: HIDDEN - summary: "Begins a tf.data service dataset epoch." -} diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesCalculateBestFeatureSplitV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesCalculateBestFeatureSplitV2.pbtxt index 2bbaba26257..84382d8a99c 100644 --- a/tensorflow/core/api_def/base_api/api_def_BoostedTreesCalculateBestFeatureSplitV2.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesCalculateBestFeatureSplitV2.pbtxt @@ -47,7 +47,7 @@ END in_arg { name: "min_node_weight" description: <
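
The ApplyFtrl/ApplyFtrlV2 descriptions being edited above document the Ftrl-proximal update. As a reference, here is a rough scalar C++ sketch of that update with L2 shrinkage, written from the commonly published formulation rather than copied from the kernels; the function and variable names are illustrative, and the real kernels apply this element-wise to tensors in tensorflow/core/kernels/training_ops.cc.

#include <cmath>

// Illustrative scalar form of the Ftrl-proximal update with L2 shrinkage
// (assumed standard formulation).
struct FtrlState {
  float var;
  float accum;
  float linear;
};

void FtrlV2Step(FtrlState& s, float grad, float lr, float l1, float l2,
                float l2_shrinkage, float lr_power) {
  // The shrinkage term only feeds the linear accumulator; the gradient
  // accumulator uses the raw gradient.
  const float grad_with_shrinkage = grad + 2.0f * l2_shrinkage * s.var;
  const float accum_new = s.accum + grad * grad;
  s.linear += grad_with_shrinkage - (std::pow(accum_new, -lr_power) -
                                     std::pow(s.accum, -lr_power)) /
                                        lr * s.var;
  const float quadratic =
      1.0f / (std::pow(accum_new, lr_power) * lr) + 2.0f * l2;
  // var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.
  s.var = std::abs(s.linear) > l1
              ? (std::copysign(l1, s.linear) - s.linear) / quadratic
              : 0.0f;
  s.accum = accum_new;
}
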